diff --git a/packs/nvidia-vss-vllm-1.0.0/README.md b/packs/nvidia-vss-vllm-1.0.0/README.md
new file mode 100644
index 00000000..0f1865ea
--- /dev/null
+++ b/packs/nvidia-vss-vllm-1.0.0/README.md
@@ -0,0 +1,23 @@
+# NVIDIA VSS LLM (vLLM)
+
+The bounded raw vLLM LLM backend for NVIDIA VSS on platforms where the TensorRT-LLM NIM is unsupported (DGX Spark / GB10 sm_121). It serves the OpenAI-compatible API as `llm-nim-svc:8000` for the CA-RAG pipeline. Part of the upstream **VSS 2.4.1** blueprint.
+
+## Prerequisites
+
+- An NGC API key (the vLLM image is `nvcr.io/nvidia/vllm`).
+- A GPU node (validated on NVIDIA GB10 / DGX Spark).
+
+## Parameters
+
+| **Parameter** | **Description** | **Type** | **Default** | **Required** |
+|---|---|---|---|---|
+| `spectro.var.VSS_PLATFORM` | Hardware platform preset | String | `DGX-SPARK` | No |
+
+## Usage
+
+Add this pack to vLLM-backed VSS profiles at install-priority 12 (after `nvidia-vss-core-nims`). Omit it from trtllm profiles (H100/L40S), which use the NIM-LLM subchart instead.
+
+---
+**Upstream:** NVIDIA VSS Blueprint 2.4.1 (`vllm:25.12.post1-py3`). **Pack version:** 1.0.x.
+
+> **CVE bump:** nginx health-proxy sidecar `1.27-alpine` → `1.30.2-alpine` (verified on GB10: `/v1/health/live` returns 200; the pack-central pax-cve scan reports **0 Critical**, down from 3 Critical / 137 total findings on `1.27`). The vLLM image stays at `nvcr.io/nvidia/vllm:25.12.post1-py3`: a bump to `26.05.post1-py3` was tested on the GB10 (it serves fine) but **reverted**, because that image ships a JWT in its pip HTTP cache (`/root/.cache/pip/...`, an upstream NVIDIA build-hygiene leak flagged by the secret scan) and gives **no CVE benefit** (the runtime image scans clean at both versions). The Jetson-Thor `ghcr.io/nvidia-ai-iot/vllm` tag is left floating (Tegra-only; verify before use).
diff --git a/packs/nvidia-vss-vllm-1.0.0/logo.png b/packs/nvidia-vss-vllm-1.0.0/logo.png
new file mode 100644
index 00000000..a98c7869
Binary files /dev/null and b/packs/nvidia-vss-vllm-1.0.0/logo.png differ
diff --git a/packs/nvidia-vss-vllm-1.0.0/manifests/llm-vllm.yaml b/packs/nvidia-vss-vllm-1.0.0/manifests/llm-vllm.yaml
new file mode 100644
index 00000000..84b670a4
--- /dev/null
+++ b/packs/nvidia-vss-vllm-1.0.0/manifests/llm-vllm.yaml
@@ -0,0 +1,149 @@
+# Raw bounded vLLM LLM backend, Go-templated against the manifests.llm-vllm.*
+# values block (Palette renders kubeManifests as Helm templates + resolves
+# spectro.var / spectro.macro macros at deploy time). vLLM serves the OpenAI API on :8001; the
+# nginx sidecar on :8000 answers /v1/health/live (the path the vss-engine
+# check-llm-up init container probes, which raw vLLM lacks) and proxies the rest.
+# Service llm-nim-svc:8000 is what the application pack's egress (llm-openai-api)
+# targets. hf-token-secret is created by the data-infrastructure pack (priority 5).
+# Mirrors the validated ansible harness (roles/vss_deploy/templates/llm-vllm.yaml.j2).
+# String-typed values are piped through `quote` so numeric- or boolean-looking
+# overrides (model ids, image tags, sizes) always render as YAML strings.
+---
+# Model cache volume; the vllm container mounts it at /cache, where HF_HOME points.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llm-vllm-cache
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests:
+      storage: {{ .Values.cacheSize | quote }}
+---
+# nginx sidecar config: /v1/health/live and /v1/health/ready are answered by
+# vLLM's /health; every other path is proxied unchanged to vLLM on 127.0.0.1:8001.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: llm-vllm-nginx
+  namespace: {{ .Values.namespace | quote }}
+data:
+  default.conf: |
+    server {
+      listen 8000;
+      location = /v1/health/live { proxy_pass http://127.0.0.1:8001/health; }
+      location = /v1/health/ready { proxy_pass http://127.0.0.1:8001/health; }
+      location / {
+        proxy_pass http://127.0.0.1:8001;
+        proxy_read_timeout 600s;
+        proxy_buffering off;
+      }
+    }
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llm-vllm
+  namespace: {{ .Values.namespace | quote }}
+  labels: { app: llm-vllm }
+spec:
+  replicas: {{ .Values.replicas }}  # schema hint says 1; the cache PVC is ReadWriteOnce
+  selector: { matchLabels: { app: llm-vllm } }
+  strategy: { type: Recreate }  # the old pod is removed before its replacement starts
+  template:
+    metadata:
+      labels: { app: llm-vllm }
+    spec:
+      nodeSelector:
+        nvidia.com/gpu.present: "true"
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      imagePullSecrets:
+        - name: ngc-pull-secret
+      containers:
+        - name: vllm
+          image: {{ .Values.image | quote }}
+          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+          args:
+            - --model
+            - {{ .Values.hfModel | quote }}
+            - --served-model-name
+            - {{ .Values.servedModelName | quote }}
+            - --tensor-parallel-size
+            - "{{ .Values.tensorParallel }}"
+            - --gpu-memory-utilization
+            - "{{ .Values.gpuMemUtil }}"
+            - --max-model-len
+            - "{{ .Values.maxModelLen }}"
+            - --max-num-seqs
+            - "{{ .Values.maxNumSeqs }}"
+            {{- range .Values.extraArgs }}
+            - {{ . | quote }}
+            {{- end }}
+            - --host
+            - 0.0.0.0
+            - --port
+            - "8001"
+          env:
+            - name: HF_HOME
+              value: /cache
+            # Both token variables read the same optional secret key; the pod still starts without it.
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef: { name: hf-token-secret, key: HF_TOKEN, optional: true }
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef: { name: hf-token-secret, key: HF_TOKEN, optional: true }
+          ports:
+            - { containerPort: 8001, name: openai }
+          # Gate readiness on vLLM itself. Without a probe the pod is Ready as soon
+          # as both containers run, so llm-nim-svc would route requests to the nginx
+          # sidecar while vLLM is still loading the model (nginx answers 502 then).
+          # Deliberately no livenessProbe: the first start may presumably spend a
+          # long time fetching the model into the cache PVC — confirm before adding.
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: openai
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+          resources:
+            limits:
+              nvidia.com/gpu: {{ .Values.gpuCount }}
+            requests:
+              memory: {{ .Values.memRequest | quote }}
+              cpu: {{ .Values.cpuRequest | quote }}
+          volumeMounts:
+            - { name: cache, mountPath: /cache }
+            - { name: dshm, mountPath: /dev/shm }
+        # Health-path sidecar: listens on :8000 using the llm-vllm-nginx ConfigMap.
+        - name: nginx
+          image: {{ .Values.nginxImage | quote }}
+          ports:
+            - { containerPort: 8000, name: http }
+          volumeMounts:
+            - { name: nginx-conf, mountPath: /etc/nginx/conf.d }
+      volumes:
+        - name: cache
+          persistentVolumeClaim: { claimName: llm-vllm-cache }
+        - name: dshm
+          emptyDir: { medium: Memory, sizeLimit: {{ .Values.dshmSize | quote }} }
+        - name: nginx-conf
+          configMap: { name: llm-vllm-nginx }
+---
+# Fronts the nginx sidecar (:8000), not vLLM's own port (:8001).
+apiVersion: v1
+kind: Service
+metadata:
+  name: llm-nim-svc
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  selector: { app: llm-vllm }
+  ports:
+    - name: openai
+      port: 8000
+      targetPort: 8000
diff --git a/packs/nvidia-vss-vllm-1.0.0/pack.json b/packs/nvidia-vss-vllm-1.0.0/pack.json
new file mode 100644
index 00000000..c4a7b2cc
--- /dev/null
+++ b/packs/nvidia-vss-vllm-1.0.0/pack.json
@@ -0,0 +1,31 @@
+{
+  "addonType": "system app",
+  "annotations": {
+    "source": "community",
+    "contributor": "spectrocloud",
+    "docsURL": "https://docs.nvidia.com/vss/latest/index.html",
+    "description": "NVIDIA VSS LLM backend (bounded vLLM) serving the OpenAI API as llm-nim-svc.",
+    "upstreamVersion": "2.4.1",
+    "upstreamRef": "NVIDIA VSS Blueprint 2.4.1"
+  },
+  "cloudTypes": [
+    "all"
+  ],
+  "displayName": "NVIDIA VSS LLM (vLLM) (VSS 2.4.1)",
+  "kubeManifests": [
+    "manifests/llm-vllm.yaml"
+  ],
+  "layer": "addon",
+  "name": "nvidia-vss-vllm",
+  "version": "1.0.0",
+  "constraints": {
+    "dependencies": [
+      {
+        "packName": "nvidia-vss-core-nims",
+        "layer": "addon",
+        "minVersion": "1.0.0",
+        "type": "optional"
+      }
+    ]
+  }
+}
diff --git a/packs/nvidia-vss-vllm-1.0.0/presets.yaml b/packs/nvidia-vss-vllm-1.0.0/presets.yaml
new file mode 100644
index 00000000..bdebe72e
--- /dev/null
+++ b/packs/nvidia-vss-vllm-1.0.0/presets.yaml
@@ -0,0 +1,124 @@
+# VSS Platform profiles for the vLLM LLM backend (the vLLM-backed hardware only).
+#
+# ⚠️ OPERATOR CONTRACT: add this pack ONLY to vLLM profiles (DGX-SPARK / OTHER /
+# RTXPRO6000BW / AGX-THOR / IGX-THOR), and select the SAME "VSS Platform" preset
+# here as in the other VSS packs. NEVER add this pack to an H100/L40S (trtllm)
+# profile — the core-nims nim-llm subchart already creates llm-nim-svc there, and
+# two Services with the same name collide. See P1-P5-IMPLEMENTATION.md.
+#
+# Mutually-exclusive presets (group "VSS Platform"); DGX-SPARK is the default and
+# is the empirically-validated GB10 config (P1). The others (P3/P4) are authored
+# from the NVIDIA blueprint but NOT hardware-validated here — verify NGC/GHCR image
+# tags and tool-call-parser flags against an authenticated registry before use.
+#
+# NOTE: H100 / L40S are NOT in this group — they use the TensorRT-LLM nim-llm
+# subchart in nvidia-vss-core-nims and must NOT include this pack in their profile.
+presets:
+  - name: "DGX-SPARK"
+    displayName: "DGX Spark (GB10, arm64, sm_121) — llama-3.1-8b"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+          hfModel: "NousResearch/Meta-Llama-3.1-8B-Instruct"
+          servedModelName: "meta/llama-3.1-8b-instruct"
+          tensorParallel: "1"
+          gpuMemUtil: "0.35"
+          maxModelLen: "8192"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--enforce-eager"
+
+  - name: "OTHER"
+    displayName: "Other / generic single GPU — llama-3.1-8b"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+          hfModel: "NousResearch/Meta-Llama-3.1-8B-Instruct"
+          servedModelName: "meta/llama-3.1-8b-instruct"
+          tensorParallel: "1"
+          gpuMemUtil: "0.5"
+          maxModelLen: "8192"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--enforce-eager"
+
+  # ── P3 — RTX PRO 6000 Blackwell (sm_120) workstation. UNVALIDATED. ───────────
+  - name: "RTXPRO6000BW"
+    displayName: "RTX PRO 6000 Blackwell — nemotron-nano-9b-fp8 (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+          hfModel: "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
+          servedModelName: "nvidia/nemotron-nano-9b-v2-fp8"
+          tensorParallel: "1"
+          gpuMemUtil: "0.5"
+          maxModelLen: "16384"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--trust-remote-code"
+            - "--enable-auto-tool-choice"
+            - "--tool-call-parser"
+            - "nemotron_json"
+
+  # ── P4 — Jetson AGX/IGX Thor (arm64 Tegra/iGPU). UNVALIDATED. ────────────────
+  # Uses the Jetson-specific vLLM image; the host's NVIDIA container runtime injects the L4T/Tegra
+  # libs+devices (no manual hostPath mounts). The application pack's Thor preset also sets runtimeClassName: nvidia on the engine pod.
+  # NOTE(review): P3 and P4 pass --tool-call-parser nemotron_json; confirm each image actually registers that parser — TODO verify.
+  - name: "AGX-THOR"
+    displayName: "Jetson AGX Thor (arm64, iGPU) — nemotron-nano-9b-v2-fp8 (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor"
+          hfModel: "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
+          servedModelName: "nvidia/nemotron-nano-9b-v2-fp8"
+          tensorParallel: "1"
+          gpuMemUtil: "0.25"
+          maxModelLen: "16384"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--trust-remote-code"
+            - "--enable-auto-tool-choice"
+            - "--tool-call-parser"
+            - "nemotron_json"
+
+  - name: "IGX-THOR"
+    displayName: "IGX + Thor iGPU (arm64) — nemotron-nano-9b-v2-fp8 (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor"
+          hfModel: "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
+          servedModelName: "nvidia/nemotron-nano-9b-v2-fp8"
+          tensorParallel: "1"
+          gpuMemUtil: "0.25"
+          maxModelLen: "16384"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--trust-remote-code"
+            - "--enable-auto-tool-choice"
+            - "--tool-call-parser"
+            - "nemotron_json"
diff --git a/packs/nvidia-vss-vllm-1.0.0/schema.yaml b/packs/nvidia-vss-vllm-1.0.0/schema.yaml
new file mode 100644
index 00000000..e520081c
--- /dev/null
+++ b/packs/nvidia-vss-vllm-1.0.0/schema.yaml
@@ -0,0 +1,16 @@
+# Pack value constraints for the vLLM LLM backend.
+# Keys are full dotted paths into values.yaml (the manifests.llm-vllm.* sub-tree).
+manifests.llm-vllm.image:
+  schema: '{{ required | format "${string}" | hints "vLLM container image (nvcr.io/nvidia/vllm:* on x86/SBSA; ghcr.io/nvidia-ai-iot/vllm:*-jetson-thor on Tegra)." }}'
+manifests.llm-vllm.hfModel:
+  schema: '{{ required | format "${string}" | hints "HF model id vLLM loads (e.g. NousResearch/Meta-Llama-3.1-8B-Instruct)." }}'
+manifests.llm-vllm.servedModelName:
+  schema: '{{ required | format "${string}" | hints "OpenAI served-model name the application pack references via llmModel." }}'
+manifests.llm-vllm.gpuMemUtil:
+  schema: '{{ format "${number}" | hints "vLLM --gpu-memory-utilization (LLM share of the GPU pool, e.g. 0.35 on GB10)." }}'
+manifests.llm-vllm.maxModelLen:
+  schema: '{{ format "${number}" | hints "vLLM --max-model-len." }}'
+manifests.llm-vllm.maxNumSeqs:
+  schema: '{{ format "${number}" | hints "vLLM --max-num-seqs." }}'
+manifests.llm-vllm.replicas:
+  schema: '{{ format "${number}" | hints "vLLM Deployment replicas (1)." }}'
diff --git a/packs/nvidia-vss-vllm-1.0.0/values.yaml b/packs/nvidia-vss-vllm-1.0.0/values.yaml
new file mode 100644
index 00000000..a762f704
--- /dev/null
+++ b/packs/nvidia-vss-vllm-1.0.0/values.yaml
@@ -0,0 +1,68 @@
+# spectrocloud.com/enabled-presets: VSS Platform:DGX-SPARK
+#
+# ┌─ VSS PLATFORM MATRIX (canonical — keep identical across all VSS packs) ───────
+# │ Set ONE profile variable VSS_PLATFORM and select the matching "VSS Platform"
+# │ preset in EVERY VSS pack. Packs per profile: data-infra + core-nims +
+# │ application (always) + nvidia-vss-vllm (vLLM profiles only; H100/L40S omit it).
+# │
+# │ Platform           LLM backend    LLM model              VLM mem/len  decode    validated
+# │ DGX-SPARK          vLLM           llama-3.1-8b           0.28/10240   disabled  YES (GB10)
+# │ OTHER              vLLM           llama-3.1-8b           0.4 /16384   disabled  no
+# │ H100               nim (trtllm)   llama-3.1-70b (gpu 4)  0.8 /32768   dGPU      no
+# │ L40S               nim (trtllm)   llama-3.1-8b (gpu 2)   0.8 /32768   dGPU      no
+# │ RTXPRO6000BW       vLLM           nemotron-nano-9b-fp8   0.4 /32768   dGPU      no
+# │ AGX-THOR/IGX-THOR  vLLM (jetson)  nemotron-nano-9b-fp8   0.4 /16384   Tegra*    no
+# │ "VLM mem/len" = the application vss-engine VLM; the vLLM LLM engine has its own
+# │ gpu-mem/max-len in the nvidia-vss-vllm presets (do not conflate the two).
+# │ * Tegra: runtimeClassName nvidia; the NVIDIA container runtime injects L4T libs.
+# └──────────────────────────────────────────────────────────────────────────────
+# NVIDIA VSS LLM backend — raw bounded vLLM (manifest-only pack).
+#
+# ═══════════════════════════════════════════════════════════════════════════════
+# WHY A SEPARATE PACK (P2-P4, see PLATFORM-SUPPORT-PLAN.md §6.4 / §8)
+# ═══════════════════════════════════════════════════════════════════════════════
+# VSS needs one of two mutually-exclusive LLM backends, selected by hardware:
+#   - raw vLLM (THIS pack) → DGX-SPARK / RTXPRO6000BW / AGX-THOR / IGX-THOR / OTHER
+#   - nim-llm TensorRT-LLM subchart → H100 / L40S (in nvidia-vss-core-nims)
+# Palette presets can parameterize a kubeManifest but cannot EXCLUDE one, and both
+# backends create a Service named llm-nim-svc — so they must never coexist. We make
+# the choice by PROFILE COMPOSITION: add this pack only to vLLM profiles; trtllm
+# profiles simply omit it. (Confirmed against docs.spectrocloud.com.)
+#
+# The manifest (manifests/llm-vllm.yaml) is Go-templated against the
+# manifests.llm-vllm.* block below; the "VSS Platform" presets set per-profile
+# values. install-priority 12 → after core-nims (10, creates ngc-pull-secret) and
+# data-infra (5, creates hf-token-secret), before application (15).
+
+pack:
+  namespace: nvidia-vss
+  spectrocloud.com/install-priority: "12"
+  content:
+    images:
+      - image: nvcr.io/nvidia/vllm:25.12.post1-py3
+      - image: nginx:1.30.2-alpine
+      # Jetson Thor (AGX-THOR/IGX-THOR presets) — floating tag, verify before use
+      - image: ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor
+
+manifests:
+  llm-vllm:
+    namespace: nvidia-vss  # namespace of every object in manifests/llm-vllm.yaml
+    # DGX-SPARK (GB10) default — the empirically-validated config.
+    replicas: 1  # Deployment replicas
+    gpuCount: 1  # nvidia.com/gpu limit of the vllm container
+    image: "nvcr.io/nvidia/vllm:25.12.post1-py3"  # vllm container image
+    nginxImage: "nginx:1.30.2-alpine"  # nginx sidecar image (listens on :8000)
+    hfModel: "NousResearch/Meta-Llama-3.1-8B-Instruct"  # passed to --model
+    servedModelName: "meta/llama-3.1-8b-instruct"  # passed to --served-model-name
+    tensorParallel: "1"  # passed to --tensor-parallel-size
+    gpuMemUtil: "0.35"  # passed to --gpu-memory-utilization
+    maxModelLen: "8192"  # passed to --max-model-len
+    maxNumSeqs: "4"  # passed to --max-num-seqs
+    cacheSize: "80Gi"  # storage request of the llm-vllm-cache PVC (mounted at /cache = HF_HOME)
+    dshmSize: "16Gi"  # sizeLimit of the memory-backed emptyDir mounted at /dev/shm
+    memRequest: "8Gi"  # memory request of the vllm container
+    cpuRequest: "2"  # cpu request of the vllm container
+    # Extra vLLM args; the manifest renders each one (quoted) after --max-num-seqs and before --host/--port.
+    # DGX-SPARK uses --enforce-eager; Nemotron/Edge models also need their tool-call parser flags (see presets).
+    extraArgs:
+      - "--enforce-eager"