spectrocloud · blik616287 · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 15, 2026
diff --git a/packs/nvidia-vss-vllm-1.0.0/README.md b/packs/nvidia-vss-vllm-1.0.0/README.md
@@ -0,0 +1,23 @@
+# NVIDIA VSS LLM (vLLM)
+
+The raw, bounded vLLM LLM backend for NVIDIA VSS on platforms where the TensorRT-LLM NIM is unsupported (DGX Spark / GB10, sm_121). It serves the OpenAI API as `llm-nim-svc:8000` for the CA-RAG pipeline and is part of the upstream **VSS 2.4.1** blueprint.
+
+## Prerequisites
+
+- An NGC API key (the vLLM image is `nvcr.io/nvidia/vllm`).
+- A GPU node (validated on NVIDIA GB10 / DGX Spark).
+
+## Parameters
+
+| **Parameter** | **Description** | **Type** | **Default** | **Required** |
+|---|---|---|---|---|
+| `spectro.var.VSS_PLATFORM` | Hardware platform preset | String | `DGX-SPARK` | No |
+
+## Usage
+
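+Add this pack to vLLM-backed VSS profiles at install-priority 12 (after `nvidia-vss-core-nims`). Omit it from trtllm profiles (H100/L40S), which use the NIM-LLM subchart instead: both backends create a Service named `llm-nim-svc`, so they must never be installed together.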
+
+---
+**Upstream:** NVIDIA VSS Blueprint 2.4.1 (`vllm:25.12.post1-py3`). **Pack version:** 1.0.x.
+
+> **CVE bump:** nginx health-proxy sidecar `1.27-alpine` → `1.30.2-alpine` (verified on GB10, `/v1/health/live` → 200; pack-central pax-cve confirms **0 Critical**, down from `1.27`'s 3 / 137 total). The vLLM image stays at `nvcr.io/nvidia/vllm:25.12.post1-py3` — a bump to `26.05.post1-py3` was tested on the GB10 (serves fine) but **reverted**: that image ships a JWT in its pip HTTP-cache (`/root/.cache/pip/...`, an upstream NVIDIA build-hygiene leak flagged by the secret scan) and gives **no CVE benefit** (the runtime image scans clean at both versions). The Jetson-Thor `ghcr.io/nvidia-ai-iot/vllm` tag is left floating (Tegra-only, verify before use).
diff --git a/packs/nvidia-vss-vllm-1.0.0/logo.png b/packs/nvidia-vss-vllm-1.0.0/logo.png
diff --git a/packs/nvidia-vss-vllm-1.0.0/manifests/llm-vllm.yaml b/packs/nvidia-vss-vllm-1.0.0/manifests/llm-vllm.yaml
@@ -0,0 +1,129 @@
+# Raw bounded vLLM LLM backend, Go-templated against the manifests.llm-vllm.*
+# values block (Palette renders kubeManifests as Helm templates + resolves
+# spectro.var / spectro.macro macros at deploy time). vLLM serves the OpenAI API on :8001; the
+# nginx sidecar on :8000 answers /v1/health/live (the path the vss-engine
+# check-llm-up init container probes, which raw vLLM lacks) and proxies the rest.
+# Service llm-nim-svc:8000 is what the application pack's egress (llm-openai-api)
+# targets. hf-token-secret is created by the data-infrastructure pack (priority 5).
+# Mirrors the validated ansible harness (roles/vss_deploy/templates/llm-vllm.yaml.j2).
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  # Cache claim for the vLLM pod: the Deployment in this file mounts it at /cache,
+  # which is also the container's HF_HOME.
+  name: llm-vllm-cache
+  namespace: {{ .Values.namespace }}
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      # Requested size is taken from the cacheSize value.
+      storage: {{ .Values.cacheSize }}
+---
+# nginx sidecar configuration. The Deployment in this file mounts this ConfigMap
+# at /etc/nginx/conf.d in the nginx container.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: llm-vllm-nginx
+  namespace: {{ .Values.namespace }}
+data:
+  # Listens on :8000. The two exact-match health locations are both forwarded to
+  # /health on 127.0.0.1:8001 (the vLLM port); every other request is proxied to
+  # 127.0.0.1:8001 with a 600s read timeout and response buffering turned off.
+  default.conf: |
+    server {
+      listen 8000;
+      location = /v1/health/live  { proxy_pass http://127.0.0.1:8001/health; }
+      location = /v1/health/ready { proxy_pass http://127.0.0.1:8001/health; }
+      location / {
+        proxy_pass http://127.0.0.1:8001;
+        proxy_read_timeout 600s;
+        proxy_buffering off;
+      }
+    }
+---
+# vLLM server pod with an nginx sidecar. The vllm container serves the OpenAI API
+# on :8001; the nginx container listens on :8000 using the llm-vllm-nginx ConfigMap.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llm-vllm
+  namespace: {{ .Values.namespace }}
+  labels: { app: llm-vllm }
+spec:
+  replicas: {{ .Values.replicas }}
+  selector: { matchLabels: { app: llm-vllm } }
+  # Recreate terminates the old pod before the replacement is created.
+  # NOTE(review): presumably chosen so a rollout never needs a second GPU or a
+  # second attachment of the ReadWriteOnce cache PVC — confirm.
+  strategy: { type: Recreate }
+  template:
+    metadata:
+      labels: { app: llm-vllm }
+    spec:
+      # Schedule only onto nodes labelled as having a GPU, and tolerate the
+      # nvidia.com/gpu NoSchedule taint.
+      nodeSelector:
+        nvidia.com/gpu.present: "true"
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      imagePullSecrets:
+        - name: ngc-pull-secret
+      containers:
+        - name: vllm
+          # String-valued settings (image, model names) are piped through `quote`,
+          # like extraArgs below, so any override always renders as a YAML string.
+          image: {{ .Values.image | quote }}
+          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+          args:
+            - --model
+            - {{ .Values.hfModel | quote }}
+            - --served-model-name
+            - {{ .Values.servedModelName | quote }}
+            - --tensor-parallel-size
+            - "{{ .Values.tensorParallel }}"
+            - --gpu-memory-utilization
+            - "{{ .Values.gpuMemUtil }}"
+            - --max-model-len
+            - "{{ .Values.maxModelLen }}"
+            - --max-num-seqs
+            - "{{ .Values.maxNumSeqs }}"
+            {{- range .Values.extraArgs }}
+            - {{ . | quote }}
+            {{- end }}
+            - --host
+            - 0.0.0.0
+            - --port
+            - "8001"
+          env:
+            # Hugging Face cache lives on the llm-vllm-cache PVC mounted at /cache.
+            - name: HF_HOME
+              value: /cache
+            # Both token variables read the same secret key; optional: true lets
+            # the container start when the secret or key is absent.
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef: { name: hf-token-secret, key: HF_TOKEN, optional: true }
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef: { name: hf-token-secret, key: HF_TOKEN, optional: true }
+          ports:
+            - { containerPort: 8001, name: openai }
+          resources:
+            limits:
+              nvidia.com/gpu: {{ .Values.gpuCount }}
+            requests:
+              memory: {{ .Values.memRequest | quote }}
+              cpu: {{ .Values.cpuRequest | quote }}
+          volumeMounts:
+            - { name: cache, mountPath: /cache }
+            - { name: dshm, mountPath: /dev/shm }
+        # Sidecar on :8000; its server config comes from the nginx-conf volume.
+        - name: nginx
+          image: {{ .Values.nginxImage | quote }}
+          ports:
+            - { containerPort: 8000, name: http }
+          volumeMounts:
+            - { name: nginx-conf, mountPath: /etc/nginx/conf.d }
+      volumes:
+        - name: cache
+          persistentVolumeClaim: { claimName: llm-vllm-cache }
+        # Memory-backed emptyDir mounted at /dev/shm in the vllm container.
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: {{ .Values.dshmSize | quote }}
+        - name: nginx-conf
+          configMap: { name: llm-vllm-nginx }
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llm-nim-svc
+  namespace: {{ .Values.namespace }}
+spec:
+  # Selects the llm-vllm pods and forwards :8000 to the pod's port 8000, which is
+  # the nginx sidecar's listener (vLLM itself listens on 8001).
+  selector:
+    app: llm-vllm
+  ports:
+    - name: openai
+      port: 8000
+      targetPort: 8000
diff --git a/packs/nvidia-vss-vllm-1.0.0/pack.json b/packs/nvidia-vss-vllm-1.0.0/pack.json
@@ -0,0 +1,31 @@
+{
+  "addonType": "system app",
+  "annotations": {
+    "source": "community",
+    "contributor": "spectrocloud",
+    "docsURL": "https://docs.nvidia.com/vss/latest/index.html",
+    "description": "NVIDIA VSS LLM backend (bounded vLLM) serving the OpenAI API as llm-nim-svc.",
+    "upstreamVersion": "2.4.1",
+    "upstreamRef": "NVIDIA VSS Blueprint 2.4.1"
+  },
+  "cloudTypes": [
+    "all"
+  ],
+  "displayName": "NVIDIA VSS LLM (vLLM) (VSS 2.4.1)",
+  "kubeManifests": [
+    "manifests/llm-vllm.yaml"
+  ],
+  "layer": "addon",
+  "name": "nvidia-vss-vllm",
+  "version": "1.0.0",
+  "constraints": {
+    "dependencies": [
+      {
+        "packName": "nvidia-vss-core-nims",
+        "layer": "addon",
+        "minVersion": "1.0.0",
+        "type": "optional"
+      }
+    ]
+  }
+}
diff --git a/packs/nvidia-vss-vllm-1.0.0/presets.yaml b/packs/nvidia-vss-vllm-1.0.0/presets.yaml
@@ -0,0 +1,124 @@
+# VSS Platform profiles for the vLLM LLM backend (the vLLM-backed hardware only).
+#
+# ⚠️ OPERATOR CONTRACT: add this pack ONLY to vLLM profiles (DGX-SPARK / OTHER /
+#    RTXPRO6000BW / AGX-THOR / IGX-THOR), and select the SAME "VSS Platform" preset
+#    here as in the other VSS packs. NEVER add this pack to an H100/L40S (trtllm)
+#    profile — the core-nims nim-llm subchart already creates llm-nim-svc there, and
+#    two Services with the same name collide. See P1-P5-IMPLEMENTATION.md.
+#
+# Mutually-exclusive presets (group "VSS Platform"); DGX-SPARK is the default and
+# is the empirically-validated GB10 config (P1). The others (P3/P4) are authored
+# from the NVIDIA blueprint but NOT hardware-validated here — verify NGC/GHCR image
+# tags and tool-call-parser flags against an authenticated registry before use.
+#
+# NOTE: H100 / L40S are NOT in this group — they use the TensorRT-LLM nim-llm
+# subchart in nvidia-vss-core-nims and must NOT include this pack in their profile.
+presets:
+  # Default preset. Its `add` payload repeats the manifests.llm-vllm defaults from
+  # values.yaml (image, model, gpuMemUtil 0.35, --enforce-eager).
+  - name: "DGX-SPARK"
+    displayName: "DGX Spark (GB10, arm64, sm_121) — llama-3.1-8b"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+          hfModel: "NousResearch/Meta-Llama-3.1-8B-Instruct"
+          servedModelName: "meta/llama-3.1-8b-instruct"
+          tensorParallel: "1"
+          gpuMemUtil: "0.35"
+          maxModelLen: "8192"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--enforce-eager"
+
+  # Generic single-GPU preset: same image, model and flags as DGX-SPARK, with
+  # gpuMemUtil raised to 0.5. Listed as not validated in the values.yaml platform
+  # matrix, so the display name carries the same "(unvalidated)" marker as the
+  # other unvalidated presets.
+  - name: "OTHER"
+    displayName: "Other / generic single GPU — llama-3.1-8b (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+          hfModel: "NousResearch/Meta-Llama-3.1-8B-Instruct"
+          servedModelName: "meta/llama-3.1-8b-instruct"
+          tensorParallel: "1"
+          gpuMemUtil: "0.5"
+          maxModelLen: "8192"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--enforce-eager"
+
+  # ── P3 — RTX PRO 6000 Blackwell (sm_120) workstation. UNVALIDATED. ───────────
+  # Display name uses the full model name (nemotron-nano-9b-v2-fp8) so it matches
+  # servedModelName below and the Thor presets.
+  # NOTE(review): nemotron_json may not be a built-in vLLM tool-call parser; the
+  # upstream model card appears to register it through --tool-parser-plugin.
+  # Confirm the image provides it before relying on this preset.
+  - name: "RTXPRO6000BW"
+    displayName: "RTX PRO 6000 Blackwell — nemotron-nano-9b-v2-fp8 (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+          hfModel: "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
+          servedModelName: "nvidia/nemotron-nano-9b-v2-fp8"
+          tensorParallel: "1"
+          gpuMemUtil: "0.5"
+          maxModelLen: "16384"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--trust-remote-code"
+            - "--enable-auto-tool-choice"
+            - "--tool-call-parser"
+            - "nemotron_json"
+
+  # ── P4 — Jetson AGX/IGX Thor (arm64 Tegra/iGPU). UNVALIDATED. ────────────────
+  # Uses the Jetson-specific vLLM image; the host's NVIDIA container runtime injects
+  # the L4T/Tegra libs+devices (no manual hostPath mounts). The application pack's
+  # Thor preset also sets runtimeClassName: nvidia on the engine pod.
+  # The image tag below (latest-jetson-thor) is floating, so the pulled image can
+  # change between deployments; verify it before use.
+  # NOTE(review): nemotron_json may need --tool-parser-plugin to be registered in
+  # vLLM — confirm against this image.
+  - name: "AGX-THOR"
+    displayName: "Jetson AGX Thor (arm64, iGPU) — nemotron-nano-9b-v2-fp8 (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor"
+          hfModel: "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
+          servedModelName: "nvidia/nemotron-nano-9b-v2-fp8"
+          tensorParallel: "1"
+          gpuMemUtil: "0.25"
+          maxModelLen: "16384"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--trust-remote-code"
+            - "--enable-auto-tool-choice"
+            - "--tool-call-parser"
+            - "nemotron_json"
+
+  # IGX + Thor iGPU. The `add` payload is identical to the AGX-THOR preset: same
+  # floating Jetson image tag, model, gpuMemUtil 0.25 and tool-call flags.
+  # Marked unvalidated in its display name; verify the image tag before use.
+  - name: "IGX-THOR"
+    displayName: "IGX + Thor iGPU (arm64) — nemotron-nano-9b-v2-fp8 (unvalidated)"
+    group: "VSS Platform"
+    remove: []
+    add: |
+      manifests:
+        llm-vllm:
+          replicas: 1
+          gpuCount: 1
+          image: "ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor"
+          hfModel: "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
+          servedModelName: "nvidia/nemotron-nano-9b-v2-fp8"
+          tensorParallel: "1"
+          gpuMemUtil: "0.25"
+          maxModelLen: "16384"
+          maxNumSeqs: "4"
+          extraArgs:
+            - "--trust-remote-code"
+            - "--enable-auto-tool-choice"
+            - "--tool-call-parser"
+            - "nemotron_json"
diff --git a/packs/nvidia-vss-vllm-1.0.0/schema.yaml b/packs/nvidia-vss-vllm-1.0.0/schema.yaml
@@ -0,0 +1,16 @@
+# Pack value constraints for the vLLM LLM backend.
+# Keys are full dotted paths into values.yaml (the manifests.llm-vllm.* sub-tree).
+# Required string settings: container image, model id, and served-model name.
+manifests.llm-vllm.image:
+  schema: '{{ required | format "${string}" | hints "vLLM container image (nvcr.io/nvidia/vllm:* on x86/SBSA; ghcr.io/nvidia-ai-iot/vllm:*-jetson-thor on Tegra)." }}'
+manifests.llm-vllm.hfModel:
+  schema: '{{ required | format "${string}" | hints "HF model id vLLM loads (e.g. NousResearch/Meta-Llama-3.1-8B-Instruct)." }}'
+manifests.llm-vllm.servedModelName:
+  schema: '{{ required | format "${string}" | hints "OpenAI served-model name the application pack references via llmModel." }}'
+# Optional numeric settings.
+# NOTE(review): gpuMemUtil, maxModelLen and maxNumSeqs are declared ${number} here
+# but are written as quoted strings in values.yaml and the presets — confirm the
+# validator accepts quoted numerics.
+manifests.llm-vllm.gpuMemUtil:
+  schema: '{{ format "${number}" | hints "vLLM --gpu-memory-utilization (LLM share of the GPU pool, e.g. 0.35 on GB10)." }}'
+manifests.llm-vllm.maxModelLen:
+  schema: '{{ format "${number}" | hints "vLLM --max-model-len." }}'
+manifests.llm-vllm.maxNumSeqs:
+  schema: '{{ format "${number}" | hints "vLLM --max-num-seqs." }}'
+manifests.llm-vllm.replicas:
+  schema: '{{ format "${number}" | hints "vLLM Deployment replicas (1)." }}'
diff --git a/packs/nvidia-vss-vllm-1.0.0/values.yaml b/packs/nvidia-vss-vllm-1.0.0/values.yaml
@@ -0,0 +1,68 @@
+# spectrocloud.com/enabled-presets: VSS Platform:DGX-SPARK
+#
+# ┌─ VSS PLATFORM MATRIX (canonical — keep identical across all VSS packs) ───────
+# │ Set ONE profile variable VSS_PLATFORM and select the matching "VSS Platform"
+# │ preset in EVERY VSS pack. Packs per profile: data-infra + core-nims +
+# │ application (always) + nvidia-vss-vllm (vLLM profiles only; H100/L40S omit it).
+# │
+# │ Platform          LLM backend   LLM model              VLM mem/len  decode    validated
+# │ DGX-SPARK         vLLM          llama-3.1-8b           0.28/10240   disabled  YES (GB10)
+# │ OTHER             vLLM          llama-3.1-8b           0.4 /16384   disabled  no
+# │ H100              nim (trtllm)  llama-3.1-70b (gpu 4)  0.8 /32768   dGPU      no
+# │ L40S              nim (trtllm)  llama-3.1-8b  (gpu 2)  0.8 /32768   dGPU      no
+# │ RTXPRO6000BW      vLLM          nemotron-nano-9b-fp8   0.4 /32768   dGPU      no
+# │ AGX-THOR/IGX-THOR vLLM (jetson) nemotron-nano-9b-fp8   0.4 /16384   Tegra*    no
+# │ "VLM mem/len" = the application vss-engine VLM; the vLLM LLM engine has its own
+# │   gpu-mem/max-len in the nvidia-vss-vllm presets (do not conflate the two).
+# │ * Tegra: runtimeClassName nvidia; the NVIDIA container runtime injects L4T libs.
+# └──────────────────────────────────────────────────────────────────────────────
+# NVIDIA VSS LLM backend — raw bounded vLLM (manifest-only pack).
+#
+# ═══════════════════════════════════════════════════════════════════════════════
+# WHY A SEPARATE PACK (P2-P4, see PLATFORM-SUPPORT-PLAN.md §6.4 / §8)
+# ═══════════════════════════════════════════════════════════════════════════════
+# VSS needs one of two mutually-exclusive LLM backends, selected by hardware:
+#   - raw vLLM (THIS pack)         → DGX-SPARK / RTXPRO6000BW / AGX-THOR / IGX-THOR / OTHER
+#   - nim-llm TensorRT-LLM subchart → H100 / L40S   (in nvidia-vss-core-nims)
+# Palette presets can parameterize a kubeManifest but cannot EXCLUDE one, and both
+# backends create a Service named llm-nim-svc — so they must never coexist. We make
+# the choice by PROFILE COMPOSITION: add this pack only to vLLM profiles; trtllm
+# profiles simply omit it. (Confirmed against docs.spectrocloud.com.)
+#
+# The manifest (manifests/llm-vllm.yaml) is Go-templated against the
+# manifests.llm-vllm.* block below; the "VSS Platform" presets set per-profile
+# values. install-priority 12 → after core-nims (10, creates ngc-pull-secret) and
+# data-infra (5, creates hf-token-secret), before application (15).
+
+# Pack-level settings: target namespace, install ordering, and the container
+# images this pack references.
+pack:
+  namespace: nvidia-vss
+  # Quoted so the priority stays a string.
+  spectrocloud.com/install-priority: "12"
+  content:
+    images:
+      # Default vLLM image and the nginx sidecar image (match manifests.llm-vllm
+      # image / nginxImage below).
+      - image: nvcr.io/nvidia/vllm:25.12.post1-py3
+      - image: nginx:1.30.2-alpine
+      # Jetson Thor (AGX-THOR/IGX-THOR presets) — floating tag, verify before use
+      - image: ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor
+
+# Values consumed by manifests/llm-vllm.yaml as .Values.<key>.
+manifests:
+  llm-vllm:
+    # Namespace applied to every object in the manifest.
+    namespace: nvidia-vss
+    # DGX-SPARK (GB10) default — the empirically-validated config.
+    # replicas → Deployment replicas; gpuCount → nvidia.com/gpu limit.
+    replicas: 1
+    gpuCount: 1
+    # image → vllm container; nginxImage → nginx sidecar container.
+    image: "nvcr.io/nvidia/vllm:25.12.post1-py3"
+    nginxImage: "nginx:1.30.2-alpine"
+    # hfModel → --model; servedModelName → --served-model-name.
+    hfModel: "NousResearch/Meta-Llama-3.1-8B-Instruct"
+    servedModelName: "meta/llama-3.1-8b-instruct"
+    # tensorParallel → --tensor-parallel-size; gpuMemUtil → --gpu-memory-utilization;
+    # maxModelLen → --max-model-len; maxNumSeqs → --max-num-seqs.
+    tensorParallel: "1"
+    gpuMemUtil: "0.35"
+    maxModelLen: "8192"
+    maxNumSeqs: "4"
+    # cacheSize → storage request of the llm-vllm-cache PVC (mounted at /cache);
+    # dshmSize → sizeLimit of the memory-backed /dev/shm volume.
+    cacheSize: "80Gi"
+    dshmSize: "16Gi"
+    # memRequest / cpuRequest → resource requests of the vllm container.
+    memRequest: "8Gi"
+    cpuRequest: "2"
+    # Extra vLLM args appended verbatim. DGX-SPARK uses --enforce-eager.
+    # Nemotron/Edge models also need their tool-call parser flags (see presets).
+    extraArgs:
+      - "--enforce-eager"