From 85dcb498a096ff30f941af38bdbf563bf68c1b54 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 2 Jun 2026 18:22:11 +0000 Subject: [PATCH 01/14] Add Super-49B Helm answer generation --- nemo_retriever/helm/README.md | 49 +++- nemo_retriever/helm/templates/NOTES.txt | 4 + nemo_retriever/helm/templates/configmap.yaml | 30 +- nemo_retriever/helm/templates/deployment.yaml | 20 ++ .../llama-3-3-nemotron-super-49b-v1-5.yaml | 53 ++++ nemo_retriever/helm/values.yaml | 91 ++++++ nemo_retriever/pyproject.toml | 4 +- .../nemo_retriever/evaluation/generation.py | 6 + .../src/nemo_retriever/llm/clients/litellm.py | 41 ++- .../src/nemo_retriever/params/models.py | 2 + .../src/nemo_retriever/retriever.py | 2 + .../src/nemo_retriever/service/cli.py | 8 + .../src/nemo_retriever/service/config.py | 20 ++ .../service/retriever-service.yaml | 16 + .../nemo_retriever/service/routers/ingest.py | 122 ++++++++ .../tests/test_helm_nimcache_model_profile.py | 96 +++--- .../test_helm_super49b_answer_generation.py | 144 +++++++++ nemo_retriever/tests/test_live_rag.py | 2 + nemo_retriever/tests/test_llm_params.py | 33 +++ .../tests/test_service_answer_generation.py | 276 ++++++++++++++++++ .../tests/test_service_packaging.py | 28 ++ nemo_retriever/uv.lock | 5 +- 22 files changed, 1005 insertions(+), 47 deletions(-) create mode 100644 nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml create mode 100644 nemo_retriever/tests/test_helm_super49b_answer_generation.py create mode 100644 nemo_retriever/tests/test_service_answer_generation.py create mode 100644 nemo_retriever/tests/test_service_packaging.py diff --git a/nemo_retriever/helm/README.md b/nemo_retriever/helm/README.md index 67d5409376..81f8d1a459 100644 --- a/nemo_retriever/helm/README.md +++ b/nemo_retriever/helm/README.md @@ -211,14 +211,15 @@ helm install retriever ./nemo_retriever/helm \ --set ngcApiSecret.password=$NGC_API_KEY ``` -> The VL reranker (`rerankqa`), Nemotron Parse, the Nemotron 3 Nano Omni 30B caption NIM, and the Parakeet `audio` ASR NIM are **all off by default** in 26.05 — they only reconcile when you explicitly opt in. Opt-in flags: +> The VL reranker (`rerankqa`), Nemotron Parse, the Nemotron 3 Nano Omni 30B caption NIM, the Super-49B answer-generation LLM, and the Parakeet `audio` ASR NIM are **all off by default** in 26.05 — they only reconcile when you explicitly opt in. Opt-in flags: > > * VL reranker — `--set nimOperator.rerankqa.enabled=true` > * Nemotron Parse — `--set nimOperator.nemotron_parse.enabled=true` > * Omni 30B captioner — `--set nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.enabled=true` +> * Super-49B answer generation — `--set nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled=true` > * Parakeet ASR — `--set nimOperator.audio.enabled=true` (also set `serviceConfig.nimEndpoints.audioGrpcEndpoint=audio:50051` to wire ASR into the service, plus `service.installFfmpeg=true` if your image does not bundle ffmpeg) > -> This matches the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md) and avoids silently pulling ≈ 62 GiB of Omni weights or claiming a second dedicated GPU on a "default" install. See the [model hardware requirements](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#model-hardware-requirements) table for per-NIM GPU and disk costs. +> This matches the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md) and avoids silently pulling ≈ 62 GiB of Omni weights, loading a large two-GPU LLM, or claiming extra dedicated GPUs on a "default" install. See the [model hardware requirements](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#model-hardware-requirements) table for per-NIM GPU and disk costs. The chart auto-wires the operator-managed in-cluster URLs of the four "core" NIMs into the service's `nim_endpoints` block: @@ -318,6 +319,12 @@ The retriever service picks up the in-cluster ASR endpoint when `nimOperator.aud | `serviceConfig.pipeline.batchWorkers` | `48` | Per-pod batch worker count. See [Timeouts and alleviating ingest failures](#timeouts-and-alleviating-ingest-failures) if embed or pool errors appear under load. | | `serviceConfig.nimEndpoints.*InvokeUrl` | `""` | Override the auto-resolved NIM Operator URL. Available knobs: `pageElementsInvokeUrl`, `tableStructureInvokeUrl`, `ocrInvokeUrl`, `embedInvokeUrl`, and `captionInvokeUrl` (see [Image captioning (Omni 30B)](#image-captioning-omni-30b)). | | `serviceConfig.nimEndpoints.captionModelName` | `""` | Model id sent to the remote VLM. Auto-set to `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning` whenever a caption URL is resolved. | +| `serviceConfig.llm.enabled` | `false` | Enables `POST /v1/answer`. Auto-flips to true when the Super-49B NIM is enabled and the operator URL resolves. | +| `serviceConfig.llm.apiBase` | `""` | OpenAI-compatible LLM base URL. Explicit value wins; otherwise Super-49B opt-in resolves to `http://llama-3-3-nemotron-super-49b-v1-5:8000/v1`. | +| `serviceConfig.llm.apiKeySecret.name` | `""` | Optional Secret name for external LLM credentials. When set, the service reads the key from `NEMO_RETRIEVER_LLM_API_KEY`; the key is not written to the ConfigMap. | +| `serviceConfig.llm.apiKeySecret.key` | `api_key` | Secret key for external LLM credentials. | +| `serviceConfig.llm.model` | `openai/nvidia/llama-3.3-nemotron-super-49b-v1.5` | Model id passed to LiteLLM for answer generation. | +| `serviceConfig.llm.ragSystemPromptPrefix` | `/no_think` | Prepended to the RAG system prompt for Super-49B. | | `serviceConfig.vectordb.enabled` | `true` | Deploy the LanceDB vectordb Pod. When `true` the chart **requires** a resolvable embed endpoint (see [VectorDB and the embed endpoint](#vectordb-and-the-embed-endpoint)); `helm install` / `helm upgrade` fails fast otherwise. | | `serviceConfig.vectordb.lancedbUri` | `/data/vectordb` | LanceDB on the vectordb Pod's PVC. | | `serviceConfig.vectordb.embedModel` | `nvidia/llama-nemotron-embed-vl-1b-v2` | Passed to vectordb + worker `embed_model_name`. | @@ -355,6 +362,42 @@ sources](#3-install-with-the-nim-operator-in-cluster-nims)): `apps.nvidia.com/v1alpha1` CRDs are installed in the cluster. 3. Otherwise the chart fails the install. +#### Answer generation (Super-49B) { #answer-generation-super-49b } + +Enable the Super-49B NIM to add service-mode answer generation on top of +the VectorDB query path: + +```bash +helm upgrade --install retriever ./nemo_retriever/helm \ + --set nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled=true +``` + +When the NIM Operator CRDs are present, the chart renders a +`llama-3-3-nemotron-super-49b-v1-5` NIMCache/NIMService and writes this +block into `retriever-service.yaml`: + +```yaml +llm: + enabled: true + model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" + api_base: "http://llama-3-3-nemotron-super-49b-v1-5:8000/v1" + rag_system_prompt_prefix: "/no_think" +``` + +The retriever service then exposes `POST /v1/answer`, which calls the +VectorDB pod's `/v1/query` endpoint for context and sends those chunks to +the configured LLM endpoint. The default Super-49B NIMService resources request two GPUs +(`nvidia.com/gpu: 2`) to match the bundled tensor-parallel NIM +profile. Override `resources`, `modelProfile`, or `env` for deployments +that use a different profile or hardware topology. + +`serviceConfig.llm.apiBase` and `serviceConfig.llm.model` can be set +explicitly to point `/v1/answer` at an external OpenAI-compatible LLM +instead of deploying Super-49B in-cluster. For external credentials, +create a Kubernetes Secret and set `serviceConfig.llm.apiKeySecret.name` +plus `serviceConfig.llm.apiKeySecret.key`; Helm mounts the Secret as an +environment variable instead of writing the key into the ConfigMap. + ### NIM Operator sub-stack Each NIM block under `nimOperator.` renders a `NIMCache` + `NIMService` @@ -377,6 +420,7 @@ pair gated on three conditions ALL holding: | `nimOperator.rerankqa.enabled` | `false` | VL reranker NIM (optional; not auto-wired). Set `true` to opt in. Default `false` so 26.05 installs honor the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md) and do not silently provision an extra ≈ 3.1 GiB GPU NIM. The image points at the **VL** SKU (`llama-nemotron-rerank-vl-1b-v2`) per [prerequisites-support-matrix.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#default-helm-nims) — the text-only `llama-nemotron-rerank-1b-v2` silently degrades multimodal reranking and is not the documented POR. | | `nimOperator.nemotron_parse.enabled` | `false` | Structured-parse NIM (optional). Set `true` when using `extract_method="nemotron_parse"`. Default `false` so 26.05 installs honor the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md). Image tag follows the [image tag conventions](#image-tag-conventions). | | `nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.enabled` | `false` | Omni 30B caption NIM (optional). Set `true` to enable image captioning — see [Image captioning (Omni 30B)](#image-captioning-omni-30b). Default `false` so 26.05 installs do not silently pull ≈ 62 GiB of BF16 weights or claim a second dedicated GPU. Image tag follows the [image tag conventions](#image-tag-conventions). | +| `nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled` | `false` | Super-49B answer-generation NIM (optional). Set `true` to enable `/v1/answer` — see [Answer generation (Super-49B)](#answer-generation-super-49b). Default `false` so installs do not silently claim two dedicated GPUs. | | `nimOperator.audio.enabled` | `false` | Parakeet ASR NIM (optional). Set `true` for audio/video transcription; pair with `serviceConfig.nimEndpoints.audioGrpcEndpoint=audio:50051` so the retriever-service can reach it. | | `nimOperator..image.repository` | `nvcr.io/nim/nvidia/...` | Per-NIM image. | | `nimOperator..image.pullSecrets` | `[ngc-secret]` | Referenced by the NIMService CR. | @@ -1106,6 +1150,7 @@ Verify tags on the Git branch or tag you ship (for example `26.05` or | VL reranker (optional) | `rerankqa` | `nvcr.io/nim/nvidia/llama-nemotron-rerank-vl-1b-v2:1.10.0` | | Nemotron Parse (optional) | `nemotron_parse` | `nvcr.io/nim/nvidia/nemotron-parse-v1.2:1.7.0-variant` | | Omni caption (optional) | `nemotron_3_nano_omni_30b_a3b_reasoning` | `nvcr.io/nim/nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:1.7.0-variant` | +| Super-49B answer LLM (optional) | `llama_3_3_nemotron_super_49b_v1_5` | `nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5:2.0.5` | | Parakeet ASR (optional) | `audio` | `nvcr.io/nim/nvidia/parakeet-1-1b-ctc-en-us:1.5.0` | GPU SKU support for `audio` is in [Model hardware requirements](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#model-hardware-requirements). diff --git a/nemo_retriever/helm/templates/NOTES.txt b/nemo_retriever/helm/templates/NOTES.txt index a78be88fad..d2c1cec6d8 100644 --- a/nemo_retriever/helm/templates/NOTES.txt +++ b/nemo_retriever/helm/templates/NOTES.txt @@ -59,6 +59,10 @@ Services: {{- if .Values.nimOperator.vlm_embed.enabled }} - {{ .Values.nimOperator.vlm_embed.nimServiceName }} → http://{{ .Values.nimOperator.vlm_embed.nimServiceName }}:{{ .Values.nimOperator.vlm_embed.expose.service.port }}/v1/embeddings {{- end }} +{{- if .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled }} + - {{ .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.nimServiceName }} → http://{{ .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.nimServiceName }}:{{ .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.expose.service.port }}/v1/chat/completions + (auto-wired into the retriever service config as `llm.api_base` for `/v1/answer`) +{{- end }} {{- if .Values.nimOperator.rerankqa.enabled }} - llama-nemotron-rerank-vl-1b-v2 → http://llama-nemotron-rerank-vl-1b-v2:{{ .Values.nimOperator.rerankqa.expose.service.port }} {{- end }} diff --git a/nemo_retriever/helm/templates/configmap.yaml b/nemo_retriever/helm/templates/configmap.yaml index 62860e4d6b..435acf1942 100644 --- a/nemo_retriever/helm/templates/configmap.yaml +++ b/nemo_retriever/helm/templates/configmap.yaml @@ -13,6 +13,7 @@ inherits the NIMService resource name, so the mapping is fixed: ocr -> nemotron-ocr-v1 /v1/infer vlm_embed -> llama-nemotron-embed-vl-1b-v2 /v1/embeddings nemotron_3_nano_omni_30b_a3b_reasoning -> nemotron-3-nano-omni-30b-a3b-reasoning /v1/chat/completions + llama_3_3_nemotron_super_49b_v1_5 -> llama-3-3-nemotron-super-49b-v1-5 /v1 */}} {{- $ctx := . -}} {{- $pageElementsURL := include "nemo-retriever.nim.endpointURL" (dict "context" $ctx "key" "page_elements" "serviceName" "nemotron-page-elements-v3" "configKey" "pageElementsInvokeUrl" "invokePath" "/v1/infer") -}} @@ -34,6 +35,17 @@ inherits the NIMService resource name, so the mapping is fixed: {{- $captionModelName = "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning" -}} {{- end -}} {{- $audioGrpcEndpoint := $ctx.Values.serviceConfig.nimEndpoints.audioGrpcEndpoint | default "" -}} +{{- $llmAPIBase := $ctx.Values.serviceConfig.llm.apiBase | default "" -}} +{{- $llmOperatorAPIBase := "" -}} +{{- if not $llmAPIBase -}} +{{- $llmOperatorAPIBase = include "nemo-retriever.nimOperator.url" (dict "context" $ctx "key" "llama_3_3_nemotron_super_49b_v1_5" "serviceName" $ctx.Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.nimServiceName "invokePath" "/v1") -}} +{{- $llmAPIBase = $llmOperatorAPIBase -}} +{{- end -}} +{{- $llmEnabled := or $ctx.Values.serviceConfig.llm.enabled $llmAPIBase -}} +{{- $llmRagSystemPromptPrefix := $ctx.Values.serviceConfig.llm.ragSystemPromptPrefix | default "" -}} +{{- if and (not $llmRagSystemPromptPrefix) $llmOperatorAPIBase -}} +{{- $llmRagSystemPromptPrefix = "/no_think" -}} +{{- end -}} {{- define "nemo-retriever.configBody" -}} server: @@ -56,6 +68,20 @@ nim_endpoints: audio_grpc_endpoint: {{ if .audioGrpcEndpoint }}{{ .audioGrpcEndpoint | quote }}{{ else }}null{{ end }} api_key: null +llm: + enabled: {{ if .llmEnabled }}true{{ else }}false{{ end }} + model: {{ .Values.serviceConfig.llm.model | quote }} + api_base: {{ if .llmAPIBase }}{{ .llmAPIBase | quote }}{{ else }}null{{ end }} + api_key: null + temperature: {{ .Values.serviceConfig.llm.temperature }} + top_p: {{ if eq .Values.serviceConfig.llm.topP nil }}null{{ else }}{{ .Values.serviceConfig.llm.topP }}{{ end }} + max_tokens: {{ .Values.serviceConfig.llm.maxTokens }} + extra_params: {{ toJson .Values.serviceConfig.llm.extraParams }} + num_retries: {{ .Values.serviceConfig.llm.numRetries }} + timeout: {{ .Values.serviceConfig.llm.timeout }} + rag_system_prompt: {{ if .Values.serviceConfig.llm.ragSystemPrompt }}{{ .Values.serviceConfig.llm.ragSystemPrompt | quote }}{{ else }}null{{ end }} + rag_system_prompt_prefix: {{ if .llmRagSystemPromptPrefix }}{{ .llmRagSystemPromptPrefix | quote }}{{ else }}null{{ end }} + pipeline: realtime_workers: {{ .Values.serviceConfig.pipeline.realtimeWorkers }} realtime_queue_size: {{ .Values.serviceConfig.pipeline.realtimeQueueSize }} @@ -100,7 +126,7 @@ metadata: data: retriever-service.yaml: | mode: standalone -{{ include "nemo-retriever.configBody" (dict "Values" .Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} +{{ include "nemo-retriever.configBody" (dict "Values" .Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "llmEnabled" $llmEnabled "llmAPIBase" $llmAPIBase "llmRagSystemPromptPrefix" $llmRagSystemPromptPrefix "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} {{- else }} # ========================================================================= # Split mode — one ConfigMap per role with the appropriate mode + gateway @@ -128,6 +154,6 @@ data: timeout_s: 300.0 max_connections: 100 {{- end }} -{{ include "nemo-retriever.configBody" (dict "Values" $.Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} +{{ include "nemo-retriever.configBody" (dict "Values" $.Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "llmEnabled" $llmEnabled "llmAPIBase" $llmAPIBase "llmRagSystemPromptPrefix" $llmRagSystemPromptPrefix "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} {{- end }} {{- end }} diff --git a/nemo_retriever/helm/templates/deployment.yaml b/nemo_retriever/helm/templates/deployment.yaml index 508ad5eed2..85e6152a92 100644 --- a/nemo_retriever/helm/templates/deployment.yaml +++ b/nemo_retriever/helm/templates/deployment.yaml @@ -97,6 +97,16 @@ spec: name: {{ .Values.ngcApiSecret.name }} key: NGC_API_KEY optional: true + {{- with .Values.serviceConfig.llm.apiKeySecret }} + {{- if .name }} + - name: NEMO_RETRIEVER_LLM_API_KEY + valueFrom: + secretKeyRef: + name: {{ .name | quote }} + key: {{ .key | default "api_key" | quote }} + optional: {{ .optional | default false }} + {{- end }} + {{- end }} {{- if $svc.installFfmpeg }} - name: INSTALL_FFMPEG value: "true" @@ -261,6 +271,16 @@ spec: name: {{ $.Values.ngcApiSecret.name }} key: NGC_API_KEY optional: true + {{- with $.Values.serviceConfig.llm.apiKeySecret }} + {{- if .name }} + - name: NEMO_RETRIEVER_LLM_API_KEY + valueFrom: + secretKeyRef: + name: {{ .name | quote }} + key: {{ .key | default "api_key" | quote }} + optional: {{ .optional | default false }} + {{- end }} + {{- end }} {{- if $svc.installFfmpeg }} - name: INSTALL_FFMPEG value: "true" diff --git a/nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml b/nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml new file mode 100644 index 0000000000..53f9660e38 --- /dev/null +++ b/nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml @@ -0,0 +1,53 @@ +{{- if and (and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") .Values.nims.enabled) (eq .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled true) -}} +{{- $cfg := .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5 -}} +{{- $name := $cfg.nimServiceName -}} +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMCache +metadata: + name: {{ $name }} + {{- include "nemo-retriever.nimcache.keepPolicy" . | nindent 2 }} +spec: + source: + ngc: + modelPuller: "{{ $cfg.image.repository }}:{{ $cfg.image.tag }}" + pullSecret: "{{ index $cfg.image.pullSecrets 0 }}" + authSecret: {{ $cfg.authSecret }} + {{- include "nemo-retriever.nimcache.modelBlock" (dict "context" $ "key" "llama_3_3_nemotron_super_49b_v1_5") | nindent 6 }} + storage: + pvc: + create: {{ $cfg.storage.pvc.create }} + storageClass: {{ $cfg.storage.pvc.storageClass | quote }} + size: {{ $cfg.storage.pvc.size }} + volumeAccessMode: {{ $cfg.storage.pvc.volumeAccessMode }} +--- +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMService +metadata: + name: {{ $name }} +spec: + image: + repository: {{ $cfg.image.repository }} + tag: {{ $cfg.image.tag }} + pullPolicy: {{ $cfg.image.pullPolicy }} + pullSecrets: +{{ toYaml $cfg.image.pullSecrets | indent 6 }} + authSecret: {{ $cfg.authSecret }} + storage: + nimCache: + name: {{ $name }} + replicas: {{ $cfg.replicas }} + nodeSelector: +{{ toYaml $cfg.nodeSelector | indent 4 }} + {{- include "nemo-retriever.nimServiceResources" (dict "context" $ "resources" $cfg.resources) | nindent 2 }} + tolerations: +{{ toYaml $cfg.tolerations | indent 4 }} + expose: +{{ toYaml $cfg.expose | indent 4 }} + env: +{{ toYaml $cfg.env | indent 4 }} + - name: NGC_API_KEY + valueFrom: + secretKeyRef: + name: {{ $cfg.authSecret | quote }} + key: NGC_API_KEY +{{- end }} diff --git a/nemo_retriever/helm/values.yaml b/nemo_retriever/helm/values.yaml index 9fa45dff20..533a8dc697 100644 --- a/nemo_retriever/helm/values.yaml +++ b/nemo_retriever/helm/values.yaml @@ -490,6 +490,35 @@ serviceConfig: # Required for audio/video ingestion in service mode (without torch). audioGrpcEndpoint: "" + # Remote LLM endpoint used by POST /v1/answer. When the Super-49B + # NIM is enabled under nimOperator.llama_3_3_nemotron_super_49b_v1_5 + # (and the NIM Operator CRDs exist), the chart auto-wires apiBase to + # the in-cluster OpenAI-compatible /v1 endpoint and flips answering on. + # Explicit apiBase/model values below always win, which lets operators + # point /v1/answer at an external OpenAI-compatible endpoint instead. + llm: + enabled: false + model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" + apiBase: "" + # Optional Secret reference for external LLM credentials. The Secret is + # mounted as NEMO_RETRIEVER_LLM_API_KEY and read by the service CLI, so + # the key is not rendered into the retriever-service ConfigMap. + apiKeySecret: + name: "" + key: api_key + optional: false + temperature: 0.0 + topP: null + maxTokens: 512 + extraParams: {} + numRetries: 3 + timeout: 180.0 + ragSystemPrompt: "" + # Optional prefix prepended to the RAG system prompt. Leave empty + # for external LLMs; the chart auto-wires /no_think when it + # resolves the in-cluster Super-49B NIM endpoint. + ragSystemPromptPrefix: "" + # Pipeline worker pools. Workers are abstract dispatchers — sizing # depends on whether they do local GPU work or fan out to remote NIMs. # For CPU-only NIM-forwarding nodes, higher worker counts are fine. @@ -805,6 +834,8 @@ nimOperator: # - nemotron_parse (Parse v1.2, ≈ 3.5 GiB GPU) # - nemotron_3_nano_omni_30b_a3b_reasoning (Omni 30B caption NIM, # ≈ 62 GiB BF16 weights) + # - llama_3_3_nemotron_super_49b_v1_5 (Super-49B answer LLM, + # 2-GPU tensor-parallel NIM) # - audio (Parakeet ASR) # # Two flags gate every NIM, so each one is opt-in along two independent @@ -1005,6 +1036,66 @@ nimOperator: # whole sub-stack. # --------------------------------------------------------------------------- + # Llama 3.3 Nemotron Super 49B v1.5. Optional answer-generation LLM. + # + # Disabled by default because it is much larger than the core extraction + # NIMs. The default resources request two GPUs and only auto-wire + # /v1/answer after explicit opt-in. + llama_3_3_nemotron_super_49b_v1_5: + enabled: false + # NIMService / NIMCache resource name and in-cluster Service DNS label. + nimServiceName: llama-3-3-nemotron-super-49b-v1-5 + image: + repository: nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5 + tag: "2.0.5" + pullPolicy: IfNotPresent + pullSecrets: + - ngc-secret + authSecret: ngc-api + # Per-NIM NIMCache GPU/profile filter. See `nimOperator.modelProfile` + # above for the shape; non-empty REPLACES the chart-wide default. + # Cache only the default BF16 2-GPU profile used by the Super-49B + # service resources. Override this when deploying a different profile + # or GPU topology. + modelProfile: + profiles: + - "1146f49f84dff5dea09f5aa633cc70b92d7d972223d67878c841cd0fbccad4fb" + storage: + pvc: + create: true + storageClass: "" + size: "250Gi" + volumeAccessMode: ReadWriteOnce + replicas: 1 + # Super-49B uses a 2-GPU tensor-parallel profile by default. This + # non-empty resources block intentionally replaces + # nimOperator.nimServiceGpuLimit. + resources: + limits: + nvidia.com/gpu: 2 + nodeSelector: {} + tolerations: [] + expose: + service: + type: ClusterIP + port: 8000 + grpcPort: 8001 + env: + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_TENSOR_PARALLEL_SIZE + value: "2" + # Conservative tensor-parallel startup default; override when your + # deployment fabric has been validated with custom all-reduce enabled. + - name: NIM_PASSTHROUGH_ARGS + value: "--disable-custom-all-reduce" + # Conservative single-pod tensor-parallel defaults that avoid + # transport assumptions across GPU fabrics. + - name: NCCL_IB_DISABLE + value: "1" + - name: NCCL_P2P_DISABLE + value: "1" + # Llama Nemotron rerank VL 1B v2 — multimodal reranking NIM. # # Disabled by default per the 26.05 "optional and disabled by default" diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 355f23f584..7213b35b1a 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -90,6 +90,8 @@ dependencies = [ service = [ # HTTP retry decorator used by NIM model interfaces "backoff", + # Remote answer generation client used by serviceConfig.llm and /v1/answer + "litellm>=1.86.0", # Lightweight utilities used by pipeline operators "glom", "easydict", @@ -181,7 +183,7 @@ benchmarks = [ # or construct an ``LLMJudge`` / ``LiteLLMClient`` directly. Powers both the # live-RAG SDK and the batch evaluation framework. llm = [ - "litellm>=1.86.0rc1", + "litellm>=1.86.0", ] dev = [ diff --git a/nemo_retriever/src/nemo_retriever/evaluation/generation.py b/nemo_retriever/src/nemo_retriever/evaluation/generation.py index a3ad7f1af5..db9d474823 100644 --- a/nemo_retriever/src/nemo_retriever/evaluation/generation.py +++ b/nemo_retriever/src/nemo_retriever/evaluation/generation.py @@ -42,6 +42,8 @@ def __init__( num_retries: int = 3, timeout: float = 120.0, max_workers: int = 8, + rag_system_prompt: Optional[str] = None, + rag_system_prompt_prefix: Optional[str] = None, ) -> None: super().__init__( model=model, @@ -54,6 +56,8 @@ def __init__( num_retries=num_retries, timeout=timeout, max_workers=max_workers, + rag_system_prompt=rag_system_prompt, + rag_system_prompt_prefix=rag_system_prompt_prefix, ) self._client = LiteLLMClient.from_kwargs( model=model, @@ -65,6 +69,8 @@ def __init__( extra_params=extra_params, num_retries=num_retries, timeout=timeout, + rag_system_prompt=rag_system_prompt, + rag_system_prompt_prefix=rag_system_prompt_prefix, ) self._max_workers = max_workers diff --git a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py index 78895c2989..e69d58649d 100644 --- a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py +++ b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py @@ -38,12 +38,39 @@ Answer:""" -def _build_rag_prompt(query: str, chunks: list[str]) -> list[dict]: +def _format_rag_system_prompt( + *, + rag_system_prompt: Optional[str] = None, + rag_system_prompt_prefix: Optional[str] = None, +) -> str: + """Resolve the system prompt used for RAG answer generation.""" + prompt = (rag_system_prompt if rag_system_prompt is not None else _RAG_SYSTEM_PROMPT).strip() + prefix = (rag_system_prompt_prefix or "").strip() + if not prefix: + return prompt + if not prompt: + return prefix + return f"{prefix}\n{prompt}" + + +def _build_rag_prompt( + query: str, + chunks: list[str], + *, + rag_system_prompt: Optional[str] = None, + rag_system_prompt_prefix: Optional[str] = None, +) -> list[dict]: """Build the OpenAI-style messages list for a RAG prompt.""" context = "\n\n---\n\n".join(chunks) if chunks else "(no context retrieved)" user_content = _RAG_USER_TEMPLATE.format(context=context, query=query) return [ - {"role": "system", "content": _RAG_SYSTEM_PROMPT}, + { + "role": "system", + "content": _format_rag_system_prompt( + rag_system_prompt=rag_system_prompt, + rag_system_prompt_prefix=rag_system_prompt_prefix, + ), + }, {"role": "user", "content": user_content}, ] @@ -84,6 +111,10 @@ def __init__( # ``max_tokens=1024`` for captioning/summarization workloads; RAG # answers routinely exceed that, so the client overrides it. self.sampling = sampling if sampling is not None else LLMInferenceParams(temperature=0.0, max_tokens=4096) + self._rag_system_prompt = _format_rag_system_prompt( + rag_system_prompt=transport.rag_system_prompt, + rag_system_prompt_prefix=transport.rag_system_prompt_prefix, + ) @property def model(self) -> str: @@ -103,6 +134,8 @@ def from_kwargs( extra_params: Optional[dict[str, Any]] = None, num_retries: int = 3, timeout: float = 120.0, + rag_system_prompt: Optional[str] = None, + rag_system_prompt_prefix: Optional[str] = None, ) -> "LiteLLMClient": """Flat-kwarg constructor for zero-churn migration from the old signature. @@ -117,6 +150,8 @@ def from_kwargs( num_retries=num_retries, timeout=timeout, extra_params=extra_params or {}, + rag_system_prompt=rag_system_prompt, + rag_system_prompt_prefix=rag_system_prompt_prefix, ) sampling = LLMInferenceParams( temperature=temperature, @@ -169,7 +204,7 @@ def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tu def generate(self, query: str, chunks: list[str]) -> GenerationResult: """Generate an answer for the given query using retrieved chunks as context.""" - messages = _build_rag_prompt(query, chunks) + messages = _build_rag_prompt(query, chunks, rag_system_prompt=self._rag_system_prompt) try: raw_answer, latency = self.complete(messages) answer = strip_think_tags(raw_answer) diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 6372cccff8..5579b7c648 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -599,6 +599,8 @@ class LLMRemoteClientParams(_ParamsModel): num_retries: int = 3 timeout: float = 120.0 extra_params: dict[str, Any] = Field(default_factory=dict) + rag_system_prompt: Optional[str] = None + rag_system_prompt_prefix: Optional[str] = None @field_validator("num_retries") @classmethod diff --git a/nemo_retriever/src/nemo_retriever/retriever.py b/nemo_retriever/src/nemo_retriever/retriever.py index efafc09322..f2666a3caa 100644 --- a/nemo_retriever/src/nemo_retriever/retriever.py +++ b/nemo_retriever/src/nemo_retriever/retriever.py @@ -457,6 +457,8 @@ def generate( extra_params=dict(transport.extra_params) if transport.extra_params else None, num_retries=transport.num_retries, timeout=transport.timeout, + rag_system_prompt=transport.rag_system_prompt, + rag_system_prompt_prefix=transport.rag_system_prompt_prefix, ) else: operator = QAGenerationOperator(model=model, **kwargs) diff --git a/nemo_retriever/src/nemo_retriever/service/cli.py b/nemo_retriever/src/nemo_retriever/service/cli.py index 85690adb5c..648c0716fa 100644 --- a/nemo_retriever/src/nemo_retriever/service/cli.py +++ b/nemo_retriever/src/nemo_retriever/service/cli.py @@ -30,6 +30,12 @@ def start( nim_api_key: Optional[str] = typer.Option( None, "--nim-api-key", help="API key for NIM endpoints (overrides YAML / $NVIDIA_API_KEY)." ), + llm_api_key: Optional[str] = typer.Option( + None, + "--llm-api-key", + help="API key for LLM answer generation endpoints (overrides YAML / $NEMO_RETRIEVER_LLM_API_KEY).", + envvar="NEMO_RETRIEVER_LLM_API_KEY", + ), gpu_devices: Optional[str] = typer.Option( None, "--gpu-devices", help="Comma-separated GPU device IDs (overrides YAML)." ), @@ -59,6 +65,8 @@ def start( overrides["logging.file"] = log_file if nim_api_key is not None: overrides["nim_endpoints.api_key"] = nim_api_key + if llm_api_key is not None: + overrides["llm.api_key"] = llm_api_key if gpu_devices is not None: overrides["resources.gpu_devices"] = [d.strip() for d in gpu_devices.split(",") if d.strip()] if api_token is not None: diff --git a/nemo_retriever/src/nemo_retriever/service/config.py b/nemo_retriever/src/nemo_retriever/service/config.py index 069f193f68..1e50338c03 100644 --- a/nemo_retriever/src/nemo_retriever/service/config.py +++ b/nemo_retriever/src/nemo_retriever/service/config.py @@ -78,6 +78,25 @@ class NimEndpointsConfig(RichModel): api_key: str | None = None +class LLMConfig(RichModel): + """Remote LLM configuration for service-mode RAG answer generation.""" + + model_config = ConfigDict(extra="forbid") + + enabled: bool = False + model: str = "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" + api_base: str | None = None + api_key: str | None = None + temperature: float = 0.0 + top_p: float | None = None + max_tokens: int = 512 + extra_params: dict[str, Any] = Field(default_factory=dict) + num_retries: int = 3 + timeout: float = 180.0 + rag_system_prompt: str | None = None + rag_system_prompt_prefix: str | None = None + + class ResourceLimitsConfig(RichModel): model_config = ConfigDict(extra="forbid") @@ -265,6 +284,7 @@ class ServiceConfig(RichModel): server: ServerConfig = Field(default_factory=ServerConfig) logging: LoggingConfig = Field(default_factory=LoggingConfig) nim_endpoints: NimEndpointsConfig = Field(default_factory=NimEndpointsConfig) + llm: LLMConfig = Field(default_factory=LLMConfig) resources: ResourceLimitsConfig = Field(default_factory=ResourceLimitsConfig) auth: AuthConfig = Field(default_factory=AuthConfig) gateway: GatewayConfig = Field(default_factory=GatewayConfig) diff --git a/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml b/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml index d1dd74cef7..5d7472eea8 100644 --- a/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml +++ b/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml @@ -49,6 +49,22 @@ nim_endpoints: caption_model_name: null api_key: null +# Remote LLM endpoint used by POST /v1/answer. Helm auto-wires these +# fields when the Super-49B NIM is enabled. +llm: + enabled: false + model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" + api_base: null + api_key: null + temperature: 0.0 + top_p: null + max_tokens: 512 + extra_params: {} + num_retries: 3 + timeout: 180.0 + rag_system_prompt: null + rag_system_prompt_prefix: null + # Pipeline worker pools. Workers are abstract dispatchers — sizing # depends on whether they do local GPU work or fan out to remote NIMs. # For CPU-only NIM-forwarding nodes, higher worker counts are fine. diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index 63117ba58f..9b16a0463c 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -27,6 +27,7 @@ from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile from fastapi.responses import JSONResponse, Response +from pydantic import BaseModel, Field from starlette.responses import StreamingResponse from nemo_retriever.service.models.pipeline_spec import PipelineSpec @@ -78,6 +79,23 @@ # re-checks ``request.is_disconnected()`` and exits cleanly. SSE_KEEPALIVE_TIMEOUT_S = 30.0 + +class AnswerRequest(BaseModel): + query: str + top_k: int = Field(default=5, ge=1, le=1000) + include_chunks: bool = False + + +class AnswerResponse(BaseModel): + query: str + answer: str + model: str + latency_s: float + chunk_count: int + chunks: list[str] | None = None + metadata: list[dict[str, Any]] | None = None + + logger = logging.getLogger(__name__) router = APIRouter(tags=["ingest"]) @@ -1305,6 +1323,110 @@ async def pipeline_config(request: Request): ) +# ------------------------------------------------------------------ +# POST /v1/answer -- vector search + configured LLM generation +# ------------------------------------------------------------------ + + +def _text_from_hit(hit: dict[str, Any]) -> str: + value = hit.get("text") or hit.get("content") or hit.get("chunk") or hit.get("page_content") + return str(value or "") + + +def _metadata_from_hit(hit: dict[str, Any]) -> dict[str, Any]: + return {k: v for k, v in hit.items() if k not in {"text", "content", "chunk", "page_content", "vector"}} + + +@router.post( + "/answer", + response_model=AnswerResponse, + summary="Search ingested documents and generate an answer", +) +async def answer(req: AnswerRequest, request: Request) -> Response | AnswerResponse: + """Retrieve context from VectorDB and answer with the configured LLM.""" + import httpx + + config = request.app.state.config + + if not config.vectordb.enabled: + raise HTTPException( + status_code=404, + detail="VectorDB is not enabled in the service configuration.", + ) + + if not config.llm.enabled: + raise HTTPException( + status_code=404, + detail="LLM answer generation is not enabled in the service configuration.", + ) + + mode = _mode(request) + if mode in ("realtime", "batch"): + raise HTTPException( + status_code=404, + detail="Answer endpoint is not available on worker pods. Use the gateway.", + ) + + vectordb_url = config.vectordb.vectordb_url.rstrip("/") + target = f"{vectordb_url}/v1/query" + + try: + async with httpx.AsyncClient(timeout=60.0) as client: + resp = await client.post(target, json={"query": req.query, "top_k": req.top_k}) + except Exception as exc: + logger.exception("Failed to query vectordb at %s for answer generation", target) + raise HTTPException( + status_code=502, + detail=f"Failed to reach VectorDB service: {type(exc).__name__}: {exc}", + ) + + if resp.status_code != 200: + return Response(content=resp.content, status_code=resp.status_code, media_type="application/json") + + payload = resp.json() + result_sets = payload.get("results") or [] + hits = result_sets[0].get("hits", []) if result_sets else [] + chunks = [_text_from_hit(hit) for hit in hits] + metadata = [_metadata_from_hit(hit) for hit in hits] + + from nemo_retriever.llm.clients import LiteLLMClient + + llm_cfg = config.llm + llm = getattr(request.app.state, "answer_llm_client", None) + if llm is None: + llm = LiteLLMClient.from_kwargs( + model=llm_cfg.model, + api_base=llm_cfg.api_base, + api_key=llm_cfg.api_key, + temperature=llm_cfg.temperature, + top_p=llm_cfg.top_p, + max_tokens=llm_cfg.max_tokens, + extra_params=dict(llm_cfg.extra_params), + num_retries=llm_cfg.num_retries, + timeout=llm_cfg.timeout, + rag_system_prompt=llm_cfg.rag_system_prompt, + rag_system_prompt_prefix=llm_cfg.rag_system_prompt_prefix, + ) + request.app.state.answer_llm_client = llm + gen = await asyncio.to_thread(llm.generate, req.query, chunks) + if gen.error: + logger.error("LLM answer generation failed for model %s: %s", gen.model, gen.error) + raise HTTPException( + status_code=502, + detail=f"LLM answer generation failed: {gen.error}", + ) + + return AnswerResponse( + query=req.query, + answer=gen.answer, + model=gen.model, + latency_s=gen.latency_s, + chunk_count=len(chunks), + chunks=chunks if req.include_chunks else None, + metadata=metadata if req.include_chunks else None, + ) + + # ------------------------------------------------------------------ # POST /v1/query — vector search (proxied to vectordb pod) # ------------------------------------------------------------------ diff --git a/nemo_retriever/tests/test_helm_nimcache_model_profile.py b/nemo_retriever/tests/test_helm_nimcache_model_profile.py index 0f68f38c8f..119698013b 100644 --- a/nemo_retriever/tests/test_helm_nimcache_model_profile.py +++ b/nemo_retriever/tests/test_helm_nimcache_model_profile.py @@ -18,13 +18,14 @@ * ``values.yaml`` carries a chart-wide ``nimOperator.modelProfile`` default plus a per-NIM ``nimOperator..modelProfile`` override - for every NIMCache the chart provisions. Both default to ``{}`` so - existing releases keep their pre-fix behaviour. + for every NIMCache the chart provisions. Existing extraction NIMs + default their per-NIM override to ``{}``; Super-49B is the intentional + exception because it pins the bundled two-GPU profile by default. * A ``helm template`` with **no overrides** renders no ``model:`` - block on any NIMCache (preserves operator default). + block on default-empty-profile NIMCaches (preserves operator default). * ``--set nimOperator.modelProfile.gpus[0]...`` renders an identical - ``model:`` block on every NIMCache reconciled by the chart, with the - expected ``gpus`` / ``ids`` / ``product`` shape. + ``model:`` block on every rendered default-empty-profile NIMCache, + with the expected ``gpus`` / ``ids`` / ``product`` shape. * ``--set nimOperator..modelProfile.profiles[0]=...`` renders ``model.profiles`` ONLY on that NIM's NIMCache; the other NIMs inherit the global default (or render no block when no global is @@ -54,11 +55,11 @@ _README_MD = _REPO_ROOT / "nemo_retriever/helm/README.md" _CHART_DIR = _REPO_ROOT / "nemo_retriever/helm" -# The eight per-NIM keys that drive the NIMCache templates. The chart -# carries one ``templates/nims/.yaml`` per key, each of which -# wires `nemo-retriever.nimcache.modelBlock` under -# ``spec.source.ngc``. -_NIM_KEYS: tuple[str, ...] = ( +# Per-NIM keys whose NIMCache modelProfile values intentionally default +# to ``{}``, preserving pre-fix operator behaviour unless an operator +# opts into GPU/profile filtering. Super-49B is tracked separately because +# it ships with a pinned default profile for its bundled two-GPU NIM. +_EMPTY_MODEL_PROFILE_NIM_KEYS: tuple[str, ...] = ( "page_elements", "table_structure", "ocr", @@ -68,6 +69,9 @@ "nemotron_3_nano_omni_30b_a3b_reasoning", "audio", ) +_SUPER49B_NIM_KEY = "llama_3_3_nemotron_super_49b_v1_5" +_SUPER49B_DEFAULT_PROFILE = "1146f49f84dff5dea09f5aa633cc70b92d7d972223d67878c841cd0fbccad4fb" +_ALL_NIM_KEYS: tuple[str, ...] = _EMPTY_MODEL_PROFILE_NIM_KEYS + (_SUPER49B_NIM_KEY,) def _read_required_file(path: Path) -> str: @@ -77,7 +81,7 @@ def _read_required_file(path: Path) -> str: def _helm_template(extra_args: Sequence[str] = ()) -> subprocess.CompletedProcess[str]: - """Render the chart with every NIM opted in so all 8 NIMCaches appear.""" + """Render the chart with each default-empty-profile NIM opted in.""" helm = shutil.which("helm") if helm is None: raise SkipTest("`helm` binary not available in this environment.") @@ -92,8 +96,9 @@ def _helm_template(extra_args: Sequence[str] = ()) -> subprocess.CompletedProces "ngcImagePullSecret.create=false", "--set", "ngcApiSecret.create=false", - # Opt every NIM in so the test exercises all 8 NIMCaches in - # one render. Defaults are covered separately. + # Opt every default-empty-profile NIM in so this suite exercises + # their shared modelProfile contract in one render. Defaults are + # covered separately. "--set", "nimOperator.rerankqa.enabled=true", "--set", @@ -154,10 +159,10 @@ def test_values_exposes_chart_wide_default(self) -> None: ) def test_values_exposes_per_nim_override_for_every_nim(self) -> None: - """Each ``nimOperator.`` block must carry its own ``modelProfile: {}``.""" + """Each ``nimOperator.`` block must carry a modelProfile override.""" values = _read_required_file(_VALUES_YAML) loaded = yaml.safe_load(values) - for key in _NIM_KEYS: + for key in _ALL_NIM_KEYS: with self.subTest(nim=key): cfg = loaded["nimOperator"].get(key) self.assertIsNotNone( @@ -171,13 +176,23 @@ def test_values_exposes_per_nim_override_for_every_nim(self) -> None: "override key — anything else removes the documented " "per-NIM tuning surface.", ) - self.assertEqual( - cfg["modelProfile"], - {}, - f"nimOperator.{key}.modelProfile must default to `{{}}` " - "so the chart's behaviour is unchanged unless the " - "operator opts in.", - ) + if key in _EMPTY_MODEL_PROFILE_NIM_KEYS: + self.assertEqual( + cfg["modelProfile"], + {}, + f"nimOperator.{key}.modelProfile must default to `{{}}` " + "so the chart's behaviour is unchanged unless the " + "operator opts in.", + ) + + def test_super49b_exposes_intentional_default_model_profile(self) -> None: + """Super-49B pins its bundled two-GPU NIM profile by default.""" + values = _read_required_file(_VALUES_YAML) + loaded = yaml.safe_load(values) + self.assertEqual( + loaded["nimOperator"][_SUPER49B_NIM_KEY]["modelProfile"], + {"profiles": [_SUPER49B_DEFAULT_PROFILE]}, + ) # ------------------------------------------------------------------ # README — operator-facing documentation @@ -221,8 +236,9 @@ def test_default_render_emits_no_model_block(self) -> None: docs = _iter_nimcache_docs(proc.stdout) self.assertEqual( len(docs), - len(_NIM_KEYS), - f"Expected one NIMCache per opted-in NIM (={len(_NIM_KEYS)}); " f"got {len(docs)}.", + len(_EMPTY_MODEL_PROFILE_NIM_KEYS), + f"Expected one NIMCache per opted-in default-empty-profile NIM (={len(_EMPTY_MODEL_PROFILE_NIM_KEYS)}); " + f"got {len(docs)}.", ) for doc in docs: name = doc.get("metadata", {}).get("name", "") @@ -235,11 +251,13 @@ def test_default_render_emits_no_model_block(self) -> None: "— that breaks pre-fix release behaviour.", ) - def test_chart_wide_modelprofile_applies_to_every_nimcache(self) -> None: - """``--set nimOperator.modelProfile.gpus[0]...`` must render on every NIMCache. + def test_chart_wide_modelprofile_applies_to_default_empty_nimcaches(self) -> None: + """``--set nimOperator.modelProfile.gpus[0]...`` renders on default-empty NIMCaches. - This is the exact customer ask: one --set flag, every NIMCache - downloads only the H100 profile. + This is the exact customer ask for the existing extraction NIMs: + one --set flag makes each rendered cache download only the H100 + profile. Super-49B is covered separately because it intentionally + pins its bundled default profile. """ proc = _helm_template( extra_args=( @@ -251,7 +269,7 @@ def test_chart_wide_modelprofile_applies_to_every_nimcache(self) -> None: ) _assert_helm_ok(self, proc) docs = _iter_nimcache_docs(proc.stdout) - self.assertEqual(len(docs), len(_NIM_KEYS)) + self.assertEqual(len(docs), len(_EMPTY_MODEL_PROFILE_NIM_KEYS)) for doc in docs: name = doc.get("metadata", {}).get("name", "") with self.subTest(nimcache=name): @@ -296,7 +314,7 @@ def test_per_nim_override_replaces_chart_wide_default(self) -> None: "Per-NIM override must REPLACE the chart-wide default — the " "page-elements NIMCache must NOT carry the inherited gpus list.", ) - # Every other NIMCache should still carry the chart-wide gpus + # Every other rendered NIMCache should still carry the chart-wide gpus # filter. Spot-check one — the others are covered by the # previous test. ocr = docs["nemotron-ocr-v1"] @@ -380,7 +398,7 @@ def test_rendered_model_block_round_trips_through_yaml(self) -> None: _assert_helm_ok(self, proc) # If any document failed to parse, safe_load_all would raise. docs = _iter_nimcache_docs(proc.stdout) - self.assertEqual(len(docs), len(_NIM_KEYS)) + self.assertEqual(len(docs), len(_EMPTY_MODEL_PROFILE_NIM_KEYS)) for doc in docs: name = doc["metadata"]["name"] ngc = doc["spec"]["source"]["ngc"] @@ -412,12 +430,16 @@ def test_rendered_model_block_is_absent_when_chart_wide_filter_is_empty(self) -> # value is treated as truthy, the helper will render the block. proc = _helm_template(extra_args=()) _assert_helm_ok(self, proc) - self.assertNotIn( - " model:", - proc.stdout, - "Default render (no overrides) must not contain a `model:` " - "block at the NIMCache spec.source.ngc indentation level.", - ) + docs = _iter_nimcache_docs(proc.stdout) + self.assertEqual(len(docs), len(_EMPTY_MODEL_PROFILE_NIM_KEYS)) + for doc in docs: + name = doc["metadata"]["name"] + ngc = doc["spec"]["source"]["ngc"] + self.assertNotIn( + "model", + ngc, + f"Default render must not contain a NIMCache model block for `{name}`.", + ) if __name__ == "__main__": diff --git a/nemo_retriever/tests/test_helm_super49b_answer_generation.py b/nemo_retriever/tests/test_helm_super49b_answer_generation.py new file mode 100644 index 0000000000..a5f69789e7 --- /dev/null +++ b/nemo_retriever/tests/test_helm_super49b_answer_generation.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Helm wiring for Super-49B answer generation.""" + +from __future__ import annotations + +import shutil +import subprocess +from pathlib import Path +from typing import Sequence +from unittest import SkipTest, TestCase, main + + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_VALUES_YAML = _REPO_ROOT / "nemo_retriever/helm/values.yaml" +_CHART_DIR = _REPO_ROOT / "nemo_retriever/helm" + +_SUPER49B_KEY = " llama_3_3_nemotron_super_49b_v1_5:" +_SUPER49B_SERVICE = "llama-3-3-nemotron-super-49b-v1-5" +_SUPER49B_REPOSITORY = "nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5" +_SUPER49B_TAG = "2.0.5" +_SUPER49B_MODEL = "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" +_SUPER49B_PROFILE = "1146f49f84dff5dea09f5aa633cc70b92d7d972223d67878c841cd0fbccad4fb" + + +def _read_required_file(path: Path) -> str: + if not path.is_file(): + raise SkipTest(f"Required file not present in this test environment: {path}") + return path.read_text(encoding="utf-8") + + +def _helm_template(extra_args: Sequence[str] = ()) -> subprocess.CompletedProcess[str]: + helm = shutil.which("helm") + if helm is None: + raise SkipTest("`helm` binary not available in this environment.") + cmd = [ + helm, + "template", + "retriever", + str(_CHART_DIR), + "--set", + "ngcImagePullSecret.create=false", + "--set", + "ngcApiSecret.create=false", + "--api-versions", + "apps.nvidia.com/v1alpha1", + ] + cmd += list(extra_args) + return subprocess.run(cmd, check=False, capture_output=True, text=True) + + +def _assert_helm_ok(self: TestCase, proc: subprocess.CompletedProcess[str]) -> None: + self.assertEqual( + proc.returncode, + 0, + f"`helm template` failed:\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}", + ) + + +class HelmSuper49BAnswerGenerationTests(TestCase): + def test_values_define_super49b_as_optional_answer_nim(self) -> None: + values = _read_required_file(_VALUES_YAML) + + self.assertIn(_SUPER49B_KEY, values) + block = values[values.index(_SUPER49B_KEY) : values.index(_SUPER49B_KEY) + 1200] + self.assertIn("enabled: false", block) + self.assertIn(f"repository: {_SUPER49B_REPOSITORY}", block) + self.assertIn(f'tag: "{_SUPER49B_TAG}"', block) + self.assertIn("nvidia.com/gpu: 2", block) + self.assertIn('size: "250Gi"', block) + self.assertIn(_SUPER49B_PROFILE, block) + + def test_default_render_omits_super49b_and_disables_llm_answering(self) -> None: + proc = _helm_template() + _assert_helm_ok(self, proc) + + self.assertNotIn(f"name: {_SUPER49B_SERVICE}", proc.stdout) + self.assertIn("llm:", proc.stdout) + self.assertIn("enabled: false", proc.stdout) + self.assertIn("api_base: null", proc.stdout) + + def test_super49b_opt_in_renders_nim_and_autowires_llm_config(self) -> None: + proc = _helm_template(extra_args=("--set", "nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled=true")) + _assert_helm_ok(self, proc) + + self.assertIn(f"name: {_SUPER49B_SERVICE}", proc.stdout) + self.assertIn(f"repository: {_SUPER49B_REPOSITORY}", proc.stdout) + self.assertIn(f"tag: {_SUPER49B_TAG}", proc.stdout) + self.assertIn("nvidia.com/gpu: 2", proc.stdout) + self.assertIn("profiles:", proc.stdout) + self.assertIn(_SUPER49B_PROFILE, proc.stdout) + self.assertIn("NIM_PASSTHROUGH_ARGS", proc.stdout) + self.assertIn("--disable-custom-all-reduce", proc.stdout) + self.assertIn("NCCL_IB_DISABLE", proc.stdout) + self.assertIn("NCCL_P2P_DISABLE", proc.stdout) + self.assertIn('api_base: "http://llama-3-3-nemotron-super-49b-v1-5:8000/v1"', proc.stdout) + self.assertIn(f'model: "{_SUPER49B_MODEL}"', proc.stdout) + self.assertIn('rag_system_prompt_prefix: "/no_think"', proc.stdout) + self.assertIn("enabled: true", proc.stdout) + + def test_explicit_llm_api_base_wins_without_operator_nim(self) -> None: + proc = _helm_template( + extra_args=( + "--set", + "serviceConfig.llm.enabled=true", + "--set", + "serviceConfig.llm.apiBase=http://external-llm:8000/v1", + "--set", + "serviceConfig.llm.model=openai/custom-answerer", + ) + ) + _assert_helm_ok(self, proc) + + self.assertNotIn(f"name: {_SUPER49B_SERVICE}", proc.stdout) + self.assertIn('api_base: "http://external-llm:8000/v1"', proc.stdout) + self.assertIn('model: "openai/custom-answerer"', proc.stdout) + self.assertIn("rag_system_prompt_prefix: null", proc.stdout) + + def test_llm_api_key_secret_renders_env_not_configmap_value(self) -> None: + proc = _helm_template( + extra_args=( + "--set", + "serviceConfig.llm.enabled=true", + "--set", + "serviceConfig.llm.apiBase=http://external-llm:8000/v1", + "--set", + "serviceConfig.llm.apiKeySecret.name=llm-secret", + "--set", + "serviceConfig.llm.apiKeySecret.key=OPENAI_API_KEY", + ) + ) + _assert_helm_ok(self, proc) + + self.assertIn("api_key: null", proc.stdout) + self.assertIn("NEMO_RETRIEVER_LLM_API_KEY", proc.stdout) + self.assertIn('name: "llm-secret"', proc.stdout) + self.assertIn('key: "OPENAI_API_KEY"', proc.stdout) + self.assertNotIn('api_key: "', proc.stdout) + + +if __name__ == "__main__": + main() diff --git a/nemo_retriever/tests/test_live_rag.py b/nemo_retriever/tests/test_live_rag.py index af55c8b45b..5233125f10 100644 --- a/nemo_retriever/tests/test_live_rag.py +++ b/nemo_retriever/tests/test_live_rag.py @@ -626,6 +626,8 @@ def _build_fake_llm_client(*, top_p: float | None = None): extra_params={}, num_retries=3, timeout=120.0, + rag_system_prompt=None, + rag_system_prompt_prefix=None, ) sampling = SimpleNamespace(temperature=0.0, top_p=top_p, max_tokens=512) return SimpleNamespace(transport=transport, sampling=sampling) diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py index 657d6e8bd9..4eb8703bbf 100644 --- a/nemo_retriever/tests/test_llm_params.py +++ b/nemo_retriever/tests/test_llm_params.py @@ -223,6 +223,39 @@ def test_extra_params_merged_last(self, mock_completion): assert kwargs["num_retries"] == 99 +class TestLiteLLMRAGPrompt: + """RAG prompt customization for local OpenAI-compatible answer models.""" + + @patch("litellm.completion") + def test_generate_prepends_rag_system_prompt_prefix(self, mock_completion): + from nemo_retriever.llm.clients import LiteLLMClient + + mock_completion.return_value = _fake_litellm_response("answer") + client = LiteLLMClient.from_kwargs(model="m", rag_system_prompt_prefix="/no_think") + result = client.generate(query="q", chunks=["ctx"]) + + messages = mock_completion.call_args.kwargs["messages"] + assert messages[0]["role"] == "system" + assert messages[0]["content"].startswith("/no_think\n") + assert "precise question-answering assistant" in messages[0]["content"] + assert result.answer == "answer" + + @patch("litellm.completion") + def test_generate_uses_custom_rag_system_prompt(self, mock_completion): + from nemo_retriever.llm.clients import LiteLLMClient + + mock_completion.return_value = _fake_litellm_response("answer") + client = LiteLLMClient.from_kwargs( + model="m", + rag_system_prompt="Use only context.", + rag_system_prompt_prefix="/no_think", + ) + client.generate(query="q", chunks=["ctx"]) + + messages = mock_completion.call_args.kwargs["messages"] + assert messages[0]["content"] == "/no_think\nUse only context." + + class TestLLMJudgeConstruction: """LLMJudge should use the current Nemotron judge defaults and expose .model.""" diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py new file mode 100644 index 0000000000..ebde9245f6 --- /dev/null +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -0,0 +1,276 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Service-mode answer generation over the VectorDB query endpoint.""" + +from __future__ import annotations + +import json +from types import SimpleNamespace +from typing import Any +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient + +from nemo_retriever.llm.types import GenerationResult +from nemo_retriever.service.app import create_app +from nemo_retriever.service.config import LLMConfig, LoggingConfig, PipelinePoolConfig, ServiceConfig, VectorDbConfig + + +@pytest.fixture +def app_with_answer_config(monkeypatch: pytest.MonkeyPatch, tmp_path): + """Standalone service app with workers stubbed and LLM answering enabled.""" + + async def _stub_work(_item): + return 0, [] + + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_realtime_work_fn", + lambda _config: _stub_work, + ) + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_batch_work_fn", + lambda _config: _stub_work, + ) + + cfg = ServiceConfig( + mode="standalone", + logging=LoggingConfig(file=str(tmp_path / "service.log")), + pipeline=PipelinePoolConfig(realtime_workers=1, batch_workers=1), + vectordb=VectorDbConfig(enabled=True, vectordb_url="http://vectordb:7671"), + llm=LLMConfig( + enabled=True, + model="openai/nvidia/llama-3.3-nemotron-super-49b-v1.5", + api_base="http://llama-3-3-nemotron-super-49b-v1-5:8000/v1", + api_key="not-needed", + max_tokens=128, + timeout=180.0, + rag_system_prompt_prefix="/no_think", + ), + ) + app = create_app(cfg) + with TestClient(app) as client: + yield client + + +def test_answer_retrieves_from_vectordb_and_generates_with_configured_llm( + app_with_answer_config: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + requests: list[dict[str, Any]] = [] + + class _FakeResponse: + status_code = 200 + content = json.dumps( + { + "results": [ + { + "hits": [ + {"text": "Super-49B is the answer generator.", "source": "doc.pdf", "page_number": 1}, + {"text": "NRL queries LanceDB before generation.", "source": "doc.pdf", "page_number": 2}, + ] + } + ] + } + ).encode() + + def json(self) -> dict[str, Any]: + return json.loads(self.content.decode()) + + class _FakeAsyncClient: + def __init__(self, *args, **kwargs) -> None: + self.kwargs = kwargs + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + async def post(self, url: str, **kwargs) -> _FakeResponse: + requests.append({"url": url, **kwargs}) + return _FakeResponse() + + monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) + + fake_llm = SimpleNamespace( + generate=lambda query, chunks: GenerationResult( + answer=f"{query}: {len(chunks)} chunks", + latency_s=0.25, + model="openai/nvidia/llama-3.3-nemotron-super-49b-v1.5", + ) + ) + + with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs", return_value=fake_llm) as from_kwargs: + resp = app_with_answer_config.post( + "/v1/answer", + json={"query": "What generates answers?", "top_k": 2, "include_chunks": True}, + ) + + assert resp.status_code == 200, resp.text + body = resp.json() + assert body["query"] == "What generates answers?" + assert body["answer"] == "What generates answers?: 2 chunks" + assert body["chunk_count"] == 2 + assert body["chunks"] == ["Super-49B is the answer generator.", "NRL queries LanceDB before generation."] + assert body["metadata"] == [ + {"source": "doc.pdf", "page_number": 1}, + {"source": "doc.pdf", "page_number": 2}, + ] + + assert requests == [ + { + "url": "http://vectordb:7671/v1/query", + "json": {"query": "What generates answers?", "top_k": 2}, + } + ] + from_kwargs.assert_called_once_with( + model="openai/nvidia/llama-3.3-nemotron-super-49b-v1.5", + api_base="http://llama-3-3-nemotron-super-49b-v1-5:8000/v1", + api_key="not-needed", + temperature=0.0, + top_p=None, + max_tokens=128, + extra_params={}, + num_retries=3, + timeout=180.0, + rag_system_prompt=None, + rag_system_prompt_prefix="/no_think", + ) + + +def test_answer_returns_404_when_llm_disabled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + async def _stub_work(_item): + return 0, [] + + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_realtime_work_fn", + lambda _config: _stub_work, + ) + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_batch_work_fn", + lambda _config: _stub_work, + ) + + app = create_app( + ServiceConfig( + mode="standalone", + logging=LoggingConfig(file=str(tmp_path / "service.log")), + pipeline=PipelinePoolConfig(realtime_workers=1, batch_workers=1), + vectordb=VectorDbConfig(enabled=True, vectordb_url="http://vectordb:7671"), + llm=LLMConfig(enabled=False), + ) + ) + + with TestClient(app) as client: + resp = client.post("/v1/answer", json={"query": "q"}) + + assert resp.status_code == 404 + assert "LLM answer generation is not enabled" in resp.json()["detail"] + + +def test_answer_returns_404_when_vectordb_disabled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + async def _stub_work(_item): + return 0, [] + + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_realtime_work_fn", + lambda _config: _stub_work, + ) + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_batch_work_fn", + lambda _config: _stub_work, + ) + + app = create_app( + ServiceConfig( + mode="standalone", + logging=LoggingConfig(file=str(tmp_path / "service.log")), + pipeline=PipelinePoolConfig(realtime_workers=1, batch_workers=1), + vectordb=VectorDbConfig(enabled=False), + llm=LLMConfig(enabled=True), + ) + ) + + with TestClient(app) as client: + resp = client.post("/v1/answer", json={"query": "q"}) + + assert resp.status_code == 404 + assert "VectorDB is not enabled" in resp.json()["detail"] + + +def test_answer_returns_502_when_llm_generation_fails( + app_with_answer_config: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _FakeResponse: + status_code = 200 + content = json.dumps({"results": [{"hits": [{"text": "context"}]}]}).encode() + + def json(self) -> dict[str, Any]: + return json.loads(self.content.decode()) + + class _FakeAsyncClient: + def __init__(self, *args, **kwargs) -> None: + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + async def post(self, url: str, **kwargs) -> _FakeResponse: + return _FakeResponse() + + monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) + fake_llm = SimpleNamespace( + generate=lambda query, chunks: GenerationResult( + answer="", + latency_s=0.0, + model="openai/nvidia/llama-3.3-nemotron-super-49b-v1.5", + error="connection refused", + ) + ) + + with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs", return_value=fake_llm): + resp = app_with_answer_config.post("/v1/answer", json={"query": "q"}) + + assert resp.status_code == 502 + assert resp.json()["detail"] == "LLM answer generation failed: connection refused" + + +def test_service_start_reads_llm_api_key_from_env(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + from typer.testing import CliRunner + + from nemo_retriever.service.cli import app as service_cli_app + + config_path = tmp_path / "retriever-service.yaml" + config_path.write_text( + "mode: standalone\n" "llm:\n" " enabled: true\n" " api_base: http://external-llm:8000/v1\n", + encoding="utf-8", + ) + captured: dict[str, Any] = {} + application = object() + + def _fake_create_app(config: ServiceConfig) -> object: + captured["config"] = config + return application + + def _fake_uvicorn_run(app: object, *, host: str, port: int, log_level: str) -> None: + captured["uvicorn"] = {"app": app, "host": host, "port": port, "log_level": log_level} + + monkeypatch.setattr("nemo_retriever.service.app.create_app", _fake_create_app) + monkeypatch.setattr("uvicorn.run", _fake_uvicorn_run) + + result = CliRunner().invoke( + service_cli_app, + ["start", "--config", str(config_path)], + env={"NEMO_RETRIEVER_LLM_API_KEY": "secret-from-env"}, + ) + + assert result.exit_code == 0, result.output + assert captured["config"].llm.api_key == "secret-from-env" + assert captured["uvicorn"]["app"] is application diff --git a/nemo_retriever/tests/test_service_packaging.py b/nemo_retriever/tests/test_service_packaging.py new file mode 100644 index 0000000000..897c4fff06 --- /dev/null +++ b/nemo_retriever/tests/test_service_packaging.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import tomllib +from pathlib import Path + +from packaging.requirements import Requirement + + +PROJECT_ROOT = Path(__file__).resolve().parents[1] + + +def _extra_requirements(extra: str) -> list[Requirement]: + pyproject = tomllib.loads((PROJECT_ROOT / "pyproject.toml").read_text(encoding="utf-8")) + deps = pyproject["project"]["optional-dependencies"][extra] + return [Requirement(dep) for dep in deps] + + +def test_service_extra_includes_litellm_for_answer_generation() -> None: + requirements = _extra_requirements("service") + + litellm = next((req for req in requirements if req.name == "litellm"), None) + + assert litellm is not None + assert litellm.specifier.contains("1.86.0") diff --git a/nemo_retriever/uv.lock b/nemo_retriever/uv.lock index aa7ab1fb85..f6dbf1f008 100644 --- a/nemo_retriever/uv.lock +++ b/nemo_retriever/uv.lock @@ -2490,7 +2490,6 @@ all = [ { name = "nemotron-table-structure-v1" }, { name = "neo4j" }, { name = "nvidia-ml-py" }, - { name = "nvidia-riva-client" }, { name = "open-clip-torch" }, { name = "opencv-python-headless" }, { name = "psutil" }, @@ -2562,6 +2561,7 @@ service = [ { name = "easydict" }, { name = "glom" }, { name = "librosa" }, + { name = "litellm" }, { name = "psutil" }, { name = "scikit-learn" }, ] @@ -2606,7 +2606,8 @@ requires-dist = [ { name = "langgraph", marker = "extra == 'tabular'", specifier = ">=1.1.0a2" }, { name = "librosa", marker = "extra == 'multimedia'", specifier = ">=0.10.2" }, { name = "librosa", marker = "extra == 'service'", specifier = ">=0.10.2" }, - { name = "litellm", marker = "extra == 'llm'", specifier = ">=1.86.0rc1" }, + { name = "litellm", marker = "extra == 'llm'", specifier = ">=1.86.0" }, + { name = "litellm", marker = "extra == 'service'", specifier = ">=1.86.0" }, { name = "markitdown" }, { name = "nemo-retriever", extras = ["benchmarks", "llm", "local", "multimedia", "nemotron-parse", "service", "tabular"], marker = "extra == 'all'" }, { name = "nemotron-graphic-elements-v1", marker = "extra == 'local'", specifier = ">=0.dev0", index = "https://test.pypi.org/simple/" }, From 39c364e2d67ec277160acf63d3423c234ff1e580 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:37:24 +0000 Subject: [PATCH 02/14] Generalize Helm answer LLM NIM --- nemo_retriever/helm/README.md | 90 ++++++++++++++----- nemo_retriever/helm/templates/NOTES.txt | 4 +- nemo_retriever/helm/templates/_helpers.tpl | 1 + nemo_retriever/helm/templates/configmap.yaml | 16 ++-- nemo_retriever/helm/templates/deployment.yaml | 43 +++++---- ...on-super-49b-v1-5.yaml => answer-llm.yaml} | 6 +- nemo_retriever/helm/values.yaml | 61 ++++++++----- ....py => test_helm_answer_llm_generation.py} | 90 ++++++++++++++++--- .../tests/test_helm_nimcache_model_profile.py | 17 ++-- 9 files changed, 230 insertions(+), 98 deletions(-) rename nemo_retriever/helm/templates/nims/{llama-3-3-nemotron-super-49b-v1-5.yaml => answer-llm.yaml} (86%) rename nemo_retriever/tests/{test_helm_super49b_answer_generation.py => test_helm_answer_llm_generation.py} (53%) diff --git a/nemo_retriever/helm/README.md b/nemo_retriever/helm/README.md index 81f8d1a459..acfe0a33a3 100644 --- a/nemo_retriever/helm/README.md +++ b/nemo_retriever/helm/README.md @@ -211,12 +211,12 @@ helm install retriever ./nemo_retriever/helm \ --set ngcApiSecret.password=$NGC_API_KEY ``` -> The VL reranker (`rerankqa`), Nemotron Parse, the Nemotron 3 Nano Omni 30B caption NIM, the Super-49B answer-generation LLM, and the Parakeet `audio` ASR NIM are **all off by default** in 26.05 — they only reconcile when you explicitly opt in. Opt-in flags: +> The VL reranker (`rerankqa`), Nemotron Parse, the Nemotron 3 Nano Omni 30B caption NIM, the generic answer-generation LLM (`answer_llm`, Super-49B defaults), and the Parakeet `audio` ASR NIM are **all off by default** in 26.05 — they only reconcile when you explicitly opt in. Opt-in flags: > > * VL reranker — `--set nimOperator.rerankqa.enabled=true` > * Nemotron Parse — `--set nimOperator.nemotron_parse.enabled=true` > * Omni 30B captioner — `--set nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.enabled=true` -> * Super-49B answer generation — `--set nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled=true` +> * Answer generation LLM — `--set nimOperator.answer_llm.enabled=true` > * Parakeet ASR — `--set nimOperator.audio.enabled=true` (also set `serviceConfig.nimEndpoints.audioGrpcEndpoint=audio:50051` to wire ASR into the service, plus `service.installFfmpeg=true` if your image does not bundle ffmpeg) > > This matches the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md) and avoids silently pulling ≈ 62 GiB of Omni weights, loading a large two-GPU LLM, or claiming extra dedicated GPUs on a "default" install. See the [model hardware requirements](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#model-hardware-requirements) table for per-NIM GPU and disk costs. @@ -319,12 +319,12 @@ The retriever service picks up the in-cluster ASR endpoint when `nimOperator.aud | `serviceConfig.pipeline.batchWorkers` | `48` | Per-pod batch worker count. See [Timeouts and alleviating ingest failures](#timeouts-and-alleviating-ingest-failures) if embed or pool errors appear under load. | | `serviceConfig.nimEndpoints.*InvokeUrl` | `""` | Override the auto-resolved NIM Operator URL. Available knobs: `pageElementsInvokeUrl`, `tableStructureInvokeUrl`, `ocrInvokeUrl`, `embedInvokeUrl`, and `captionInvokeUrl` (see [Image captioning (Omni 30B)](#image-captioning-omni-30b)). | | `serviceConfig.nimEndpoints.captionModelName` | `""` | Model id sent to the remote VLM. Auto-set to `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning` whenever a caption URL is resolved. | -| `serviceConfig.llm.enabled` | `false` | Enables `POST /v1/answer`. Auto-flips to true when the Super-49B NIM is enabled and the operator URL resolves. | -| `serviceConfig.llm.apiBase` | `""` | OpenAI-compatible LLM base URL. Explicit value wins; otherwise Super-49B opt-in resolves to `http://llama-3-3-nemotron-super-49b-v1-5:8000/v1`. | -| `serviceConfig.llm.apiKeySecret.name` | `""` | Optional Secret name for external LLM credentials. When set, the service reads the key from `NEMO_RETRIEVER_LLM_API_KEY`; the key is not written to the ConfigMap. | -| `serviceConfig.llm.apiKeySecret.key` | `api_key` | Secret key for external LLM credentials. | -| `serviceConfig.llm.model` | `openai/nvidia/llama-3.3-nemotron-super-49b-v1.5` | Model id passed to LiteLLM for answer generation. | -| `serviceConfig.llm.ragSystemPromptPrefix` | `/no_think` | Prepended to the RAG system prompt for Super-49B. | +| `serviceConfig.llm.enabled` | `false` | Enables `POST /v1/answer`. Auto-flips to true when `nimOperator.answer_llm` is enabled and the operator URL resolves. | +| `serviceConfig.llm.apiBase` | `""` | OpenAI-compatible LLM base URL. Explicit value wins; otherwise `answer_llm` opt-in resolves to `http://answer-llm:8000/v1` by default. | +| `serviceConfig.llm.apiKeySecret.name` | `""` | Optional Secret name for external LLM credentials. Explicit values win; otherwise operator-managed `answer_llm` mounts its `authSecret` as `NEMO_RETRIEVER_LLM_API_KEY` so LiteLLM/OpenAI has a credential value without writing it to the ConfigMap. | +| `serviceConfig.llm.apiKeySecret.key` | `api_key` | Secret key for external LLM credentials. Operator-managed `answer_llm` uses `NGC_API_KEY` from `nimOperator.answer_llm.authSecret` when no explicit LLM Secret is set. | +| `serviceConfig.llm.model` | `""` | Optional explicit LiteLLM model id. Leave empty to inherit `nimOperator.answer_llm.model` when using the operator-managed answer LLM; set it for external endpoints. | +| `serviceConfig.llm.ragSystemPromptPrefix` | `""` | Optional explicit RAG prompt prefix. Leave empty to inherit `nimOperator.answer_llm.ragSystemPromptPrefix` when using the operator-managed answer LLM. | | `serviceConfig.vectordb.enabled` | `true` | Deploy the LanceDB vectordb Pod. When `true` the chart **requires** a resolvable embed endpoint (see [VectorDB and the embed endpoint](#vectordb-and-the-embed-endpoint)); `helm install` / `helm upgrade` fails fast otherwise. | | `serviceConfig.vectordb.lancedbUri` | `/data/vectordb` | LanceDB on the vectordb Pod's PVC. | | `serviceConfig.vectordb.embedModel` | `nvidia/llama-nemotron-embed-vl-1b-v2` | Passed to vectordb + worker `embed_model_name`. | @@ -362,38 +362,80 @@ sources](#3-install-with-the-nim-operator-in-cluster-nims)): `apps.nvidia.com/v1alpha1` CRDs are installed in the cluster. 3. Otherwise the chart fails the install. -#### Answer generation (Super-49B) { #answer-generation-super-49b } +#### Answer generation (operator-managed LLM) { #answer-generation-llm } -Enable the Super-49B NIM to add service-mode answer generation on top of -the VectorDB query path: +Enable the generic `answer_llm` NIM slot to add service-mode answer +generation on top of the VectorDB query path. The slot defaults to the +Super-49B NIM, but the image, model id, service name, resources, +profile filter, and environment can be overridden for another +OpenAI-compatible LLM NIM. ```bash helm upgrade --install retriever ./nemo_retriever/helm \ - --set nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled=true + --set nimOperator.answer_llm.enabled=true ``` -When the NIM Operator CRDs are present, the chart renders a -`llama-3-3-nemotron-super-49b-v1-5` NIMCache/NIMService and writes this -block into `retriever-service.yaml`: +When the NIM Operator CRDs are present, the chart renders an `answer-llm` +NIMCache/NIMService by default and writes this block into +`retriever-service.yaml`: ```yaml llm: enabled: true model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" - api_base: "http://llama-3-3-nemotron-super-49b-v1-5:8000/v1" + api_base: "http://answer-llm:8000/v1" rag_system_prompt_prefix: "/no_think" ``` The retriever service then exposes `POST /v1/answer`, which calls the VectorDB pod's `/v1/query` endpoint for context and sends those chunks to -the configured LLM endpoint. The default Super-49B NIMService resources request two GPUs -(`nvidia.com/gpu: 2`) to match the bundled tensor-parallel NIM -profile. Override `resources`, `modelProfile`, or `env` for deployments -that use a different profile or hardware topology. +the configured LLM endpoint. The default Super-49B NIMService resources +request two GPUs (`nvidia.com/gpu: 2`) to match the bundled +tensor-parallel NIM profile. Override `resources`, `modelProfile`, or +`env` for deployments that use a different profile or hardware topology. +When `answer_llm` is enabled and no explicit `serviceConfig.llm.apiKeySecret` +is set, the service also mounts `nimOperator.answer_llm.authSecret` as +`NEMO_RETRIEVER_LLM_API_KEY`; OpenAI-compatible clients require a +credential value even for in-cluster NIM endpoints, and the key is never +rendered into the ConfigMap. + +For example, to try Nemotron 3 Nano as the answer LLM on an A100 80GB +node, override the operator-managed slot instead of adding a second +hard-coded LLM service: + +```bash +helm upgrade --install retriever ./nemo_retriever/helm \ + --set nimOperator.answer_llm.enabled=true \ + --set nimOperator.answer_llm.nimServiceName=nemotron-3-nano \ + --set nimOperator.answer_llm.image.repository=nvcr.io/nim/nvidia/nemotron-3-nano \ + --set nimOperator.answer_llm.image.tag=1.7.0-variant \ + --set nimOperator.answer_llm.model=openai/nvidia/nemotron-3-nano-30b-a3b \ + --set nimOperator.answer_llm.ragSystemPromptPrefix= \ + --set-json nimOperator.answer_llm.modelProfile='{"profiles":["5f89f01a0af587fd8bae50c611b1f358f92effdb9fb29362e1af0a986e5561c3"]}' \ + --set-json nimOperator.answer_llm.resources='{"limits":{"nvidia.com/gpu":1},"requests":{"nvidia.com/gpu":1}}' \ + --set nimOperator.answer_llm.env[0].name=NIM_HTTP_API_PORT \ + --set-string nimOperator.answer_llm.env[0].value=8000 \ + --set nimOperator.answer_llm.env[1].name=NIM_SERVED_MODEL_NAME \ + --set-string nimOperator.answer_llm.env[1].value=nvidia/nemotron-3-nano-30b-a3b \ + --set nimOperator.answer_llm.env[2].name=NIM_TENSOR_PARALLEL_SIZE \ + --set-string nimOperator.answer_llm.env[2].value=1 +``` + +Use the repository and tag available in your NGC environment; staging +registries can use the same override shape with `nvstaging` image names +or tags. `nimOperator.answer_llm.model` is the LiteLLM model id used by +the retriever service; for an OpenAI-compatible in-cluster NIM, keep the +`openai/` prefix there and set `NIM_SERVED_MODEL_NAME` to the raw model +name advertised by the NIM. Replace the default Super-49B `modelProfile`, +`resources`, `env`, and prompt prefix when the target model requires a +different GPU/profile setup. Leaving `modelProfile` empty preserves NIM +Operator auto-discovery, but for Nano it can cache every advertised +profile on first reconciliation; pin a known-compatible profile when you +know the target GPU topology. `serviceConfig.llm.apiBase` and `serviceConfig.llm.model` can be set explicitly to point `/v1/answer` at an external OpenAI-compatible LLM -instead of deploying Super-49B in-cluster. For external credentials, +instead of deploying an answer LLM in-cluster. For external credentials, create a Kubernetes Secret and set `serviceConfig.llm.apiKeySecret.name` plus `serviceConfig.llm.apiKeySecret.key`; Helm mounts the Secret as an environment variable instead of writing the key into the ConfigMap. @@ -420,7 +462,9 @@ pair gated on three conditions ALL holding: | `nimOperator.rerankqa.enabled` | `false` | VL reranker NIM (optional; not auto-wired). Set `true` to opt in. Default `false` so 26.05 installs honor the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md) and do not silently provision an extra ≈ 3.1 GiB GPU NIM. The image points at the **VL** SKU (`llama-nemotron-rerank-vl-1b-v2`) per [prerequisites-support-matrix.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#default-helm-nims) — the text-only `llama-nemotron-rerank-1b-v2` silently degrades multimodal reranking and is not the documented POR. | | `nimOperator.nemotron_parse.enabled` | `false` | Structured-parse NIM (optional). Set `true` when using `extract_method="nemotron_parse"`. Default `false` so 26.05 installs honor the "optional and disabled by default" contract in [deployment-options.md](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/deployment-options.md). Image tag follows the [image tag conventions](#image-tag-conventions). | | `nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.enabled` | `false` | Omni 30B caption NIM (optional). Set `true` to enable image captioning — see [Image captioning (Omni 30B)](#image-captioning-omni-30b). Default `false` so 26.05 installs do not silently pull ≈ 62 GiB of BF16 weights or claim a second dedicated GPU. Image tag follows the [image tag conventions](#image-tag-conventions). | -| `nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled` | `false` | Super-49B answer-generation NIM (optional). Set `true` to enable `/v1/answer` — see [Answer generation (Super-49B)](#answer-generation-super-49b). Default `false` so installs do not silently claim two dedicated GPUs. | +| `nimOperator.answer_llm.enabled` | `false` | Generic answer-generation LLM NIM (optional; Super-49B defaults). Set `true` to enable `/v1/answer` — see [Answer generation (operator-managed LLM)](#answer-generation-llm). Default `false` so installs do not silently claim answer-generation GPUs. | +| `nimOperator.answer_llm.model` | `openai/nvidia/llama-3.3-nemotron-super-49b-v1.5` | LiteLLM/OpenAI model id inherited by `serviceConfig.llm.model` when the operator-managed answer LLM is enabled and no explicit service model is set. | +| `nimOperator.answer_llm.ragSystemPromptPrefix` | `/no_think` | Prompt prefix inherited by `serviceConfig.llm.ragSystemPromptPrefix` for the operator-managed answer LLM. Override to `""` for models that should receive the plain RAG prompt. | | `nimOperator.audio.enabled` | `false` | Parakeet ASR NIM (optional). Set `true` for audio/video transcription; pair with `serviceConfig.nimEndpoints.audioGrpcEndpoint=audio:50051` so the retriever-service can reach it. | | `nimOperator..image.repository` | `nvcr.io/nim/nvidia/...` | Per-NIM image. | | `nimOperator..image.pullSecrets` | `[ngc-secret]` | Referenced by the NIMService CR. | @@ -1150,7 +1194,7 @@ Verify tags on the Git branch or tag you ship (for example `26.05` or | VL reranker (optional) | `rerankqa` | `nvcr.io/nim/nvidia/llama-nemotron-rerank-vl-1b-v2:1.10.0` | | Nemotron Parse (optional) | `nemotron_parse` | `nvcr.io/nim/nvidia/nemotron-parse-v1.2:1.7.0-variant` | | Omni caption (optional) | `nemotron_3_nano_omni_30b_a3b_reasoning` | `nvcr.io/nim/nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:1.7.0-variant` | -| Super-49B answer LLM (optional) | `llama_3_3_nemotron_super_49b_v1_5` | `nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5:2.0.5` | +| Answer LLM (optional, Super-49B default) | `answer_llm` | `nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5:2.0.5` | | Parakeet ASR (optional) | `audio` | `nvcr.io/nim/nvidia/parakeet-1-1b-ctc-en-us:1.5.0` | GPU SKU support for `audio` is in [Model hardware requirements](https://github.com/NVIDIA/NeMo-Retriever/blob/main/docs/docs/extraction/prerequisites-support-matrix.md#model-hardware-requirements). diff --git a/nemo_retriever/helm/templates/NOTES.txt b/nemo_retriever/helm/templates/NOTES.txt index d2c1cec6d8..66f813a343 100644 --- a/nemo_retriever/helm/templates/NOTES.txt +++ b/nemo_retriever/helm/templates/NOTES.txt @@ -59,8 +59,8 @@ Services: {{- if .Values.nimOperator.vlm_embed.enabled }} - {{ .Values.nimOperator.vlm_embed.nimServiceName }} → http://{{ .Values.nimOperator.vlm_embed.nimServiceName }}:{{ .Values.nimOperator.vlm_embed.expose.service.port }}/v1/embeddings {{- end }} -{{- if .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled }} - - {{ .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.nimServiceName }} → http://{{ .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.nimServiceName }}:{{ .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.expose.service.port }}/v1/chat/completions +{{- if .Values.nimOperator.answer_llm.enabled }} + - {{ .Values.nimOperator.answer_llm.nimServiceName }} → http://{{ .Values.nimOperator.answer_llm.nimServiceName }}:{{ .Values.nimOperator.answer_llm.expose.service.port }}/v1/chat/completions (auto-wired into the retriever service config as `llm.api_base` for `/v1/answer`) {{- end }} {{- if .Values.nimOperator.rerankqa.enabled }} diff --git a/nemo_retriever/helm/templates/_helpers.tpl b/nemo_retriever/helm/templates/_helpers.tpl index 90098d5594..3875d3d854 100644 --- a/nemo_retriever/helm/templates/_helpers.tpl +++ b/nemo_retriever/helm/templates/_helpers.tpl @@ -251,6 +251,7 @@ Mapping (key -> Service name, default invokePath): ocr -> nemotron-ocr-v1 /v1/infer vlm_embed -> llama-nemotron-embed-vl-1b-v2 /v1/embeddings nemotron_3_nano_omni_30b_a3b_reasoning -> nemotron-3-nano-omni-30b-a3b-reasoning /v1/chat/completions + answer_llm -> Values.nimOperator.answer_llm.nimServiceName /v1 Audio ASR (Parakeet) is configured directly via serviceConfig.nimEndpoints.audioGrpcEndpoint (no NIM Operator auto-wire). diff --git a/nemo_retriever/helm/templates/configmap.yaml b/nemo_retriever/helm/templates/configmap.yaml index 435acf1942..7fd1083317 100644 --- a/nemo_retriever/helm/templates/configmap.yaml +++ b/nemo_retriever/helm/templates/configmap.yaml @@ -13,7 +13,7 @@ inherits the NIMService resource name, so the mapping is fixed: ocr -> nemotron-ocr-v1 /v1/infer vlm_embed -> llama-nemotron-embed-vl-1b-v2 /v1/embeddings nemotron_3_nano_omni_30b_a3b_reasoning -> nemotron-3-nano-omni-30b-a3b-reasoning /v1/chat/completions - llama_3_3_nemotron_super_49b_v1_5 -> llama-3-3-nemotron-super-49b-v1-5 /v1 + answer_llm -> Values.nimOperator.answer_llm.nimServiceName /v1 */}} {{- $ctx := . -}} {{- $pageElementsURL := include "nemo-retriever.nim.endpointURL" (dict "context" $ctx "key" "page_elements" "serviceName" "nemotron-page-elements-v3" "configKey" "pageElementsInvokeUrl" "invokePath" "/v1/infer") -}} @@ -38,13 +38,17 @@ inherits the NIMService resource name, so the mapping is fixed: {{- $llmAPIBase := $ctx.Values.serviceConfig.llm.apiBase | default "" -}} {{- $llmOperatorAPIBase := "" -}} {{- if not $llmAPIBase -}} -{{- $llmOperatorAPIBase = include "nemo-retriever.nimOperator.url" (dict "context" $ctx "key" "llama_3_3_nemotron_super_49b_v1_5" "serviceName" $ctx.Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.nimServiceName "invokePath" "/v1") -}} +{{- $llmOperatorAPIBase = include "nemo-retriever.nimOperator.url" (dict "context" $ctx "key" "answer_llm" "serviceName" $ctx.Values.nimOperator.answer_llm.nimServiceName "invokePath" "/v1") -}} {{- $llmAPIBase = $llmOperatorAPIBase -}} {{- end -}} +{{- $llmModel := $ctx.Values.serviceConfig.llm.model | default "" -}} +{{- if and (not $llmModel) $llmOperatorAPIBase -}} +{{- $llmModel = $ctx.Values.nimOperator.answer_llm.model | default "" -}} +{{- end -}} {{- $llmEnabled := or $ctx.Values.serviceConfig.llm.enabled $llmAPIBase -}} {{- $llmRagSystemPromptPrefix := $ctx.Values.serviceConfig.llm.ragSystemPromptPrefix | default "" -}} {{- if and (not $llmRagSystemPromptPrefix) $llmOperatorAPIBase -}} -{{- $llmRagSystemPromptPrefix = "/no_think" -}} +{{- $llmRagSystemPromptPrefix = $ctx.Values.nimOperator.answer_llm.ragSystemPromptPrefix | default "" -}} {{- end -}} {{- define "nemo-retriever.configBody" -}} @@ -70,7 +74,7 @@ nim_endpoints: llm: enabled: {{ if .llmEnabled }}true{{ else }}false{{ end }} - model: {{ .Values.serviceConfig.llm.model | quote }} + model: {{ .llmModel | quote }} api_base: {{ if .llmAPIBase }}{{ .llmAPIBase | quote }}{{ else }}null{{ end }} api_key: null temperature: {{ .Values.serviceConfig.llm.temperature }} @@ -126,7 +130,7 @@ metadata: data: retriever-service.yaml: | mode: standalone -{{ include "nemo-retriever.configBody" (dict "Values" .Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "llmEnabled" $llmEnabled "llmAPIBase" $llmAPIBase "llmRagSystemPromptPrefix" $llmRagSystemPromptPrefix "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} +{{ include "nemo-retriever.configBody" (dict "Values" .Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "llmEnabled" $llmEnabled "llmModel" $llmModel "llmAPIBase" $llmAPIBase "llmRagSystemPromptPrefix" $llmRagSystemPromptPrefix "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} {{- else }} # ========================================================================= # Split mode — one ConfigMap per role with the appropriate mode + gateway @@ -154,6 +158,6 @@ data: timeout_s: 300.0 max_connections: 100 {{- end }} -{{ include "nemo-retriever.configBody" (dict "Values" $.Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "llmEnabled" $llmEnabled "llmAPIBase" $llmAPIBase "llmRagSystemPromptPrefix" $llmRagSystemPromptPrefix "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} +{{ include "nemo-retriever.configBody" (dict "Values" $.Values "pageElementsURL" $pageElementsURL "tableStructureURL" $tableStructureURL "ocrURL" $ocrURL "embedURL" $embedURL "captionURL" $captionURL "captionModelName" $captionModelName "audioGrpcEndpoint" $audioGrpcEndpoint "llmEnabled" $llmEnabled "llmModel" $llmModel "llmAPIBase" $llmAPIBase "llmRagSystemPromptPrefix" $llmRagSystemPromptPrefix "vectordbSvc" $vectordbSvc "vectordbPort" $vectordbPort) | indent 4 }} {{- end }} {{- end }} diff --git a/nemo_retriever/helm/templates/deployment.yaml b/nemo_retriever/helm/templates/deployment.yaml index 85e6152a92..e4e4932697 100644 --- a/nemo_retriever/helm/templates/deployment.yaml +++ b/nemo_retriever/helm/templates/deployment.yaml @@ -1,3 +1,22 @@ +{{- define "nemo-retriever.llmApiKeyEnv" -}} +{{- $root := . -}} +{{- $secret := $root.Values.serviceConfig.llm.apiKeySecret -}} +{{- if $secret.name -}} +- name: NEMO_RETRIEVER_LLM_API_KEY + valueFrom: + secretKeyRef: + name: {{ $secret.name | quote }} + key: {{ $secret.key | default "api_key" | quote }} + optional: {{ $secret.optional | default false }} +{{- else if and $root.Values.nims.enabled $root.Values.nimOperator.answer_llm.enabled -}} +- name: NEMO_RETRIEVER_LLM_API_KEY + valueFrom: + secretKeyRef: + name: {{ $root.Values.nimOperator.answer_llm.authSecret | quote }} + key: "NGC_API_KEY" + optional: false +{{- end -}} +{{- end -}} {{- $svc := .Values.service -}} {{- if and $svc.installFfmpeg $svc.env -}} {{- range $env := $svc.env }} @@ -97,15 +116,9 @@ spec: name: {{ .Values.ngcApiSecret.name }} key: NGC_API_KEY optional: true - {{- with .Values.serviceConfig.llm.apiKeySecret }} - {{- if .name }} - - name: NEMO_RETRIEVER_LLM_API_KEY - valueFrom: - secretKeyRef: - name: {{ .name | quote }} - key: {{ .key | default "api_key" | quote }} - optional: {{ .optional | default false }} - {{- end }} + {{- $llmApiKeyEnv := include "nemo-retriever.llmApiKeyEnv" . }} + {{- if $llmApiKeyEnv }} +{{ $llmApiKeyEnv | nindent 12 }} {{- end }} {{- if $svc.installFfmpeg }} - name: INSTALL_FFMPEG @@ -271,15 +284,9 @@ spec: name: {{ $.Values.ngcApiSecret.name }} key: NGC_API_KEY optional: true - {{- with $.Values.serviceConfig.llm.apiKeySecret }} - {{- if .name }} - - name: NEMO_RETRIEVER_LLM_API_KEY - valueFrom: - secretKeyRef: - name: {{ .name | quote }} - key: {{ .key | default "api_key" | quote }} - optional: {{ .optional | default false }} - {{- end }} + {{- $llmApiKeyEnv := include "nemo-retriever.llmApiKeyEnv" $ }} + {{- if $llmApiKeyEnv }} +{{ $llmApiKeyEnv | nindent 12 }} {{- end }} {{- if $svc.installFfmpeg }} - name: INSTALL_FFMPEG diff --git a/nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml b/nemo_retriever/helm/templates/nims/answer-llm.yaml similarity index 86% rename from nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml rename to nemo_retriever/helm/templates/nims/answer-llm.yaml index 53f9660e38..5df26b12a0 100644 --- a/nemo_retriever/helm/templates/nims/llama-3-3-nemotron-super-49b-v1-5.yaml +++ b/nemo_retriever/helm/templates/nims/answer-llm.yaml @@ -1,5 +1,5 @@ -{{- if and (and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") .Values.nims.enabled) (eq .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled true) -}} -{{- $cfg := .Values.nimOperator.llama_3_3_nemotron_super_49b_v1_5 -}} +{{- if and (and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") .Values.nims.enabled) (eq .Values.nimOperator.answer_llm.enabled true) -}} +{{- $cfg := .Values.nimOperator.answer_llm -}} {{- $name := $cfg.nimServiceName -}} apiVersion: apps.nvidia.com/v1alpha1 kind: NIMCache @@ -12,7 +12,7 @@ spec: modelPuller: "{{ $cfg.image.repository }}:{{ $cfg.image.tag }}" pullSecret: "{{ index $cfg.image.pullSecrets 0 }}" authSecret: {{ $cfg.authSecret }} - {{- include "nemo-retriever.nimcache.modelBlock" (dict "context" $ "key" "llama_3_3_nemotron_super_49b_v1_5") | nindent 6 }} + {{- include "nemo-retriever.nimcache.modelBlock" (dict "context" $ "key" "answer_llm") | nindent 6 }} storage: pvc: create: {{ $cfg.storage.pvc.create }} diff --git a/nemo_retriever/helm/values.yaml b/nemo_retriever/helm/values.yaml index 533a8dc697..5083ad66eb 100644 --- a/nemo_retriever/helm/values.yaml +++ b/nemo_retriever/helm/values.yaml @@ -490,19 +490,25 @@ serviceConfig: # Required for audio/video ingestion in service mode (without torch). audioGrpcEndpoint: "" - # Remote LLM endpoint used by POST /v1/answer. When the Super-49B - # NIM is enabled under nimOperator.llama_3_3_nemotron_super_49b_v1_5 - # (and the NIM Operator CRDs exist), the chart auto-wires apiBase to - # the in-cluster OpenAI-compatible /v1 endpoint and flips answering on. + # Remote LLM endpoint used by POST /v1/answer. When the generic + # `nimOperator.answer_llm.enabled` block is enabled (and the NIM + # Operator CRDs exist), the chart auto-wires apiBase to that + # in-cluster OpenAI-compatible /v1 endpoint and flips answering on. # Explicit apiBase/model values below always win, which lets operators # point /v1/answer at an external OpenAI-compatible endpoint instead. llm: enabled: false - model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" + # Optional explicit LiteLLM/OpenAI model id. Leave empty to inherit + # `nimOperator.answer_llm.model` when the operator-managed answer LLM + # is enabled; set this when using an external LLM endpoint. + model: "" apiBase: "" # Optional Secret reference for external LLM credentials. The Secret is # mounted as NEMO_RETRIEVER_LLM_API_KEY and read by the service CLI, so - # the key is not rendered into the retriever-service ConfigMap. + # the key is not rendered into the retriever-service ConfigMap. When this + # is empty and nimOperator.answer_llm is enabled, the service mounts that + # NIM's authSecret instead because LiteLLM/OpenAI-compatible clients + # require a credential value even for local NIM endpoints. apiKeySecret: name: "" key: api_key @@ -516,7 +522,7 @@ serviceConfig: ragSystemPrompt: "" # Optional prefix prepended to the RAG system prompt. Leave empty # for external LLMs; the chart auto-wires /no_think when it - # resolves the in-cluster Super-49B NIM endpoint. + # resolves the operator-managed answer LLM endpoint. ragSystemPromptPrefix: "" # Pipeline worker pools. Workers are abstract dispatchers — sizing @@ -834,8 +840,8 @@ nimOperator: # - nemotron_parse (Parse v1.2, ≈ 3.5 GiB GPU) # - nemotron_3_nano_omni_30b_a3b_reasoning (Omni 30B caption NIM, # ≈ 62 GiB BF16 weights) - # - llama_3_3_nemotron_super_49b_v1_5 (Super-49B answer LLM, - # 2-GPU tensor-parallel NIM) + # - answer_llm (OpenAI-compatible answer LLM, + # Super-49B defaults) # - audio (Parakeet ASR) # # Two flags gate every NIM, so each one is opt-in along two independent @@ -1029,22 +1035,29 @@ nimOperator: # value: throughput # --------------------------------------------------------------------------- - # Optional / advanced NIMs. Enabled by default alongside the core four; - # the retriever-service does not auto-wire these into its config, so - # integrate them manually via your own pipeline configuration. Set - # `nimOperator..enabled: false` to skip one without disabling the - # whole sub-stack. + # Optional / advanced NIMs. Disabled by default unless noted below; + # opt in to the ones your pipeline or service endpoints actually use. # --------------------------------------------------------------------------- - # Llama 3.3 Nemotron Super 49B v1.5. Optional answer-generation LLM. + # Generic OpenAI-compatible answer-generation LLM NIM. # - # Disabled by default because it is much larger than the core extraction - # NIMs. The default resources request two GPUs and only auto-wire - # /v1/answer after explicit opt-in. - llama_3_3_nemotron_super_49b_v1_5: + # Disabled by default because answer LLMs are larger than the core + # extraction NIMs. The default image/model is Super-49B and requests + # two GPUs. Override `image.repository`, `image.tag`, `model`, + # `nimServiceName`, `resources`, `modelProfile`, and `env` to swap in + # another compatible LLM NIM such as Nemotron 3 Nano. + answer_llm: enabled: false # NIMService / NIMCache resource name and in-cluster Service DNS label. - nimServiceName: llama-3-3-nemotron-super-49b-v1-5 + nimServiceName: answer-llm + # LiteLLM/OpenAI model id rendered into retriever-service.yaml when + # this operator-managed LLM is enabled and serviceConfig.llm.model is + # left empty. + model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" + # Prompt prefix to apply when this operator-managed LLM auto-wires + # /v1/answer. Super-49B uses /no_think by default; override to "" + # for models that should receive the plain RAG system prompt. + ragSystemPromptPrefix: "/no_think" image: repository: nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5 tag: "2.0.5" @@ -1055,8 +1068,8 @@ nimOperator: # Per-NIM NIMCache GPU/profile filter. See `nimOperator.modelProfile` # above for the shape; non-empty REPLACES the chart-wide default. # Cache only the default BF16 2-GPU profile used by the Super-49B - # service resources. Override this when deploying a different profile - # or GPU topology. + # default resources. Override this when deploying a different model, + # profile, or GPU topology. modelProfile: profiles: - "1146f49f84dff5dea09f5aa633cc70b92d7d972223d67878c841cd0fbccad4fb" @@ -1067,9 +1080,9 @@ nimOperator: size: "250Gi" volumeAccessMode: ReadWriteOnce replicas: 1 - # Super-49B uses a 2-GPU tensor-parallel profile by default. This + # The Super-49B default uses a 2-GPU tensor-parallel profile. This # non-empty resources block intentionally replaces - # nimOperator.nimServiceGpuLimit. + # nimOperator.nimServiceGpuLimit. Override for smaller LLM NIMs. resources: limits: nvidia.com/gpu: 2 diff --git a/nemo_retriever/tests/test_helm_super49b_answer_generation.py b/nemo_retriever/tests/test_helm_answer_llm_generation.py similarity index 53% rename from nemo_retriever/tests/test_helm_super49b_answer_generation.py rename to nemo_retriever/tests/test_helm_answer_llm_generation.py index a5f69789e7..2e61c777e9 100644 --- a/nemo_retriever/tests/test_helm_super49b_answer_generation.py +++ b/nemo_retriever/tests/test_helm_answer_llm_generation.py @@ -2,7 +2,7 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Helm wiring for Super-49B answer generation.""" +"""Helm wiring for operator-managed answer-generation LLM NIMs.""" from __future__ import annotations @@ -17,12 +17,18 @@ _VALUES_YAML = _REPO_ROOT / "nemo_retriever/helm/values.yaml" _CHART_DIR = _REPO_ROOT / "nemo_retriever/helm" -_SUPER49B_KEY = " llama_3_3_nemotron_super_49b_v1_5:" -_SUPER49B_SERVICE = "llama-3-3-nemotron-super-49b-v1-5" +_ANSWER_LLM_KEY = " answer_llm:" +_ANSWER_LLM_SERVICE = "answer-llm" _SUPER49B_REPOSITORY = "nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5" _SUPER49B_TAG = "2.0.5" _SUPER49B_MODEL = "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" _SUPER49B_PROFILE = "1146f49f84dff5dea09f5aa633cc70b92d7d972223d67878c841cd0fbccad4fb" +_NANO_SERVICE = "nemotron-3-nano" +_NANO_REPOSITORY = "nvcr.io/nim/nvidia/nemotron-3-nano" +_NANO_TAG = "1.7.0-variant" +_NANO_MODEL = "openai/nvidia/nemotron-3-nano-30b-a3b" +_NANO_SERVED_MODEL = "nvidia/nemotron-3-nano-30b-a3b" +_NANO_A100_PROFILE = "5f89f01a0af587fd8bae50c611b1f358f92effdb9fb29362e1af0a986e5561c3" def _read_required_file(path: Path) -> str: @@ -59,33 +65,37 @@ def _assert_helm_ok(self: TestCase, proc: subprocess.CompletedProcess[str]) -> N ) -class HelmSuper49BAnswerGenerationTests(TestCase): - def test_values_define_super49b_as_optional_answer_nim(self) -> None: +class HelmAnswerLLMGenerationTests(TestCase): + def test_values_define_generic_answer_llm_with_super49b_defaults(self) -> None: values = _read_required_file(_VALUES_YAML) - self.assertIn(_SUPER49B_KEY, values) - block = values[values.index(_SUPER49B_KEY) : values.index(_SUPER49B_KEY) + 1200] + self.assertIn(_ANSWER_LLM_KEY, values) + block = values[values.index(_ANSWER_LLM_KEY) : values.index(_ANSWER_LLM_KEY) + 2200] self.assertIn("enabled: false", block) + self.assertIn(f"nimServiceName: {_ANSWER_LLM_SERVICE}", block) self.assertIn(f"repository: {_SUPER49B_REPOSITORY}", block) self.assertIn(f'tag: "{_SUPER49B_TAG}"', block) + self.assertIn(f'model: "{_SUPER49B_MODEL}"', block) self.assertIn("nvidia.com/gpu: 2", block) self.assertIn('size: "250Gi"', block) self.assertIn(_SUPER49B_PROFILE, block) + self.assertIn('ragSystemPromptPrefix: "/no_think"', block) - def test_default_render_omits_super49b_and_disables_llm_answering(self) -> None: + def test_default_render_omits_answer_llm_and_disables_llm_answering(self) -> None: proc = _helm_template() _assert_helm_ok(self, proc) - self.assertNotIn(f"name: {_SUPER49B_SERVICE}", proc.stdout) + self.assertNotIn(f"name: {_ANSWER_LLM_SERVICE}", proc.stdout) self.assertIn("llm:", proc.stdout) self.assertIn("enabled: false", proc.stdout) self.assertIn("api_base: null", proc.stdout) + self.assertNotIn("NEMO_RETRIEVER_LLM_API_KEY", proc.stdout) - def test_super49b_opt_in_renders_nim_and_autowires_llm_config(self) -> None: - proc = _helm_template(extra_args=("--set", "nimOperator.llama_3_3_nemotron_super_49b_v1_5.enabled=true")) + def test_answer_llm_opt_in_renders_default_super49b_nim_and_autowires_llm_config(self) -> None: + proc = _helm_template(extra_args=("--set", "nimOperator.answer_llm.enabled=true")) _assert_helm_ok(self, proc) - self.assertIn(f"name: {_SUPER49B_SERVICE}", proc.stdout) + self.assertIn(f"name: {_ANSWER_LLM_SERVICE}", proc.stdout) self.assertIn(f"repository: {_SUPER49B_REPOSITORY}", proc.stdout) self.assertIn(f"tag: {_SUPER49B_TAG}", proc.stdout) self.assertIn("nvidia.com/gpu: 2", proc.stdout) @@ -95,10 +105,62 @@ def test_super49b_opt_in_renders_nim_and_autowires_llm_config(self) -> None: self.assertIn("--disable-custom-all-reduce", proc.stdout) self.assertIn("NCCL_IB_DISABLE", proc.stdout) self.assertIn("NCCL_P2P_DISABLE", proc.stdout) - self.assertIn('api_base: "http://llama-3-3-nemotron-super-49b-v1-5:8000/v1"', proc.stdout) + self.assertIn(f'api_base: "http://{_ANSWER_LLM_SERVICE}:8000/v1"', proc.stdout) self.assertIn(f'model: "{_SUPER49B_MODEL}"', proc.stdout) self.assertIn('rag_system_prompt_prefix: "/no_think"', proc.stdout) self.assertIn("enabled: true", proc.stdout) + self.assertIn("NEMO_RETRIEVER_LLM_API_KEY", proc.stdout) + self.assertIn('name: "ngc-api"', proc.stdout) + self.assertIn('key: "NGC_API_KEY"', proc.stdout) + + def test_answer_llm_can_swap_to_nano_image_model_and_profile(self) -> None: + proc = _helm_template( + extra_args=( + "--set", + "nimOperator.answer_llm.enabled=true", + "--set", + f"nimOperator.answer_llm.nimServiceName={_NANO_SERVICE}", + "--set", + f"nimOperator.answer_llm.image.repository={_NANO_REPOSITORY}", + "--set", + f"nimOperator.answer_llm.image.tag={_NANO_TAG}", + "--set", + f"nimOperator.answer_llm.model={_NANO_MODEL}", + "--set", + "nimOperator.answer_llm.ragSystemPromptPrefix=", + "--set-json", + f'nimOperator.answer_llm.modelProfile={{"profiles":["{_NANO_A100_PROFILE}"]}}', + "--set-json", + 'nimOperator.answer_llm.resources={"limits":{"nvidia.com/gpu":1},"requests":{"nvidia.com/gpu":1}}', + "--set", + "nimOperator.answer_llm.env[0].name=NIM_HTTP_API_PORT", + "--set-string", + "nimOperator.answer_llm.env[0].value=8000", + "--set", + "nimOperator.answer_llm.env[1].name=NIM_SERVED_MODEL_NAME", + "--set-string", + f"nimOperator.answer_llm.env[1].value={_NANO_SERVED_MODEL}", + "--set", + "nimOperator.answer_llm.env[2].name=NIM_TENSOR_PARALLEL_SIZE", + "--set-string", + "nimOperator.answer_llm.env[2].value=1", + ) + ) + _assert_helm_ok(self, proc) + + self.assertIn(f"name: {_NANO_SERVICE}", proc.stdout) + self.assertIn(f"repository: {_NANO_REPOSITORY}", proc.stdout) + self.assertIn(f"tag: {_NANO_TAG}", proc.stdout) + self.assertIn("NIM_SERVED_MODEL_NAME", proc.stdout) + self.assertIn(_NANO_SERVED_MODEL, proc.stdout) + self.assertIn("NIM_TENSOR_PARALLEL_SIZE", proc.stdout) + self.assertIn(_NANO_A100_PROFILE, proc.stdout) + self.assertIn("nvidia.com/gpu: 1", proc.stdout) + self.assertNotIn(_SUPER49B_PROFILE, proc.stdout) + self.assertIn(f'api_base: "http://{_NANO_SERVICE}:8000/v1"', proc.stdout) + self.assertIn(f'model: "{_NANO_MODEL}"', proc.stdout) + self.assertIn("rag_system_prompt_prefix: null", proc.stdout) + self.assertIn("enabled: true", proc.stdout) def test_explicit_llm_api_base_wins_without_operator_nim(self) -> None: proc = _helm_template( @@ -113,7 +175,7 @@ def test_explicit_llm_api_base_wins_without_operator_nim(self) -> None: ) _assert_helm_ok(self, proc) - self.assertNotIn(f"name: {_SUPER49B_SERVICE}", proc.stdout) + self.assertNotIn(f"name: {_ANSWER_LLM_SERVICE}", proc.stdout) self.assertIn('api_base: "http://external-llm:8000/v1"', proc.stdout) self.assertIn('model: "openai/custom-answerer"', proc.stdout) self.assertIn("rag_system_prompt_prefix: null", proc.stdout) diff --git a/nemo_retriever/tests/test_helm_nimcache_model_profile.py b/nemo_retriever/tests/test_helm_nimcache_model_profile.py index 119698013b..f6414cc252 100644 --- a/nemo_retriever/tests/test_helm_nimcache_model_profile.py +++ b/nemo_retriever/tests/test_helm_nimcache_model_profile.py @@ -19,8 +19,9 @@ * ``values.yaml`` carries a chart-wide ``nimOperator.modelProfile`` default plus a per-NIM ``nimOperator..modelProfile`` override for every NIMCache the chart provisions. Existing extraction NIMs - default their per-NIM override to ``{}``; Super-49B is the intentional - exception because it pins the bundled two-GPU profile by default. + default their per-NIM override to ``{}``; ``answer_llm`` is the + intentional exception because its Super-49B default pins the bundled + two-GPU profile by default. * A ``helm template`` with **no overrides** renders no ``model:`` block on default-empty-profile NIMCaches (preserves operator default). * ``--set nimOperator.modelProfile.gpus[0]...`` renders an identical @@ -57,8 +58,8 @@ # Per-NIM keys whose NIMCache modelProfile values intentionally default # to ``{}``, preserving pre-fix operator behaviour unless an operator -# opts into GPU/profile filtering. Super-49B is tracked separately because -# it ships with a pinned default profile for its bundled two-GPU NIM. +# opts into GPU/profile filtering. answer_llm is tracked separately because +# its Super-49B default ships with a pinned profile for the bundled two-GPU NIM. _EMPTY_MODEL_PROFILE_NIM_KEYS: tuple[str, ...] = ( "page_elements", "table_structure", @@ -69,9 +70,9 @@ "nemotron_3_nano_omni_30b_a3b_reasoning", "audio", ) -_SUPER49B_NIM_KEY = "llama_3_3_nemotron_super_49b_v1_5" +_ANSWER_LLM_NIM_KEY = "answer_llm" _SUPER49B_DEFAULT_PROFILE = "1146f49f84dff5dea09f5aa633cc70b92d7d972223d67878c841cd0fbccad4fb" -_ALL_NIM_KEYS: tuple[str, ...] = _EMPTY_MODEL_PROFILE_NIM_KEYS + (_SUPER49B_NIM_KEY,) +_ALL_NIM_KEYS: tuple[str, ...] = _EMPTY_MODEL_PROFILE_NIM_KEYS + (_ANSWER_LLM_NIM_KEY,) def _read_required_file(path: Path) -> str: @@ -185,12 +186,12 @@ def test_values_exposes_per_nim_override_for_every_nim(self) -> None: "operator opts in.", ) - def test_super49b_exposes_intentional_default_model_profile(self) -> None: + def test_answer_llm_exposes_intentional_default_super49b_model_profile(self) -> None: """Super-49B pins its bundled two-GPU NIM profile by default.""" values = _read_required_file(_VALUES_YAML) loaded = yaml.safe_load(values) self.assertEqual( - loaded["nimOperator"][_SUPER49B_NIM_KEY]["modelProfile"], + loaded["nimOperator"][_ANSWER_LLM_NIM_KEY]["modelProfile"], {"profiles": [_SUPER49B_DEFAULT_PROFILE]}, ) From 150c0b8b4454b5ec86c130eb46817ecc99eeb77a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 17:13:26 +0000 Subject: [PATCH 03/14] Add request-level answer reasoning toggle --- nemo_retriever/helm/README.md | 23 +++++--- nemo_retriever/helm/templates/configmap.yaml | 1 + nemo_retriever/helm/values.yaml | 17 ++++-- .../src/nemo_retriever/llm/clients/litellm.py | 56 +++++++++++++++++-- .../src/nemo_retriever/params/models.py | 1 + .../src/nemo_retriever/service/config.py | 1 + .../service/retriever-service.yaml | 1 + .../nemo_retriever/service/routers/ingest.py | 9 ++- .../tests/test_helm_answer_llm_generation.py | 24 ++++++-- nemo_retriever/tests/test_llm_params.py | 42 +++++++++++++- .../tests/test_service_answer_generation.py | 49 ++++++++++++++-- 11 files changed, 194 insertions(+), 30 deletions(-) diff --git a/nemo_retriever/helm/README.md b/nemo_retriever/helm/README.md index acfe0a33a3..f86e4dfe90 100644 --- a/nemo_retriever/helm/README.md +++ b/nemo_retriever/helm/README.md @@ -324,7 +324,8 @@ The retriever service picks up the in-cluster ASR endpoint when `nimOperator.aud | `serviceConfig.llm.apiKeySecret.name` | `""` | Optional Secret name for external LLM credentials. Explicit values win; otherwise operator-managed `answer_llm` mounts its `authSecret` as `NEMO_RETRIEVER_LLM_API_KEY` so LiteLLM/OpenAI has a credential value without writing it to the ConfigMap. | | `serviceConfig.llm.apiKeySecret.key` | `api_key` | Secret key for external LLM credentials. Operator-managed `answer_llm` uses `NGC_API_KEY` from `nimOperator.answer_llm.authSecret` when no explicit LLM Secret is set. | | `serviceConfig.llm.model` | `""` | Optional explicit LiteLLM model id. Leave empty to inherit `nimOperator.answer_llm.model` when using the operator-managed answer LLM; set it for external endpoints. | -| `serviceConfig.llm.ragSystemPromptPrefix` | `""` | Optional explicit RAG prompt prefix. Leave empty to inherit `nimOperator.answer_llm.ragSystemPromptPrefix` when using the operator-managed answer LLM. | +| `serviceConfig.llm.ragSystemPromptPrefix` | `""` | Optional explicit RAG prompt prefix. Leave empty unless an endpoint needs model-specific prompt directives. | +| `serviceConfig.llm.reasoningEnabled` | `false` | Request-level reasoning toggle for `/v1/answer`. When false, the service adds portable no-reasoning metadata; when true, requests leave reasoning behavior to the LLM endpoint defaults. | | `serviceConfig.vectordb.enabled` | `true` | Deploy the LanceDB vectordb Pod. When `true` the chart **requires** a resolvable embed endpoint (see [VectorDB and the embed endpoint](#vectordb-and-the-embed-endpoint)); `helm install` / `helm upgrade` fails fast otherwise. | | `serviceConfig.vectordb.lancedbUri` | `/data/vectordb` | LanceDB on the vectordb Pod's PVC. | | `serviceConfig.vectordb.embedModel` | `nvidia/llama-nemotron-embed-vl-1b-v2` | Passed to vectordb + worker `embed_model_name`. | @@ -384,13 +385,20 @@ llm: enabled: true model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" api_base: "http://answer-llm:8000/v1" - rag_system_prompt_prefix: "/no_think" + rag_system_prompt_prefix: null + reasoning_enabled: false ``` The retriever service then exposes `POST /v1/answer`, which calls the VectorDB pod's `/v1/query` endpoint for context and sends those chunks to -the configured LLM endpoint. The default Super-49B NIMService resources -request two GPUs (`nvidia.com/gpu: 2`) to match the bundled +the configured LLM endpoint. The `answer_llm` NIM deployment leaves +reasoning defaults model-neutral; `/v1/answer` controls reasoning per +request. With `serviceConfig.llm.reasoningEnabled=false`, the service adds +both `/no_think` and `chat_template_kwargs.enable_thinking=false` so +Super-49B and Nemotron 3 Nano both skip reasoning. Set +`serviceConfig.llm.reasoningEnabled=true` to leave answer requests at the +LLM endpoint's default reasoning behavior. The default Super-49B NIMService +resources request two GPUs (`nvidia.com/gpu: 2`) to match the bundled tensor-parallel NIM profile. Override `resources`, `modelProfile`, or `env` for deployments that use a different profile or hardware topology. When `answer_llm` is enabled and no explicit `serviceConfig.llm.apiKeySecret` @@ -410,7 +418,6 @@ helm upgrade --install retriever ./nemo_retriever/helm \ --set nimOperator.answer_llm.image.repository=nvcr.io/nim/nvidia/nemotron-3-nano \ --set nimOperator.answer_llm.image.tag=1.7.0-variant \ --set nimOperator.answer_llm.model=openai/nvidia/nemotron-3-nano-30b-a3b \ - --set nimOperator.answer_llm.ragSystemPromptPrefix= \ --set-json nimOperator.answer_llm.modelProfile='{"profiles":["5f89f01a0af587fd8bae50c611b1f358f92effdb9fb29362e1af0a986e5561c3"]}' \ --set-json nimOperator.answer_llm.resources='{"limits":{"nvidia.com/gpu":1},"requests":{"nvidia.com/gpu":1}}' \ --set nimOperator.answer_llm.env[0].name=NIM_HTTP_API_PORT \ @@ -427,8 +434,8 @@ or tags. `nimOperator.answer_llm.model` is the LiteLLM model id used by the retriever service; for an OpenAI-compatible in-cluster NIM, keep the `openai/` prefix there and set `NIM_SERVED_MODEL_NAME` to the raw model name advertised by the NIM. Replace the default Super-49B `modelProfile`, -`resources`, `env`, and prompt prefix when the target model requires a -different GPU/profile setup. Leaving `modelProfile` empty preserves NIM +`resources`, and `env` when the target model requires a different +GPU/profile setup. Leaving `modelProfile` empty preserves NIM Operator auto-discovery, but for Nano it can cache every advertised profile on first reconciliation; pin a known-compatible profile when you know the target GPU topology. @@ -464,7 +471,7 @@ pair gated on three conditions ALL holding: | `nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.enabled` | `false` | Omni 30B caption NIM (optional). Set `true` to enable image captioning — see [Image captioning (Omni 30B)](#image-captioning-omni-30b). Default `false` so 26.05 installs do not silently pull ≈ 62 GiB of BF16 weights or claim a second dedicated GPU. Image tag follows the [image tag conventions](#image-tag-conventions). | | `nimOperator.answer_llm.enabled` | `false` | Generic answer-generation LLM NIM (optional; Super-49B defaults). Set `true` to enable `/v1/answer` — see [Answer generation (operator-managed LLM)](#answer-generation-llm). Default `false` so installs do not silently claim answer-generation GPUs. | | `nimOperator.answer_llm.model` | `openai/nvidia/llama-3.3-nemotron-super-49b-v1.5` | LiteLLM/OpenAI model id inherited by `serviceConfig.llm.model` when the operator-managed answer LLM is enabled and no explicit service model is set. | -| `nimOperator.answer_llm.ragSystemPromptPrefix` | `/no_think` | Prompt prefix inherited by `serviceConfig.llm.ragSystemPromptPrefix` for the operator-managed answer LLM. Override to `""` for models that should receive the plain RAG prompt. | +| `nimOperator.answer_llm.ragSystemPromptPrefix` | `""` | Optional prompt prefix inherited by `serviceConfig.llm.ragSystemPromptPrefix` only when explicitly set. Leave empty to keep the operator-managed LLM model-neutral and use `serviceConfig.llm.reasoningEnabled` for request-level reasoning control. | | `nimOperator.audio.enabled` | `false` | Parakeet ASR NIM (optional). Set `true` for audio/video transcription; pair with `serviceConfig.nimEndpoints.audioGrpcEndpoint=audio:50051` so the retriever-service can reach it. | | `nimOperator..image.repository` | `nvcr.io/nim/nvidia/...` | Per-NIM image. | | `nimOperator..image.pullSecrets` | `[ngc-secret]` | Referenced by the NIMService CR. | diff --git a/nemo_retriever/helm/templates/configmap.yaml b/nemo_retriever/helm/templates/configmap.yaml index 7fd1083317..739b5ec9f3 100644 --- a/nemo_retriever/helm/templates/configmap.yaml +++ b/nemo_retriever/helm/templates/configmap.yaml @@ -85,6 +85,7 @@ llm: timeout: {{ .Values.serviceConfig.llm.timeout }} rag_system_prompt: {{ if .Values.serviceConfig.llm.ragSystemPrompt }}{{ .Values.serviceConfig.llm.ragSystemPrompt | quote }}{{ else }}null{{ end }} rag_system_prompt_prefix: {{ if .llmRagSystemPromptPrefix }}{{ .llmRagSystemPromptPrefix | quote }}{{ else }}null{{ end }} + reasoning_enabled: {{ .Values.serviceConfig.llm.reasoningEnabled }} pipeline: realtime_workers: {{ .Values.serviceConfig.pipeline.realtimeWorkers }} diff --git a/nemo_retriever/helm/values.yaml b/nemo_retriever/helm/values.yaml index 5083ad66eb..9e3e44ad27 100644 --- a/nemo_retriever/helm/values.yaml +++ b/nemo_retriever/helm/values.yaml @@ -521,9 +521,13 @@ serviceConfig: timeout: 180.0 ragSystemPrompt: "" # Optional prefix prepended to the RAG system prompt. Leave empty - # for external LLMs; the chart auto-wires /no_think when it - # resolves the operator-managed answer LLM endpoint. + # unless an endpoint needs model-specific prompt directives. ragSystemPromptPrefix: "" + # Unified request-level reasoning control for /v1/answer. When false, + # the service sends portable no-reasoning metadata understood by both + # Super-49B and Nemotron 3 Nano. Set true to leave reasoning behavior + # to the LLM endpoint defaults for answer requests. + reasoningEnabled: false # Pipeline worker pools. Workers are abstract dispatchers — sizing # depends on whether they do local GPU work or fan out to remote NIMs. @@ -1054,10 +1058,11 @@ nimOperator: # this operator-managed LLM is enabled and serviceConfig.llm.model is # left empty. model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" - # Prompt prefix to apply when this operator-managed LLM auto-wires - # /v1/answer. Super-49B uses /no_think by default; override to "" - # for models that should receive the plain RAG system prompt. - ragSystemPromptPrefix: "/no_think" + # Optional prompt prefix to apply when this operator-managed LLM + # auto-wires /v1/answer. Leave empty to keep the LLM deployment and + # prompt defaults model-neutral; use serviceConfig.llm.reasoningEnabled + # for the portable request-level reasoning toggle. + ragSystemPromptPrefix: "" image: repository: nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1.5 tag: "2.0.5" diff --git a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py index e69d58649d..91104ccdbd 100644 --- a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py +++ b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py @@ -12,6 +12,7 @@ from __future__ import annotations +from copy import deepcopy import logging import time from typing import Any, Optional @@ -37,6 +38,9 @@ Answer:""" +_NO_REASONING_SYSTEM_DIRECTIVE = "/no_think" +_NO_REASONING_EXTRA_PARAMS = {"chat_template_kwargs": {"enable_thinking": False}} + def _format_rag_system_prompt( *, @@ -75,6 +79,30 @@ def _build_rag_prompt( ] +def _deep_merge_dicts(left: dict[str, Any], right: dict[str, Any]) -> dict[str, Any]: + """Return a recursive merge where ``right`` wins without mutating inputs.""" + merged = deepcopy(left) + for key, value in right.items(): + if isinstance(merged.get(key), dict) and isinstance(value, dict): + merged[key] = _deep_merge_dicts(merged[key], value) + else: + merged[key] = deepcopy(value) + return merged + + +def _with_no_reasoning_controls(messages: list[dict]) -> list[dict]: + """Add no-reasoning prompt metadata understood by current Nemotron LLM NIMs.""" + updated = [dict(message) for message in messages] + if updated and updated[0].get("role") == "system": + content = str(updated[0].get("content") or "").strip() + if _NO_REASONING_SYSTEM_DIRECTIVE not in content: + content = f"{_NO_REASONING_SYSTEM_DIRECTIVE}\n{content}" if content else _NO_REASONING_SYSTEM_DIRECTIVE + updated[0]["content"] = content + return updated + updated.insert(0, {"role": "system", "content": _NO_REASONING_SYSTEM_DIRECTIVE}) + return updated + + class LiteLLMClient: """Unified LLM client backed by litellm. @@ -136,6 +164,7 @@ def from_kwargs( timeout: float = 120.0, rag_system_prompt: Optional[str] = None, rag_system_prompt_prefix: Optional[str] = None, + reasoning_enabled: bool = False, ) -> "LiteLLMClient": """Flat-kwarg constructor for zero-churn migration from the old signature. @@ -152,6 +181,7 @@ def from_kwargs( extra_params=extra_params or {}, rag_system_prompt=rag_system_prompt, rag_system_prompt_prefix=rag_system_prompt_prefix, + reasoning_enabled=reasoning_enabled, ) sampling = LLMInferenceParams( temperature=temperature, @@ -160,7 +190,12 @@ def from_kwargs( ) return cls(transport=transport, sampling=sampling) - def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tuple[str, float]: + def complete( + self, + messages: list[dict], + max_tokens: Optional[int] = None, + extra_params: Optional[dict[str, Any]] = None, + ) -> tuple[str, float]: """Raw litellm completion call. Returns (content_text, latency_s).""" import litellm @@ -179,7 +214,7 @@ def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tu call_kwargs["api_base"] = self.transport.api_base if self.transport.api_key: call_kwargs["api_key"] = self.transport.api_key - call_kwargs.update(self.transport.extra_params) + call_kwargs.update(_deep_merge_dicts(self.transport.extra_params, extra_params or {})) t0 = time.monotonic() try: @@ -202,11 +237,24 @@ def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tu content = (response.choices[0].message.content or "").strip() return content, latency - def generate(self, query: str, chunks: list[str]) -> GenerationResult: + def generate( + self, + query: str, + chunks: list[str], + *, + reasoning_enabled: Optional[bool] = None, + ) -> GenerationResult: """Generate an answer for the given query using retrieved chunks as context.""" messages = _build_rag_prompt(query, chunks, rag_system_prompt=self._rag_system_prompt) + request_extra_params: dict[str, Any] | None = None + effective_reasoning_enabled = ( + self.transport.reasoning_enabled if reasoning_enabled is None else reasoning_enabled + ) + if not effective_reasoning_enabled: + messages = _with_no_reasoning_controls(messages) + request_extra_params = _NO_REASONING_EXTRA_PARAMS try: - raw_answer, latency = self.complete(messages) + raw_answer, latency = self.complete(messages, extra_params=request_extra_params) answer = strip_think_tags(raw_answer) if not answer: return GenerationResult( diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 5579b7c648..6b0bcf444f 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -601,6 +601,7 @@ class LLMRemoteClientParams(_ParamsModel): extra_params: dict[str, Any] = Field(default_factory=dict) rag_system_prompt: Optional[str] = None rag_system_prompt_prefix: Optional[str] = None + reasoning_enabled: bool = False @field_validator("num_retries") @classmethod diff --git a/nemo_retriever/src/nemo_retriever/service/config.py b/nemo_retriever/src/nemo_retriever/service/config.py index 1e50338c03..ca132cc9d6 100644 --- a/nemo_retriever/src/nemo_retriever/service/config.py +++ b/nemo_retriever/src/nemo_retriever/service/config.py @@ -95,6 +95,7 @@ class LLMConfig(RichModel): timeout: float = 180.0 rag_system_prompt: str | None = None rag_system_prompt_prefix: str | None = None + reasoning_enabled: bool = False class ResourceLimitsConfig(RichModel): diff --git a/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml b/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml index 5d7472eea8..a9c11cecfa 100644 --- a/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml +++ b/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml @@ -64,6 +64,7 @@ llm: timeout: 180.0 rag_system_prompt: null rag_system_prompt_prefix: null + reasoning_enabled: false # Pipeline worker pools. Workers are abstract dispatchers — sizing # depends on whether they do local GPU work or fan out to remote NIMs. diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index 9b16a0463c..e5d5c9d693 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -84,6 +84,7 @@ class AnswerRequest(BaseModel): query: str top_k: int = Field(default=5, ge=1, le=1000) include_chunks: bool = False + reasoning_enabled: bool | None = None class AnswerResponse(BaseModel): @@ -1406,9 +1407,15 @@ async def answer(req: AnswerRequest, request: Request) -> Response | AnswerRespo timeout=llm_cfg.timeout, rag_system_prompt=llm_cfg.rag_system_prompt, rag_system_prompt_prefix=llm_cfg.rag_system_prompt_prefix, + reasoning_enabled=llm_cfg.reasoning_enabled, ) request.app.state.answer_llm_client = llm - gen = await asyncio.to_thread(llm.generate, req.query, chunks) + gen = await asyncio.to_thread( + llm.generate, + req.query, + chunks, + reasoning_enabled=req.reasoning_enabled, + ) if gen.error: logger.error("LLM answer generation failed for model %s: %s", gen.model, gen.error) raise HTTPException( diff --git a/nemo_retriever/tests/test_helm_answer_llm_generation.py b/nemo_retriever/tests/test_helm_answer_llm_generation.py index 2e61c777e9..331b7d9622 100644 --- a/nemo_retriever/tests/test_helm_answer_llm_generation.py +++ b/nemo_retriever/tests/test_helm_answer_llm_generation.py @@ -79,7 +79,8 @@ def test_values_define_generic_answer_llm_with_super49b_defaults(self) -> None: self.assertIn("nvidia.com/gpu: 2", block) self.assertIn('size: "250Gi"', block) self.assertIn(_SUPER49B_PROFILE, block) - self.assertIn('ragSystemPromptPrefix: "/no_think"', block) + self.assertIn("reasoningEnabled: false", values) + self.assertIn('ragSystemPromptPrefix: ""', block) def test_default_render_omits_answer_llm_and_disables_llm_answering(self) -> None: proc = _helm_template() @@ -107,7 +108,8 @@ def test_answer_llm_opt_in_renders_default_super49b_nim_and_autowires_llm_config self.assertIn("NCCL_P2P_DISABLE", proc.stdout) self.assertIn(f'api_base: "http://{_ANSWER_LLM_SERVICE}:8000/v1"', proc.stdout) self.assertIn(f'model: "{_SUPER49B_MODEL}"', proc.stdout) - self.assertIn('rag_system_prompt_prefix: "/no_think"', proc.stdout) + self.assertIn("reasoning_enabled: false", proc.stdout) + self.assertIn("rag_system_prompt_prefix: null", proc.stdout) self.assertIn("enabled: true", proc.stdout) self.assertIn("NEMO_RETRIEVER_LLM_API_KEY", proc.stdout) self.assertIn('name: "ngc-api"', proc.stdout) @@ -126,8 +128,6 @@ def test_answer_llm_can_swap_to_nano_image_model_and_profile(self) -> None: f"nimOperator.answer_llm.image.tag={_NANO_TAG}", "--set", f"nimOperator.answer_llm.model={_NANO_MODEL}", - "--set", - "nimOperator.answer_llm.ragSystemPromptPrefix=", "--set-json", f'nimOperator.answer_llm.modelProfile={{"profiles":["{_NANO_A100_PROFILE}"]}}', "--set-json", @@ -180,6 +180,22 @@ def test_explicit_llm_api_base_wins_without_operator_nim(self) -> None: self.assertIn('model: "openai/custom-answerer"', proc.stdout) self.assertIn("rag_system_prompt_prefix: null", proc.stdout) + def test_service_llm_reasoning_enabled_can_be_overridden(self) -> None: + proc = _helm_template( + extra_args=( + "--set", + "serviceConfig.llm.enabled=true", + "--set", + "serviceConfig.llm.apiBase=http://external-llm:8000/v1", + "--set", + "serviceConfig.llm.reasoningEnabled=true", + ) + ) + _assert_helm_ok(self, proc) + + self.assertIn("reasoning_enabled: true", proc.stdout) + self.assertIn("rag_system_prompt_prefix: null", proc.stdout) + def test_llm_api_key_secret_renders_env_not_configmap_value(self) -> None: proc = _helm_template( extra_args=( diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py index 4eb8703bbf..f8488a9c14 100644 --- a/nemo_retriever/tests/test_llm_params.py +++ b/nemo_retriever/tests/test_llm_params.py @@ -148,6 +148,12 @@ def test_from_kwargs_defaults_top_p_to_none(self): client = LiteLLMClient.from_kwargs(model="m") assert client.sampling.top_p is None + def test_from_kwargs_defaults_reasoning_disabled_for_rag_answers(self): + from nemo_retriever.llm.clients import LiteLLMClient + + client = LiteLLMClient.from_kwargs(model="m") + assert client.transport.reasoning_enabled is False + class TestLiteLLMCompleteCallKwargs: """Inspect the exact kwargs LiteLLMClient.complete() forwards to litellm.""" @@ -227,19 +233,49 @@ class TestLiteLLMRAGPrompt: """RAG prompt customization for local OpenAI-compatible answer models.""" @patch("litellm.completion") - def test_generate_prepends_rag_system_prompt_prefix(self, mock_completion): + def test_generate_disables_reasoning_with_portable_request_controls(self, mock_completion): from nemo_retriever.llm.clients import LiteLLMClient mock_completion.return_value = _fake_litellm_response("answer") - client = LiteLLMClient.from_kwargs(model="m", rag_system_prompt_prefix="/no_think") + client = LiteLLMClient.from_kwargs( + model="m", + extra_params={"chat_template_kwargs": {"reasoning_budget": 32}}, + ) result = client.generate(query="q", chunks=["ctx"]) - messages = mock_completion.call_args.kwargs["messages"] + kwargs = mock_completion.call_args.kwargs + messages = kwargs["messages"] assert messages[0]["role"] == "system" assert messages[0]["content"].startswith("/no_think\n") assert "precise question-answering assistant" in messages[0]["content"] + assert kwargs["chat_template_kwargs"] == {"reasoning_budget": 32, "enable_thinking": False} assert result.answer == "answer" + @patch("litellm.completion") + def test_generate_leaves_reasoning_request_defaults_when_enabled(self, mock_completion): + from nemo_retriever.llm.clients import LiteLLMClient + + mock_completion.return_value = _fake_litellm_response("answer") + client = LiteLLMClient.from_kwargs(model="m", reasoning_enabled=True) + client.generate(query="q", chunks=["ctx"]) + + kwargs = mock_completion.call_args.kwargs + messages = kwargs["messages"] + assert not messages[0]["content"].startswith("/no_think\n") + assert "chat_template_kwargs" not in kwargs + + @patch("litellm.completion") + def test_generate_can_override_reasoning_per_call(self, mock_completion): + from nemo_retriever.llm.clients import LiteLLMClient + + mock_completion.return_value = _fake_litellm_response("answer") + client = LiteLLMClient.from_kwargs(model="m", reasoning_enabled=True) + client.generate(query="q", chunks=["ctx"], reasoning_enabled=False) + + kwargs = mock_completion.call_args.kwargs + assert kwargs["messages"][0]["content"].startswith("/no_think\n") + assert kwargs["chat_template_kwargs"] == {"enable_thinking": False} + @patch("litellm.completion") def test_generate_uses_custom_rag_system_prompt(self, mock_completion): from nemo_retriever.llm.clients import LiteLLMClient diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py index ebde9245f6..829e542d80 100644 --- a/nemo_retriever/tests/test_service_answer_generation.py +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -47,7 +47,7 @@ async def _stub_work(_item): api_key="not-needed", max_tokens=128, timeout=180.0, - rag_system_prompt_prefix="/no_think", + reasoning_enabled=False, ), ) app = create_app(cfg) @@ -96,7 +96,7 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) fake_llm = SimpleNamespace( - generate=lambda query, chunks: GenerationResult( + generate=lambda query, chunks, *, reasoning_enabled=None: GenerationResult( answer=f"{query}: {len(chunks)} chunks", latency_s=0.25, model="openai/nvidia/llama-3.3-nemotron-super-49b-v1.5", @@ -137,7 +137,8 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: num_retries=3, timeout=180.0, rag_system_prompt=None, - rag_system_prompt_prefix="/no_think", + rag_system_prompt_prefix=None, + reasoning_enabled=False, ) @@ -201,6 +202,46 @@ async def _stub_work(_item): assert "VectorDB is not enabled" in resp.json()["detail"] +def test_answer_forwards_request_reasoning_override( + app_with_answer_config: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _FakeResponse: + status_code = 200 + content = json.dumps({"results": [{"hits": [{"text": "context"}]}]}).encode() + + def json(self) -> dict[str, Any]: + return json.loads(self.content.decode()) + + class _FakeAsyncClient: + def __init__(self, *args, **kwargs) -> None: + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + async def post(self, url: str, **kwargs) -> _FakeResponse: + return _FakeResponse() + + monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) + seen: dict[str, Any] = {} + + def _generate(query, chunks, *, reasoning_enabled=None): + seen["reasoning_enabled"] = reasoning_enabled + return GenerationResult(answer="ok", latency_s=0.1, model="m") + + fake_llm = SimpleNamespace(generate=_generate) + + with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs", return_value=fake_llm): + resp = app_with_answer_config.post("/v1/answer", json={"query": "q", "reasoning_enabled": True}) + + assert resp.status_code == 200, resp.text + assert seen["reasoning_enabled"] is True + + def test_answer_returns_502_when_llm_generation_fails( app_with_answer_config: TestClient, monkeypatch: pytest.MonkeyPatch, @@ -227,7 +268,7 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) fake_llm = SimpleNamespace( - generate=lambda query, chunks: GenerationResult( + generate=lambda query, chunks, *, reasoning_enabled=None: GenerationResult( answer="", latency_s=0.0, model="openai/nvidia/llama-3.3-nemotron-super-49b-v1.5", From 5b7492b8cec7b62246d8f6fba4fb747bda5e357a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 17:43:43 +0000 Subject: [PATCH 04/14] Address Greptile answer LLM cleanup --- .../helm/templates/nims/answer-llm.yaml | 4 ++-- .../src/nemo_retriever/llm/clients/litellm.py | 19 ++++++++----------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/nemo_retriever/helm/templates/nims/answer-llm.yaml b/nemo_retriever/helm/templates/nims/answer-llm.yaml index 5df26b12a0..022ce8a094 100644 --- a/nemo_retriever/helm/templates/nims/answer-llm.yaml +++ b/nemo_retriever/helm/templates/nims/answer-llm.yaml @@ -11,7 +11,7 @@ spec: ngc: modelPuller: "{{ $cfg.image.repository }}:{{ $cfg.image.tag }}" pullSecret: "{{ index $cfg.image.pullSecrets 0 }}" - authSecret: {{ $cfg.authSecret }} + authSecret: {{ $cfg.authSecret | quote }} {{- include "nemo-retriever.nimcache.modelBlock" (dict "context" $ "key" "answer_llm") | nindent 6 }} storage: pvc: @@ -31,7 +31,7 @@ spec: pullPolicy: {{ $cfg.image.pullPolicy }} pullSecrets: {{ toYaml $cfg.image.pullSecrets | indent 6 }} - authSecret: {{ $cfg.authSecret }} + authSecret: {{ $cfg.authSecret | quote }} storage: nimCache: name: {{ $name }} diff --git a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py index 91104ccdbd..fa11d3227f 100644 --- a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py +++ b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py @@ -61,20 +61,13 @@ def _build_rag_prompt( query: str, chunks: list[str], *, - rag_system_prompt: Optional[str] = None, - rag_system_prompt_prefix: Optional[str] = None, + formatted_rag_system_prompt: str, ) -> list[dict]: """Build the OpenAI-style messages list for a RAG prompt.""" context = "\n\n---\n\n".join(chunks) if chunks else "(no context retrieved)" user_content = _RAG_USER_TEMPLATE.format(context=context, query=query) return [ - { - "role": "system", - "content": _format_rag_system_prompt( - rag_system_prompt=rag_system_prompt, - rag_system_prompt_prefix=rag_system_prompt_prefix, - ), - }, + {"role": "system", "content": formatted_rag_system_prompt}, {"role": "user", "content": user_content}, ] @@ -139,7 +132,7 @@ def __init__( # ``max_tokens=1024`` for captioning/summarization workloads; RAG # answers routinely exceed that, so the client overrides it. self.sampling = sampling if sampling is not None else LLMInferenceParams(temperature=0.0, max_tokens=4096) - self._rag_system_prompt = _format_rag_system_prompt( + self._formatted_rag_system_prompt = _format_rag_system_prompt( rag_system_prompt=transport.rag_system_prompt, rag_system_prompt_prefix=transport.rag_system_prompt_prefix, ) @@ -245,7 +238,11 @@ def generate( reasoning_enabled: Optional[bool] = None, ) -> GenerationResult: """Generate an answer for the given query using retrieved chunks as context.""" - messages = _build_rag_prompt(query, chunks, rag_system_prompt=self._rag_system_prompt) + messages = _build_rag_prompt( + query, + chunks, + formatted_rag_system_prompt=self._formatted_rag_system_prompt, + ) request_extra_params: dict[str, Any] | None = None effective_reasoning_enabled = ( self.transport.reasoning_enabled if reasoning_enabled is None else reasoning_enabled From 680be66a02d6e8ca45439c49a71e7f42b6efbbb8 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 17:58:44 +0000 Subject: [PATCH 05/14] Forward answer LLM reasoning in library mode --- .../helm/templates/nims/answer-llm.yaml | 6 +++- .../nemo_retriever/evaluation/generation.py | 3 ++ .../src/nemo_retriever/retriever.py | 1 + .../tests/test_helm_answer_llm_generation.py | 28 +++++++++++++++++++ nemo_retriever/tests/test_llm_params.py | 24 ++++++++++++++++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/helm/templates/nims/answer-llm.yaml b/nemo_retriever/helm/templates/nims/answer-llm.yaml index 022ce8a094..fe17471335 100644 --- a/nemo_retriever/helm/templates/nims/answer-llm.yaml +++ b/nemo_retriever/helm/templates/nims/answer-llm.yaml @@ -44,7 +44,11 @@ spec: expose: {{ toYaml $cfg.expose | indent 4 }} env: -{{ toYaml $cfg.env | indent 4 }} +{{- range $envVar := $cfg.env }} +{{- if ne (toString $envVar.name) "NGC_API_KEY" }} +{{ toYaml (list $envVar) | indent 4 }} +{{- end }} +{{- end }} - name: NGC_API_KEY valueFrom: secretKeyRef: diff --git a/nemo_retriever/src/nemo_retriever/evaluation/generation.py b/nemo_retriever/src/nemo_retriever/evaluation/generation.py index db9d474823..076b5a6185 100644 --- a/nemo_retriever/src/nemo_retriever/evaluation/generation.py +++ b/nemo_retriever/src/nemo_retriever/evaluation/generation.py @@ -44,6 +44,7 @@ def __init__( max_workers: int = 8, rag_system_prompt: Optional[str] = None, rag_system_prompt_prefix: Optional[str] = None, + reasoning_enabled: bool = False, ) -> None: super().__init__( model=model, @@ -58,6 +59,7 @@ def __init__( max_workers=max_workers, rag_system_prompt=rag_system_prompt, rag_system_prompt_prefix=rag_system_prompt_prefix, + reasoning_enabled=reasoning_enabled, ) self._client = LiteLLMClient.from_kwargs( model=model, @@ -71,6 +73,7 @@ def __init__( timeout=timeout, rag_system_prompt=rag_system_prompt, rag_system_prompt_prefix=rag_system_prompt_prefix, + reasoning_enabled=reasoning_enabled, ) self._max_workers = max_workers diff --git a/nemo_retriever/src/nemo_retriever/retriever.py b/nemo_retriever/src/nemo_retriever/retriever.py index 85fdaad4f5..f8e1d83d15 100644 --- a/nemo_retriever/src/nemo_retriever/retriever.py +++ b/nemo_retriever/src/nemo_retriever/retriever.py @@ -624,6 +624,7 @@ def generate( timeout=transport.timeout, rag_system_prompt=transport.rag_system_prompt, rag_system_prompt_prefix=transport.rag_system_prompt_prefix, + reasoning_enabled=transport.reasoning_enabled, ) else: operator = QAGenerationOperator(model=model, **kwargs) diff --git a/nemo_retriever/tests/test_helm_answer_llm_generation.py b/nemo_retriever/tests/test_helm_answer_llm_generation.py index 331b7d9622..aa9ce3882d 100644 --- a/nemo_retriever/tests/test_helm_answer_llm_generation.py +++ b/nemo_retriever/tests/test_helm_answer_llm_generation.py @@ -196,6 +196,34 @@ def test_service_llm_reasoning_enabled_can_be_overridden(self) -> None: self.assertIn("reasoning_enabled: true", proc.stdout) self.assertIn("rag_system_prompt_prefix: null", proc.stdout) + def test_answer_llm_filters_reserved_ngc_api_key_from_custom_env(self) -> None: + proc = _helm_template( + extra_args=( + "--set", + "nimOperator.answer_llm.enabled=true", + "--set", + "nimOperator.answer_llm.env[0].name=NGC_API_KEY", + "--set-string", + "nimOperator.answer_llm.env[0].value=ignored-user-secret", + "--set", + "nimOperator.answer_llm.env[1].name=OTHER_ENV", + "--set-string", + "nimOperator.answer_llm.env[1].value=kept", + ) + ) + _assert_helm_ok(self, proc) + + answer_llm_service_start = proc.stdout.index(f"kind: NIMService\nmetadata:\n name: {_ANSWER_LLM_SERVICE}") + next_manifest_start = proc.stdout.find("\n---", answer_llm_service_start + 1) + answer_llm_service = proc.stdout[ + answer_llm_service_start : next_manifest_start if next_manifest_start != -1 else len(proc.stdout) + ] + + self.assertNotIn("ignored-user-secret", answer_llm_service) + self.assertIn("OTHER_ENV", answer_llm_service) + self.assertIn("value: kept", answer_llm_service) + self.assertEqual(answer_llm_service.count("- name: NGC_API_KEY"), 1) + def test_llm_api_key_secret_renders_env_not_configmap_value(self) -> None: proc = _helm_template( extra_args=( diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py index f8488a9c14..8b8f1f693c 100644 --- a/nemo_retriever/tests/test_llm_params.py +++ b/nemo_retriever/tests/test_llm_params.py @@ -387,6 +387,30 @@ def test_qa_generation_operator_constructs_cleanly(self, mock_completion): assert op._client.sampling.temperature == 0.0 assert op._client.sampling.max_tokens == 128 + def test_qa_generation_operator_forwards_reasoning_enabled(self): + from nemo_retriever.evaluation.generation import QAGenerationOperator + + op = QAGenerationOperator(model="m", reasoning_enabled=True) + assert op._client.transport.reasoning_enabled is True + + def test_pipeline_builder_generate_forwards_reasoning_enabled(self): + from unittest.mock import MagicMock + + from nemo_retriever.evaluation.generation import QAGenerationOperator + from nemo_retriever.llm.clients import LiteLLMClient + from nemo_retriever.retriever import RetrieverPipelineBuilder + + retriever = MagicMock() + retriever.top_k = 5 + builder = RetrieverPipelineBuilder(retriever, top_k=5) + + llm = LiteLLMClient.from_kwargs(model="m", reasoning_enabled=True) + builder.generate(llm) + + generation_ops = [s for s in builder._steps if isinstance(s, QAGenerationOperator)] + assert len(generation_ops) == 1 + assert generation_ops[0]._client.transport.reasoning_enabled is True + def test_judging_operator_constructs_cleanly(self): from nemo_retriever.evaluation.judging import JudgingOperator From 5670a945b4c6f718ba3fb6b45efec8ff8cd3773d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:10:33 +0000 Subject: [PATCH 06/14] Default missing LLM reasoning flag --- .../src/nemo_retriever/retriever.py | 2 +- nemo_retriever/tests/test_llm_params.py | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/retriever.py b/nemo_retriever/src/nemo_retriever/retriever.py index f8e1d83d15..af8a914ee3 100644 --- a/nemo_retriever/src/nemo_retriever/retriever.py +++ b/nemo_retriever/src/nemo_retriever/retriever.py @@ -624,7 +624,7 @@ def generate( timeout=transport.timeout, rag_system_prompt=transport.rag_system_prompt, rag_system_prompt_prefix=transport.rag_system_prompt_prefix, - reasoning_enabled=transport.reasoning_enabled, + reasoning_enabled=getattr(transport, "reasoning_enabled", False), ) else: operator = QAGenerationOperator(model=model, **kwargs) diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py index 8b8f1f693c..58cf1e3511 100644 --- a/nemo_retriever/tests/test_llm_params.py +++ b/nemo_retriever/tests/test_llm_params.py @@ -411,6 +411,36 @@ def test_pipeline_builder_generate_forwards_reasoning_enabled(self): assert len(generation_ops) == 1 assert generation_ops[0]._client.transport.reasoning_enabled is True + def test_pipeline_builder_generate_defaults_reasoning_disabled_for_legacy_client(self): + from types import SimpleNamespace + from unittest.mock import MagicMock + + from nemo_retriever.evaluation.generation import QAGenerationOperator + from nemo_retriever.retriever import RetrieverPipelineBuilder + + retriever = MagicMock() + retriever.top_k = 5 + builder = RetrieverPipelineBuilder(retriever, top_k=5) + llm = SimpleNamespace( + transport=SimpleNamespace( + model="m", + api_base=None, + api_key=None, + extra_params={}, + num_retries=3, + timeout=120.0, + rag_system_prompt=None, + rag_system_prompt_prefix=None, + ), + sampling=SimpleNamespace(temperature=0.0, top_p=None, max_tokens=128), + ) + + builder.generate(llm) + + generation_ops = [s for s in builder._steps if isinstance(s, QAGenerationOperator)] + assert len(generation_ops) == 1 + assert generation_ops[0]._client.transport.reasoning_enabled is False + def test_judging_operator_constructs_cleanly(self): from nemo_retriever.evaluation.judging import JudgingOperator From 5327e5ad4f052cb09a8c45dac8a331832b40cdca Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:22:41 +0000 Subject: [PATCH 07/14] Stabilize service pipeline text extraction test --- .../tests/test_service_pipeline_spec.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/nemo_retriever/tests/test_service_pipeline_spec.py b/nemo_retriever/tests/test_service_pipeline_spec.py index 372e105868..92f9b22251 100644 --- a/nemo_retriever/tests/test_service_pipeline_spec.py +++ b/nemo_retriever/tests/test_service_pipeline_spec.py @@ -35,6 +35,18 @@ from nemo_retriever.service_ingestor import ServiceIngestor +class _TinyTokenizer: + def __init__(self) -> None: + self._text = "" + + def encode(self, text: str, *, add_special_tokens: bool = False) -> list[int]: + self._text = text + return list(range(len(text))) + + def decode(self, ids: list[int], *, skip_special_tokens: bool = True) -> str: + return "".join(self._text[i] for i in ids) + + @pytest.fixture(autouse=True) def _no_remote_api_keys(monkeypatch: pytest.MonkeyPatch) -> None: # _ParamsModel auto-resolves unset *api_key fields from these env vars, @@ -359,7 +371,6 @@ def test_build_graph_ingestor_does_not_attach_asr_params_for_pdf_upload() -> Non base_extract: dict[str, object] = {} base_asr = {"audio_endpoints": ["audio:50051", None]} spec = {"extraction_mode": "auto", "stage_order": ["extract"]} - ingestor, mode, _ = _build_graph_ingestor_from_spec( "report.pdf", b"%PDF-1.4 stub", @@ -478,8 +489,10 @@ def test_run_pipeline_in_process_rejects_empty_text_like_output() -> None: _run_pipeline_in_process("empty.txt", b"", {}, None, None, spec) -def test_run_pipeline_in_process_html_txt_produce_rows() -> None: +def test_run_pipeline_in_process_html_txt_produce_rows(monkeypatch: pytest.MonkeyPatch) -> None: spec = {"extraction_mode": "auto", "stage_order": ["extract"]} + monkeypatch.setattr("nemo_retriever.html.convert._get_txt_tokenizer", lambda *_, **__: _TinyTokenizer()) + monkeypatch.setattr("nemo_retriever.txt.split._get_tokenizer", lambda *_, **__: _TinyTokenizer()) html_rows, _, _ = _run_pipeline_in_process( "page.html", b"

Title

body

", From 71a78f4b646666b3dc7717538a675ce22d9bdbf1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:42:59 +0000 Subject: [PATCH 08/14] Preserve provider reasoning by default --- .../src/nemo_retriever/evaluation/generation.py | 2 +- nemo_retriever/src/nemo_retriever/llm/clients/litellm.py | 2 +- nemo_retriever/src/nemo_retriever/params/models.py | 2 +- nemo_retriever/src/nemo_retriever/retriever.py | 2 +- nemo_retriever/tests/test_llm_params.py | 9 +++++---- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/evaluation/generation.py b/nemo_retriever/src/nemo_retriever/evaluation/generation.py index 076b5a6185..b2c43123ac 100644 --- a/nemo_retriever/src/nemo_retriever/evaluation/generation.py +++ b/nemo_retriever/src/nemo_retriever/evaluation/generation.py @@ -44,7 +44,7 @@ def __init__( max_workers: int = 8, rag_system_prompt: Optional[str] = None, rag_system_prompt_prefix: Optional[str] = None, - reasoning_enabled: bool = False, + reasoning_enabled: bool = True, ) -> None: super().__init__( model=model, diff --git a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py index fa11d3227f..1f2a876354 100644 --- a/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py +++ b/nemo_retriever/src/nemo_retriever/llm/clients/litellm.py @@ -157,7 +157,7 @@ def from_kwargs( timeout: float = 120.0, rag_system_prompt: Optional[str] = None, rag_system_prompt_prefix: Optional[str] = None, - reasoning_enabled: bool = False, + reasoning_enabled: bool = True, ) -> "LiteLLMClient": """Flat-kwarg constructor for zero-churn migration from the old signature. diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 6b0bcf444f..3e1b08057c 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -601,7 +601,7 @@ class LLMRemoteClientParams(_ParamsModel): extra_params: dict[str, Any] = Field(default_factory=dict) rag_system_prompt: Optional[str] = None rag_system_prompt_prefix: Optional[str] = None - reasoning_enabled: bool = False + reasoning_enabled: bool = True @field_validator("num_retries") @classmethod diff --git a/nemo_retriever/src/nemo_retriever/retriever.py b/nemo_retriever/src/nemo_retriever/retriever.py index af8a914ee3..03e25c4a73 100644 --- a/nemo_retriever/src/nemo_retriever/retriever.py +++ b/nemo_retriever/src/nemo_retriever/retriever.py @@ -624,7 +624,7 @@ def generate( timeout=transport.timeout, rag_system_prompt=transport.rag_system_prompt, rag_system_prompt_prefix=transport.rag_system_prompt_prefix, - reasoning_enabled=getattr(transport, "reasoning_enabled", False), + reasoning_enabled=getattr(transport, "reasoning_enabled", True), ) else: operator = QAGenerationOperator(model=model, **kwargs) diff --git a/nemo_retriever/tests/test_llm_params.py b/nemo_retriever/tests/test_llm_params.py index 58cf1e3511..449d70618d 100644 --- a/nemo_retriever/tests/test_llm_params.py +++ b/nemo_retriever/tests/test_llm_params.py @@ -148,11 +148,11 @@ def test_from_kwargs_defaults_top_p_to_none(self): client = LiteLLMClient.from_kwargs(model="m") assert client.sampling.top_p is None - def test_from_kwargs_defaults_reasoning_disabled_for_rag_answers(self): + def test_from_kwargs_preserves_provider_reasoning_by_default(self): from nemo_retriever.llm.clients import LiteLLMClient client = LiteLLMClient.from_kwargs(model="m") - assert client.transport.reasoning_enabled is False + assert client.transport.reasoning_enabled is True class TestLiteLLMCompleteCallKwargs: @@ -240,6 +240,7 @@ def test_generate_disables_reasoning_with_portable_request_controls(self, mock_c client = LiteLLMClient.from_kwargs( model="m", extra_params={"chat_template_kwargs": {"reasoning_budget": 32}}, + reasoning_enabled=False, ) result = client.generate(query="q", chunks=["ctx"]) @@ -411,7 +412,7 @@ def test_pipeline_builder_generate_forwards_reasoning_enabled(self): assert len(generation_ops) == 1 assert generation_ops[0]._client.transport.reasoning_enabled is True - def test_pipeline_builder_generate_defaults_reasoning_disabled_for_legacy_client(self): + def test_pipeline_builder_generate_defaults_reasoning_enabled_for_legacy_client(self): from types import SimpleNamespace from unittest.mock import MagicMock @@ -439,7 +440,7 @@ def test_pipeline_builder_generate_defaults_reasoning_disabled_for_legacy_client generation_ops = [s for s in builder._steps if isinstance(s, QAGenerationOperator)] assert len(generation_ops) == 1 - assert generation_ops[0]._client.transport.reasoning_enabled is False + assert generation_ops[0]._client.transport.reasoning_enabled is True def test_judging_operator_constructs_cleanly(self): from nemo_retriever.evaluation.judging import JudgingOperator From c830385427fc3e99c31e19e55a209e5177dc5a33 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 19:09:04 +0000 Subject: [PATCH 09/14] Decouple answer metadata response flag --- .../nemo_retriever/service/routers/ingest.py | 3 +- .../tests/test_service_answer_generation.py | 58 +++++++++++++++++++ .../tests/test_service_packaging.py | 2 +- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index bc68806622..83f8e96e54 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -85,6 +85,7 @@ class AnswerRequest(BaseModel): query: str top_k: int = Field(default=5, ge=1, le=1000) include_chunks: bool = False + include_metadata: bool = False reasoning_enabled: bool | None = None @@ -1465,7 +1466,7 @@ async def answer(req: AnswerRequest, request: Request) -> Response | AnswerRespo latency_s=gen.latency_s, chunk_count=len(chunks), chunks=chunks if req.include_chunks else None, - metadata=metadata if req.include_chunks else None, + metadata=metadata if req.include_chunks or req.include_metadata else None, ) diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py index 829e542d80..1b0f597263 100644 --- a/nemo_retriever/tests/test_service_answer_generation.py +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -142,6 +142,64 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: ) +def test_answer_can_return_metadata_without_chunks( + app_with_answer_config: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _FakeResponse: + status_code = 200 + content = json.dumps( + { + "results": [ + { + "hits": [ + { + "text": "citation context", + "source": "doc.pdf", + "page_number": 3, + } + ] + } + ] + } + ).encode() + + def json(self) -> dict[str, Any]: + return json.loads(self.content.decode()) + + class _FakeAsyncClient: + def __init__(self, *args, **kwargs) -> None: + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + async def post(self, url: str, **kwargs) -> _FakeResponse: + return _FakeResponse() + + monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) + seen: dict[str, Any] = {} + + def _generate(query, chunks, *, reasoning_enabled=None): + seen["chunks"] = chunks + return GenerationResult(answer="answer", latency_s=0.1, model="m") + + fake_llm = SimpleNamespace(generate=_generate) + + with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs", return_value=fake_llm): + resp = app_with_answer_config.post("/v1/answer", json={"query": "q", "include_metadata": True}) + + assert resp.status_code == 200, resp.text + body = resp.json() + assert body["chunk_count"] == 1 + assert body["chunks"] is None + assert body["metadata"] == [{"source": "doc.pdf", "page_number": 3}] + assert seen["chunks"] == ["citation context"] + + def test_answer_returns_404_when_llm_disabled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: async def _stub_work(_item): return 0, [] diff --git a/nemo_retriever/tests/test_service_packaging.py b/nemo_retriever/tests/test_service_packaging.py index 897c4fff06..789bf31741 100644 --- a/nemo_retriever/tests/test_service_packaging.py +++ b/nemo_retriever/tests/test_service_packaging.py @@ -25,4 +25,4 @@ def test_service_extra_includes_litellm_for_answer_generation() -> None: litellm = next((req for req in requirements if req.name == "litellm"), None) assert litellm is not None - assert litellm.specifier.contains("1.86.0") + assert any(str(spec).startswith(">=") and "1.86.0" in str(spec) for spec in litellm.specifier) From e3df3961a394354ca7f7a024607b645b0ea64108 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 19:26:42 +0000 Subject: [PATCH 10/14] Align live RAG fake LLM transport --- nemo_retriever/tests/test_live_rag.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_retriever/tests/test_live_rag.py b/nemo_retriever/tests/test_live_rag.py index 5233125f10..4a81089a7f 100644 --- a/nemo_retriever/tests/test_live_rag.py +++ b/nemo_retriever/tests/test_live_rag.py @@ -628,6 +628,7 @@ def _build_fake_llm_client(*, top_p: float | None = None): timeout=120.0, rag_system_prompt=None, rag_system_prompt_prefix=None, + reasoning_enabled=True, ) sampling = SimpleNamespace(temperature=0.0, top_p=top_p, max_tokens=512) return SimpleNamespace(transport=transport, sampling=sampling) From 0e83a1a0cc114a522c97d1afdbb700e79fbe1a8e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Jun 2026 19:42:17 +0000 Subject: [PATCH 11/14] Tighten answer response and pull secret guards --- .../helm/templates/nims/answer-llm.yaml | 2 +- .../src/nemo_retriever/service/routers/ingest.py | 2 +- .../tests/test_helm_answer_llm_generation.py | 16 ++++++++++++++++ .../tests/test_service_answer_generation.py | 12 ++++++++++-- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/nemo_retriever/helm/templates/nims/answer-llm.yaml b/nemo_retriever/helm/templates/nims/answer-llm.yaml index fe17471335..98332edb57 100644 --- a/nemo_retriever/helm/templates/nims/answer-llm.yaml +++ b/nemo_retriever/helm/templates/nims/answer-llm.yaml @@ -10,7 +10,7 @@ spec: source: ngc: modelPuller: "{{ $cfg.image.repository }}:{{ $cfg.image.tag }}" - pullSecret: "{{ index $cfg.image.pullSecrets 0 }}" + pullSecret: {{ if $cfg.image.pullSecrets }}{{ index $cfg.image.pullSecrets 0 | quote }}{{ else }}{{ fail "nimOperator.answer_llm.image.pullSecrets must not be empty" }}{{ end }} authSecret: {{ $cfg.authSecret | quote }} {{- include "nemo-retriever.nimcache.modelBlock" (dict "context" $ "key" "answer_llm") | nindent 6 }} storage: diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index 83f8e96e54..4ec2a647c0 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -1466,7 +1466,7 @@ async def answer(req: AnswerRequest, request: Request) -> Response | AnswerRespo latency_s=gen.latency_s, chunk_count=len(chunks), chunks=chunks if req.include_chunks else None, - metadata=metadata if req.include_chunks or req.include_metadata else None, + metadata=metadata if req.include_metadata else None, ) diff --git a/nemo_retriever/tests/test_helm_answer_llm_generation.py b/nemo_retriever/tests/test_helm_answer_llm_generation.py index aa9ce3882d..2f360f9abe 100644 --- a/nemo_retriever/tests/test_helm_answer_llm_generation.py +++ b/nemo_retriever/tests/test_helm_answer_llm_generation.py @@ -115,6 +115,22 @@ def test_answer_llm_opt_in_renders_default_super49b_nim_and_autowires_llm_config self.assertIn('name: "ngc-api"', proc.stdout) self.assertIn('key: "NGC_API_KEY"', proc.stdout) + def test_answer_llm_rejects_empty_pull_secrets(self) -> None: + proc = _helm_template( + extra_args=( + "--set", + "nimOperator.answer_llm.enabled=true", + "--set-json", + "nimOperator.answer_llm.image.pullSecrets=[]", + ) + ) + + self.assertNotEqual(proc.returncode, 0) + self.assertIn( + "nimOperator.answer_llm.image.pullSecrets must not be empty", + proc.stdout + proc.stderr, + ) + def test_answer_llm_can_swap_to_nano_image_model_and_profile(self) -> None: proc = _helm_template( extra_args=( diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py index 1b0f597263..726aefb834 100644 --- a/nemo_retriever/tests/test_service_answer_generation.py +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -106,7 +106,7 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs", return_value=fake_llm) as from_kwargs: resp = app_with_answer_config.post( "/v1/answer", - json={"query": "What generates answers?", "top_k": 2, "include_chunks": True}, + json={"query": "What generates answers?", "top_k": 2, "include_chunks": True, "include_metadata": True}, ) assert resp.status_code == 200, resp.text @@ -142,7 +142,7 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: ) -def test_answer_can_return_metadata_without_chunks( +def test_answer_response_fields_respect_chunk_and_metadata_flags( app_with_answer_config: TestClient, monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -199,6 +199,14 @@ def _generate(query, chunks, *, reasoning_enabled=None): assert body["metadata"] == [{"source": "doc.pdf", "page_number": 3}] assert seen["chunks"] == ["citation context"] + resp = app_with_answer_config.post("/v1/answer", json={"query": "q", "include_chunks": True}) + + assert resp.status_code == 200, resp.text + body = resp.json() + assert body["chunk_count"] == 1 + assert body["chunks"] == ["citation context"] + assert body["metadata"] is None + def test_answer_returns_404_when_llm_disabled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: async def _stub_work(_item): From 3e5352afb98ddc75504aab39edfc5686f5e668f3 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Jun 2026 15:23:47 +0000 Subject: [PATCH 12/14] Address Greptile answer LLM feedback --- nemo_retriever/helm/README.md | 15 ++++--- nemo_retriever/helm/values.yaml | 10 ++--- .../src/nemo_retriever/service/config.py | 2 +- .../service/retriever-service.yaml | 2 +- .../nemo_retriever/service/routers/ingest.py | 7 ++- .../tests/test_helm_answer_llm_generation.py | 11 ++--- .../tests/test_service_answer_generation.py | 45 +++++++++++++++++++ 7 files changed, 71 insertions(+), 21 deletions(-) diff --git a/nemo_retriever/helm/README.md b/nemo_retriever/helm/README.md index f86e4dfe90..b0a96750c5 100644 --- a/nemo_retriever/helm/README.md +++ b/nemo_retriever/helm/README.md @@ -325,7 +325,7 @@ The retriever service picks up the in-cluster ASR endpoint when `nimOperator.aud | `serviceConfig.llm.apiKeySecret.key` | `api_key` | Secret key for external LLM credentials. Operator-managed `answer_llm` uses `NGC_API_KEY` from `nimOperator.answer_llm.authSecret` when no explicit LLM Secret is set. | | `serviceConfig.llm.model` | `""` | Optional explicit LiteLLM model id. Leave empty to inherit `nimOperator.answer_llm.model` when using the operator-managed answer LLM; set it for external endpoints. | | `serviceConfig.llm.ragSystemPromptPrefix` | `""` | Optional explicit RAG prompt prefix. Leave empty unless an endpoint needs model-specific prompt directives. | -| `serviceConfig.llm.reasoningEnabled` | `false` | Request-level reasoning toggle for `/v1/answer`. When false, the service adds portable no-reasoning metadata; when true, requests leave reasoning behavior to the LLM endpoint defaults. | +| `serviceConfig.llm.reasoningEnabled` | `true` | Request-level reasoning toggle for `/v1/answer`. Defaults to true for external OpenAI-compatible providers; set false for Nemotron endpoints that should receive portable no-reasoning controls. | | `serviceConfig.vectordb.enabled` | `true` | Deploy the LanceDB vectordb Pod. When `true` the chart **requires** a resolvable embed endpoint (see [VectorDB and the embed endpoint](#vectordb-and-the-embed-endpoint)); `helm install` / `helm upgrade` fails fast otherwise. | | `serviceConfig.vectordb.lancedbUri` | `/data/vectordb` | LanceDB on the vectordb Pod's PVC. | | `serviceConfig.vectordb.embedModel` | `nvidia/llama-nemotron-embed-vl-1b-v2` | Passed to vectordb + worker `embed_model_name`. | @@ -386,18 +386,19 @@ llm: model: "openai/nvidia/llama-3.3-nemotron-super-49b-v1.5" api_base: "http://answer-llm:8000/v1" rag_system_prompt_prefix: null - reasoning_enabled: false + reasoning_enabled: true ``` The retriever service then exposes `POST /v1/answer`, which calls the VectorDB pod's `/v1/query` endpoint for context and sends those chunks to the configured LLM endpoint. The `answer_llm` NIM deployment leaves reasoning defaults model-neutral; `/v1/answer` controls reasoning per -request. With `serviceConfig.llm.reasoningEnabled=false`, the service adds -both `/no_think` and `chat_template_kwargs.enable_thinking=false` so -Super-49B and Nemotron 3 Nano both skip reasoning. Set -`serviceConfig.llm.reasoningEnabled=true` to leave answer requests at the -LLM endpoint's default reasoning behavior. The default Super-49B NIMService +request. By default, `serviceConfig.llm.reasoningEnabled=true`, so requests +leave reasoning behavior to the LLM endpoint defaults and avoid sending +provider-specific `chat_template_kwargs` to external OpenAI-compatible +endpoints. Set `serviceConfig.llm.reasoningEnabled=false` for Nemotron +endpoints that should skip reasoning; the service then adds both `/no_think` +and `chat_template_kwargs.enable_thinking=false`. The default Super-49B NIMService resources request two GPUs (`nvidia.com/gpu: 2`) to match the bundled tensor-parallel NIM profile. Override `resources`, `modelProfile`, or `env` for deployments that use a different profile or hardware topology. diff --git a/nemo_retriever/helm/values.yaml b/nemo_retriever/helm/values.yaml index 3e54824226..4a1e1eb587 100644 --- a/nemo_retriever/helm/values.yaml +++ b/nemo_retriever/helm/values.yaml @@ -523,11 +523,11 @@ serviceConfig: # Optional prefix prepended to the RAG system prompt. Leave empty # unless an endpoint needs model-specific prompt directives. ragSystemPromptPrefix: "" - # Unified request-level reasoning control for /v1/answer. When false, - # the service sends portable no-reasoning metadata understood by both - # Super-49B and Nemotron 3 Nano. Set true to leave reasoning behavior - # to the LLM endpoint defaults for answer requests. - reasoningEnabled: false + # Unified request-level reasoning control for /v1/answer. Defaults to true + # so external OpenAI-compatible providers receive only standard request + # parameters. Set false for Nemotron endpoints that should receive the + # portable no-reasoning controls. + reasoningEnabled: true # Pipeline worker pools. Workers are abstract dispatchers — sizing # depends on whether they do local GPU work or fan out to remote NIMs. diff --git a/nemo_retriever/src/nemo_retriever/service/config.py b/nemo_retriever/src/nemo_retriever/service/config.py index ca132cc9d6..6a8ce6cf95 100644 --- a/nemo_retriever/src/nemo_retriever/service/config.py +++ b/nemo_retriever/src/nemo_retriever/service/config.py @@ -95,7 +95,7 @@ class LLMConfig(RichModel): timeout: float = 180.0 rag_system_prompt: str | None = None rag_system_prompt_prefix: str | None = None - reasoning_enabled: bool = False + reasoning_enabled: bool = True class ResourceLimitsConfig(RichModel): diff --git a/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml b/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml index a9c11cecfa..25d122ec21 100644 --- a/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml +++ b/nemo_retriever/src/nemo_retriever/service/retriever-service.yaml @@ -64,7 +64,7 @@ llm: timeout: 180.0 rag_system_prompt: null rag_system_prompt_prefix: null - reasoning_enabled: false + reasoning_enabled: true # Pipeline worker pools. Workers are abstract dispatchers — sizing # depends on whether they do local GPU work or fan out to remote NIMs. diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index 4ec2a647c0..b0dc856058 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -1366,8 +1366,11 @@ async def pipeline_config(request: Request): def _text_from_hit(hit: dict[str, Any]) -> str: - value = hit.get("text") or hit.get("content") or hit.get("chunk") or hit.get("page_content") - return str(value or "") + for key in ("text", "content", "chunk", "page_content"): + value = hit.get(key) + if value is not None: + return str(value) + return "" def _metadata_from_hit(hit: dict[str, Any]) -> dict[str, Any]: diff --git a/nemo_retriever/tests/test_helm_answer_llm_generation.py b/nemo_retriever/tests/test_helm_answer_llm_generation.py index 2f360f9abe..c962e303bd 100644 --- a/nemo_retriever/tests/test_helm_answer_llm_generation.py +++ b/nemo_retriever/tests/test_helm_answer_llm_generation.py @@ -79,7 +79,7 @@ def test_values_define_generic_answer_llm_with_super49b_defaults(self) -> None: self.assertIn("nvidia.com/gpu: 2", block) self.assertIn('size: "250Gi"', block) self.assertIn(_SUPER49B_PROFILE, block) - self.assertIn("reasoningEnabled: false", values) + self.assertIn("reasoningEnabled: true", values) self.assertIn('ragSystemPromptPrefix: ""', block) def test_default_render_omits_answer_llm_and_disables_llm_answering(self) -> None: @@ -108,7 +108,7 @@ def test_answer_llm_opt_in_renders_default_super49b_nim_and_autowires_llm_config self.assertIn("NCCL_P2P_DISABLE", proc.stdout) self.assertIn(f'api_base: "http://{_ANSWER_LLM_SERVICE}:8000/v1"', proc.stdout) self.assertIn(f'model: "{_SUPER49B_MODEL}"', proc.stdout) - self.assertIn("reasoning_enabled: false", proc.stdout) + self.assertIn("reasoning_enabled: true", proc.stdout) self.assertIn("rag_system_prompt_prefix: null", proc.stdout) self.assertIn("enabled: true", proc.stdout) self.assertIn("NEMO_RETRIEVER_LLM_API_KEY", proc.stdout) @@ -195,8 +195,9 @@ def test_explicit_llm_api_base_wins_without_operator_nim(self) -> None: self.assertIn('api_base: "http://external-llm:8000/v1"', proc.stdout) self.assertIn('model: "openai/custom-answerer"', proc.stdout) self.assertIn("rag_system_prompt_prefix: null", proc.stdout) + self.assertIn("reasoning_enabled: true", proc.stdout) - def test_service_llm_reasoning_enabled_can_be_overridden(self) -> None: + def test_service_llm_reasoning_enabled_can_be_disabled(self) -> None: proc = _helm_template( extra_args=( "--set", @@ -204,12 +205,12 @@ def test_service_llm_reasoning_enabled_can_be_overridden(self) -> None: "--set", "serviceConfig.llm.apiBase=http://external-llm:8000/v1", "--set", - "serviceConfig.llm.reasoningEnabled=true", + "serviceConfig.llm.reasoningEnabled=false", ) ) _assert_helm_ok(self, proc) - self.assertIn("reasoning_enabled: true", proc.stdout) + self.assertIn("reasoning_enabled: false", proc.stdout) self.assertIn("rag_system_prompt_prefix: null", proc.stdout) def test_answer_llm_filters_reserved_ngc_api_key_from_custom_env(self) -> None: diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py index 726aefb834..dce779a7c6 100644 --- a/nemo_retriever/tests/test_service_answer_generation.py +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -19,6 +19,10 @@ from nemo_retriever.service.config import LLMConfig, LoggingConfig, PipelinePoolConfig, ServiceConfig, VectorDbConfig +def test_llm_config_defaults_to_reasoning_enabled_for_external_provider_safety() -> None: + assert LLMConfig().reasoning_enabled is True + + @pytest.fixture def app_with_answer_config(monkeypatch: pytest.MonkeyPatch, tmp_path): """Standalone service app with workers stubbed and LLM answering enabled.""" @@ -208,6 +212,47 @@ def _generate(query, chunks, *, reasoning_enabled=None): assert body["metadata"] is None +def test_answer_preserves_present_empty_text_hit_before_fallbacks( + app_with_answer_config: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _FakeResponse: + status_code = 200 + content = json.dumps({"results": [{"hits": [{"text": "", "content": "fallback content"}]}]}).encode() + + def json(self) -> dict[str, Any]: + return json.loads(self.content.decode()) + + class _FakeAsyncClient: + def __init__(self, *args, **kwargs) -> None: + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + async def post(self, url: str, **kwargs) -> _FakeResponse: + return _FakeResponse() + + monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) + seen: dict[str, Any] = {} + + def _generate(query, chunks, *, reasoning_enabled=None): + seen["chunks"] = chunks + return GenerationResult(answer="answer", latency_s=0.1, model="m") + + fake_llm = SimpleNamespace(generate=_generate) + + with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs", return_value=fake_llm): + resp = app_with_answer_config.post("/v1/answer", json={"query": "q", "include_chunks": True}) + + assert resp.status_code == 200, resp.text + assert seen["chunks"] == [""] + assert resp.json()["chunks"] == [""] + + def test_answer_returns_404_when_llm_disabled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: async def _stub_work(_item): return 0, [] From 6d5f26114a6e0244202e6f245afe96a1f31f4a4e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Jun 2026 15:45:50 +0000 Subject: [PATCH 13/14] Validate enabled answer LLM model --- nemo_retriever/src/nemo_retriever/service/config.py | 8 +++++++- nemo_retriever/tests/test_service_answer_generation.py | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/service/config.py b/nemo_retriever/src/nemo_retriever/service/config.py index 6a8ce6cf95..8ef1b368b2 100644 --- a/nemo_retriever/src/nemo_retriever/service/config.py +++ b/nemo_retriever/src/nemo_retriever/service/config.py @@ -11,7 +11,7 @@ from typing import Any, Literal import yaml -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, model_validator from nemo_retriever.service.models.base import RichModel @@ -97,6 +97,12 @@ class LLMConfig(RichModel): rag_system_prompt_prefix: str | None = None reasoning_enabled: bool = True + @model_validator(mode="after") + def _validate_enabled_model(self) -> "LLMConfig": + if self.enabled and not self.model.strip(): + raise ValueError("llm.model must be set when llm.enabled is true") + return self + class ResourceLimitsConfig(RichModel): model_config = ConfigDict(extra="forbid") diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py index dce779a7c6..edad18df7a 100644 --- a/nemo_retriever/tests/test_service_answer_generation.py +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -23,6 +23,15 @@ def test_llm_config_defaults_to_reasoning_enabled_for_external_provider_safety() assert LLMConfig().reasoning_enabled is True +def test_llm_config_allows_empty_model_when_disabled_for_helm_default() -> None: + assert LLMConfig(enabled=False, model="").model == "" + + +def test_llm_config_rejects_empty_model_when_enabled() -> None: + with pytest.raises(ValueError, match="llm.model must be set when llm.enabled is true"): + LLMConfig(enabled=True, model="", api_base="http://external-llm:8000/v1") + + @pytest.fixture def app_with_answer_config(monkeypatch: pytest.MonkeyPatch, tmp_path): """Standalone service app with workers stubbed and LLM answering enabled.""" From 3131a2e4992378f37635c091b417e06ab8129aa5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:17:41 +0000 Subject: [PATCH 14/14] Preserve VectorDB error content type --- .../nemo_retriever/service/routers/ingest.py | 6 +++- .../tests/test_service_answer_generation.py | 33 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index b0dc856058..7434852077 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -1421,7 +1421,11 @@ async def answer(req: AnswerRequest, request: Request) -> Response | AnswerRespo ) if resp.status_code != 200: - return Response(content=resp.content, status_code=resp.status_code, media_type="application/json") + return Response( + content=resp.content, + status_code=resp.status_code, + media_type=resp.headers.get("content-type", "application/json"), + ) payload = resp.json() result_sets = payload.get("results") or [] diff --git a/nemo_retriever/tests/test_service_answer_generation.py b/nemo_retriever/tests/test_service_answer_generation.py index edad18df7a..9b7f93dd87 100644 --- a/nemo_retriever/tests/test_service_answer_generation.py +++ b/nemo_retriever/tests/test_service_answer_generation.py @@ -155,6 +155,39 @@ async def post(self, url: str, **kwargs) -> _FakeResponse: ) +def test_answer_preserves_vectordb_error_content_type( + app_with_answer_config: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _FakeResponse: + status_code = 429 + content = b"rate limited" + headers = {"content-type": "text/plain; charset=utf-8"} + + class _FakeAsyncClient: + def __init__(self, *args, **kwargs) -> None: + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + async def post(self, url: str, **kwargs) -> _FakeResponse: + return _FakeResponse() + + monkeypatch.setattr("httpx.AsyncClient", _FakeAsyncClient) + + with patch("nemo_retriever.llm.clients.LiteLLMClient.from_kwargs") as from_kwargs: + resp = app_with_answer_config.post("/v1/answer", json={"query": "q"}) + + assert resp.status_code == 429 + assert resp.text == "rate limited" + assert resp.headers["content-type"] == "text/plain; charset=utf-8" + from_kwargs.assert_not_called() + + def test_answer_response_fields_respect_chunk_and_metadata_flags( app_with_answer_config: TestClient, monkeypatch: pytest.MonkeyPatch,