diff --git a/nemo_retriever/helm/README.md b/nemo_retriever/helm/README.md index 121a41fdd5..517f4fdbc0 100644 --- a/nemo_retriever/helm/README.md +++ b/nemo_retriever/helm/README.md @@ -937,6 +937,91 @@ sanity check before opening Grafana. --- +## Tracing and Zipkin + +Helm installs the chart-owned OpenTelemetry Collector and Zipkin backend by +default. This is intentional: the legacy 26.1.2 Helm chart shipped with a +managed Zipkin deployment enabled, so the new chart keeps a default trace +backend available for functional parity. Pod trace export is still opt-in: + +```yaml +topology: + otel: + enabled: true + zipkin: + enabled: true + +service: + otel: + enabled: false + +nimOperator: + otel: + enabled: false +``` + +Because Zipkin is chart-owned by default, an upgrade with default values can +create a Zipkin Deployment and Service. Set `topology.zipkin.enabled=false` +before upgrading if your deployment uses an external backend or should not run +chart-owned Zipkin. + +Set `service.otel.enabled=true` and `nimOperator.otel.enabled=true` to have +retriever service pods and chart-managed NIMs emit OTLP to the chart's +OpenTelemetry Collector. Once pod trace export is enabled, the collector exports +traces to the chart-owned Zipkin service. 
Open a job and read the Zipkin lookup +key from either the JSON body or the `x-trace-id` response header: + +```bash +kubectl port-forward svc/tracing-smoke-nemo-retriever 7670:80 + +curl -s -D headers.txt -o job.json \ + -X POST http://localhost:7670/v1/ingest/job \ + -H 'content-type: application/json' \ + -d '{"expected_documents":1}' + +TRACE_ID=$(jq -r .trace_id job.json) +grep -i x-trace-id headers.txt +``` + +Port-forward Zipkin and query the trace directly: + +```bash +kubectl port-forward svc/tracing-smoke-nemo-retriever-zipkin 9411:9411 +curl "http://localhost:9411/api/v2/trace/${TRACE_ID}" +``` + +Common opt-out and override knobs: + +```yaml +topology: + zipkin: + enabled: false # do not deploy chart-owned Zipkin + exporter: + enabled: false # keep Zipkin deployed, but do not export traces to it + endpoint: http://external-zipkin:9411/api/v2/spans + +service: + otel: + enabled: true # inject service pod instrumentation env + +nimOperator: + otel: + enabled: true # inject inherited NIM OTLP env + page_elements: + otel: + enabled: false # per-NIM opt-out + ocr: + otel: + env: + TRITON_OTEL_RATE: "10" # per-NIM Triton OTel override +``` + +Set `topology.zipkin.exporter.endpoint` when you run your own Zipkin-compatible +collector. Set `topology.otel.enabled=false` to disable the chart-owned collector +and all chart-rendered collector wiring. + +--- + ## OpenShift deployment { #openshift-deployment } The chart defaults target generic Kubernetes clusters that allow fixed numeric diff --git a/nemo_retriever/helm/templates/_helpers.tpl b/nemo_retriever/helm/templates/_helpers.tpl index 90098d5594..df972ab2c5 100644 --- a/nemo_retriever/helm/templates/_helpers.tpl +++ b/nemo_retriever/helm/templates/_helpers.tpl @@ -200,6 +200,285 @@ nemo-retriever.role.configMapName {{- printf "%s-config" (include "nemo-retriever.role.fullname" .) 
-}} {{- end -}} + +{{/* +============================================================================= +Tracing helpers +============================================================================= +*/}} + +{{- define "nemo-retriever.suffixedFullname" -}} +{{- $base := include "nemo-retriever.fullname" .context -}} +{{- $suffix := .suffix -}} +{{- $maxBaseLen := int (sub 63 (len $suffix)) -}} +{{- printf "%s%s" ($base | trunc $maxBaseLen | trimSuffix "-") $suffix | trimSuffix "-" -}} +{{- end -}} + +{{- define "nemo-retriever.zipkin.fullname" -}} +{{- include "nemo-retriever.suffixedFullname" (dict "context" . "suffix" "-zipkin") -}} +{{- end -}} + +{{- define "nemo-retriever.otel.fullname" -}} +{{- include "nemo-retriever.suffixedFullname" (dict "context" . "suffix" "-otel") -}} +{{- end -}} + +{{- define "nemo-retriever.otel.configMapName" -}} +{{- include "nemo-retriever.suffixedFullname" (dict "context" . "suffix" "-otel-config") -}} +{{- end -}} + +{{- define "nemo-retriever.otel.ports" -}} +{{- $otel := .Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- $ports := get $otel "ports" -}} +{{- if not (kindIs "map" $ports) -}} +{{- fail "topology.otel.ports must be a map when topology.otel.enabled=true" -}} +{{- end -}} +{{- range $portName := list "otlpGrpc" "otlpHttp" "prometheus" -}} +{{- if not (get $ports $portName) -}} +{{- fail (printf "topology.otel.ports.%s is required when topology.otel.enabled=true" $portName) -}} +{{- end -}} +{{- end -}} +{{- toYaml $ports -}} +{{- end -}} + +{{- define "nemo-retriever.zipkin.image" -}} +{{- $zipkin := .Values.topology.zipkin | default dict -}} +{{- if not (kindIs "map" $zipkin) -}} +{{- fail "topology.zipkin must be a map" -}} +{{- end -}} +{{- $image := get $zipkin "image" -}} +{{- if not (kindIs "map" $image) -}} +{{- fail "topology.zipkin.image must be a map when topology.zipkin.enabled=true" -}} +{{- end -}} +{{- range 
$fieldName := list "repository" "tag" "pullPolicy" -}} +{{- if not (get $image $fieldName) -}} +{{- fail (printf "topology.zipkin.image.%s is required when topology.zipkin.enabled=true" $fieldName) -}} +{{- end -}} +{{- end -}} +{{- toYaml $image -}} +{{- end -}} + +{{- define "nemo-retriever.zipkin.port" -}} +{{- $zipkin := .Values.topology.zipkin | default dict -}} +{{- if not (kindIs "map" $zipkin) -}} +{{- fail "topology.zipkin must be a map" -}} +{{- end -}} +{{- $port := get $zipkin "port" -}} +{{- if not $port -}} +{{- fail "topology.zipkin.port is required when topology.zipkin.enabled=true" -}} +{{- end -}} +{{- $port -}} +{{- end -}} + + +{{- define "nemo-retriever.zipkin.endpoint" -}} +{{- $zipkin := .Values.topology.zipkin | default dict -}} +{{- if not (kindIs "map" $zipkin) -}} +{{- fail "topology.zipkin must be a map" -}} +{{- end -}} +{{- $exporter := get $zipkin "exporter" | default dict -}} +{{- if not (kindIs "map" $exporter) -}} +{{- fail "topology.zipkin.exporter must be a map" -}} +{{- end -}} +{{- $endpoint := get $exporter "endpoint" -}} +{{- if $endpoint -}} +{{- tpl $endpoint . -}} +{{- else -}} +{{- printf "http://%s:%v/api/v2/spans" (include "nemo-retriever.zipkin.fullname" .) (include "nemo-retriever.zipkin.port" .) 
-}} +{{- end -}} +{{- end -}} + +{{- define "nemo-retriever.otel.config" -}} +{{- $otel := .Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- $configValue := get $otel "config" -}} +{{- if not (kindIs "map" $configValue) -}} +{{- fail "topology.otel.config must be a map when topology.otel.enabled=true" -}} +{{- end -}} +{{- $config := deepCopy $configValue -}} +{{- $zipkin := .Values.topology.zipkin | default dict -}} +{{- if not (kindIs "map" $zipkin) -}} +{{- fail "topology.zipkin must be a map" -}} +{{- end -}} +{{- $zipkinExporterConfig := get $zipkin "exporter" | default dict -}} +{{- if not (kindIs "map" $zipkinExporterConfig) -}} +{{- fail "topology.zipkin.exporter must be a map" -}} +{{- end -}} +{{- $zipkinInjectionEnabled := and (get $zipkinExporterConfig "enabled") (or (get $zipkin "enabled") (get $zipkinExporterConfig "endpoint")) -}} +{{- if $zipkinInjectionEnabled -}} +{{- $service := get $config "service" | default dict -}} +{{- $pipelines := get $service "pipelines" | default dict -}} +{{- $traces := get $pipelines "traces" -}} +{{- if not $traces -}} +{{- fail "topology.zipkin.exporter.enabled requires topology.otel.config.service.pipelines.traces with non-empty receivers; provide that traces pipeline or set topology.zipkin.exporter.enabled=false" -}} +{{- end -}} +{{- $traceReceivers := get $traces "receivers" -}} +{{- if not $traceReceivers -}} +{{- fail "topology.zipkin.exporter.enabled requires topology.otel.config.service.pipelines.traces with non-empty receivers; provide that traces pipeline or set topology.zipkin.exporter.enabled=false" -}} +{{- end -}} +{{- $receivers := get $config "receivers" | default dict -}} +{{- range $receiverName := $traceReceivers -}} +{{- if or (not (hasKey $receivers $receiverName)) (eq (get $receivers $receiverName) nil) -}} +{{- fail (printf "topology.otel.config.service.pipelines.traces trace receiver %q is missing or null 
in topology.otel.config.receivers; fix topology.otel.config or set topology.zipkin.exporter.enabled=false" $receiverName) -}} +{{- end -}} +{{- end -}} +{{- $processors := get $config "processors" | default dict -}} +{{- $traceProcessors := get $traces "processors" | default list -}} +{{- range $processorName := $traceProcessors -}} +{{- if or (not (hasKey $processors $processorName)) (eq (get $processors $processorName) nil) -}} +{{- fail (printf "topology.otel.config.service.pipelines.traces trace processor %q is missing or null in topology.otel.config.processors; fix topology.otel.config or set topology.zipkin.exporter.enabled=false" $processorName) -}} +{{- end -}} +{{- end -}} +{{- $exporters := get $config "exporters" | default dict -}} +{{- $zipkinExporter := get $exporters "zipkin" | default dict -}} +{{- if not (kindIs "map" $zipkinExporter) -}} +{{- fail "topology.otel.config.exporters.zipkin must be a map; fix topology.otel.config or set topology.zipkin.exporter.enabled=false" -}} +{{- end -}} +{{- $zipkinExporter = mergeOverwrite (deepCopy $zipkinExporter) (dict "endpoint" (include "nemo-retriever.zipkin.endpoint" .)) -}} +{{- $_ := set $exporters "zipkin" $zipkinExporter -}} +{{- $_ := set $config "exporters" $exporters -}} +{{- $traceExporters := get $traces "exporters" | default list -}} +{{- if not (has "zipkin" $traceExporters) -}} +{{- $traceExporters = append $traceExporters "zipkin" -}} +{{- end -}} +{{- range $exporterName := $traceExporters -}} +{{- if or (not (hasKey $exporters $exporterName)) (eq (get $exporters $exporterName) nil) -}} +{{- fail (printf "topology.otel.config.service.pipelines.traces trace exporter %q is missing or null in topology.otel.config.exporters; fix topology.otel.config or set topology.zipkin.exporter.enabled=false" $exporterName) -}} +{{- end -}} +{{- end -}} +{{- $_ := set $traces "exporters" $traceExporters -}} +{{- $_ := set $pipelines "traces" $traces -}} +{{- $_ := set $service "pipelines" $pipelines -}} +{{- 
$_ := set $config "service" $service -}} +{{- end -}} +{{- toYaml $config -}} +{{- end -}} + +{{- define "nemo-retriever.serviceOtelEnv" -}} +{{- $root := .context -}} +{{- $role := .role -}} +{{- $serviceOtel := $root.Values.service.otel | default dict -}} +{{- $topologyOtel := $root.Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $serviceOtel) -}} +{{- fail "service.otel must be a map" -}} +{{- end -}} +{{- if not (kindIs "map" $topologyOtel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if and (get $topologyOtel "enabled") (get $serviceOtel "enabled") -}} +{{- $serviceOtelEnv := get $serviceOtel "env" | default dict -}} +{{- if not (kindIs "map" $serviceOtelEnv) -}} +{{- fail "service.otel.env must be a map" -}} +{{- end -}} +{{- $userEnvNames := dict -}} +{{- range $env := $root.Values.service.env -}} +{{- if and $env (hasKey $env "name") -}} +{{- $_ := set $userEnvNames $env.name true -}} +{{- end -}} +{{- end -}} +{{- $otelPorts := include "nemo-retriever.otel.ports" $root | fromYaml -}} +{{- $defaults := dict + "OTEL_EXPORTER_OTLP_ENDPOINT" (printf "http://%s:%v" (include "nemo-retriever.otel.fullname" $root) (get $otelPorts "otlpGrpc")) + "OTEL_SERVICE_NAME" (default "nemo-retriever-service" (get $serviceOtel "serviceName")) + "OTEL_TRACES_EXPORTER" "otlp" + "OTEL_METRICS_EXPORTER" "otlp" + "OTEL_LOGS_EXPORTER" "none" + "OTEL_PROPAGATORS" "tracecontext,baggage" + "OTEL_RESOURCE_ATTRIBUTES" (printf "service.namespace=nemo-retriever,service.role=%s" $role) + "OTEL_PYTHON_EXCLUDED_URLS" "health" +-}} +{{- $otelEnv := mergeOverwrite (deepCopy $defaults) (deepCopy $serviceOtelEnv) -}} +{{- range $name, $value := $otelEnv }} +{{- if not (hasKey $userEnvNames $name) }} +- name: {{ $name }} + value: {{ $value | quote }} +{{- end }} +{{- end }} +{{- end -}} +{{- end -}} + +{{- define "nemo-retriever.nimServiceOtelEnv" -}} +{{- $root := .context -}} +{{- $key := .key -}} +{{- $name := .name -}} +{{- $nim := index 
$root.Values.nimOperator $key -}} +{{- $chartOtel := $root.Values.nimOperator.otel | default dict -}} +{{- $nimOtel := $nim.otel | default dict -}} +{{- $topologyOtel := $root.Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $chartOtel) -}} +{{- fail "nimOperator.otel must be a map" -}} +{{- end -}} +{{- if not (kindIs "map" $topologyOtel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if not (kindIs "map" $nimOtel) -}} +{{- fail (printf "nimOperator.%s.otel must be a map" $key) -}} +{{- end -}} +{{- $enabled := get $chartOtel "enabled" -}} +{{- if and (hasKey $nimOtel "enabled") (not (eq (get $nimOtel "enabled") nil)) -}} +{{- $enabled = get $nimOtel "enabled" -}} +{{- end -}} +{{- if and (get $topologyOtel "enabled") $enabled -}} +{{- $chartNimOtelEnv := get $chartOtel "env" | default dict -}} +{{- $nimOtelEnv := get $nimOtel "env" | default dict -}} +{{- if not (kindIs "map" $chartNimOtelEnv) -}} +{{- fail "nimOperator.otel.env must be a map" -}} +{{- end -}} +{{- if not (kindIs "map" $nimOtelEnv) -}} +{{- fail (printf "nimOperator.%s.otel.env must be a map" $key) -}} +{{- end -}} +{{- $existingEnvNames := dict -}} +{{- $existingEnvValues := dict -}} +{{- range $env := $nim.env -}} +{{- if and $env (hasKey $env "name") -}} +{{- $_ := set $existingEnvNames $env.name true -}} +{{- if hasKey $env "value" -}} +{{- $_ := set $existingEnvValues $env.name $env.value -}} +{{- end -}} +{{- end -}} +{{- end -}} +{{- $otelPorts := include "nemo-retriever.otel.ports" $root | fromYaml -}} +{{- $defaultEndpoint := printf "http://%s:%v" (include "nemo-retriever.otel.fullname" $root) (get $otelPorts "otlpHttp") -}} +{{- $chartEndpoint := default $defaultEndpoint (get $chartOtel "endpoint") -}} +{{- $endpoint := default $chartEndpoint (get $nimOtel "endpoint") -}} +{{- $tritonPath := default "/v1/traces" (get $chartOtel "tritonPath") -}} +{{- $serviceName := default $name (get $nimOtel "serviceName") -}} +{{- $defaults := dict + 
"NIM_ENABLE_OTEL" "true" + "NIM_OTEL_SERVICE_NAME" $serviceName + "NIM_OTEL_TRACES_EXPORTER" "otlp" + "NIM_OTEL_METRICS_EXPORTER" "console" + "NIM_OTEL_EXPORTER_OTLP_ENDPOINT" $endpoint + "TRITON_OTEL_URL" (printf "%s%s" $endpoint $tritonPath) + "TRITON_OTEL_RATE" "1" +-}} +{{- $explicitTritonUrl := or (hasKey $existingEnvNames "TRITON_OTEL_URL") (hasKey $chartNimOtelEnv "TRITON_OTEL_URL") (hasKey $nimOtelEnv "TRITON_OTEL_URL") -}} +{{- $existingEndpointWithoutLiteral := and (hasKey $existingEnvNames "NIM_OTEL_EXPORTER_OTLP_ENDPOINT") (not (hasKey $existingEnvValues "NIM_OTEL_EXPORTER_OTLP_ENDPOINT")) -}} +{{- $otelEnv := mergeOverwrite (deepCopy $defaults) (deepCopy $chartNimOtelEnv) (deepCopy $nimOtelEnv) -}} +{{- $finalEndpoint := get $otelEnv "NIM_OTEL_EXPORTER_OTLP_ENDPOINT" -}} +{{- if hasKey $existingEnvValues "NIM_OTEL_EXPORTER_OTLP_ENDPOINT" -}} +{{- $finalEndpoint = get $existingEnvValues "NIM_OTEL_EXPORTER_OTLP_ENDPOINT" -}} +{{- end -}} +{{- if not $explicitTritonUrl -}} +{{- if $existingEndpointWithoutLiteral -}} +{{- $_ := unset $otelEnv "TRITON_OTEL_URL" -}} +{{- else -}} +{{- $_ := set $otelEnv "TRITON_OTEL_URL" (printf "%s%s" $finalEndpoint $tritonPath) -}} +{{- end -}} +{{- end -}} +{{- range $envName, $envValue := $otelEnv }} +{{- if not (hasKey $existingEnvNames $envName) }} +- name: {{ $envName }} + value: {{ $envValue | quote }} +{{- end }} +{{- end }} +{{- end -}} +{{- end -}} + {{/* ============================================================================= NIMService GPU resources diff --git a/nemo_retriever/helm/templates/configmap-otel.yaml b/nemo_retriever/helm/templates/configmap-otel.yaml index 11a07dd00d..eafc7b0183 100644 --- a/nemo_retriever/helm/templates/configmap-otel.yaml +++ b/nemo_retriever/helm/templates/configmap-otel.yaml @@ -1,13 +1,16 @@ -{{- if .Values.topology.otel.enabled }} -{{- $fullname := include "nemo-retriever.fullname" . 
-}} -{{- $otelName := printf "%s-otel" $fullname -}} +{{- $otel := .Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if get $otel "enabled" }} +{{- $otelConfigName := include "nemo-retriever.otel.configMapName" . -}} apiVersion: v1 kind: ConfigMap metadata: - name: {{ $otelName }}-config + name: {{ $otelConfigName }} labels: {{- include "nemo-retriever.role.labels" (dict "context" $ "role" "otel") | nindent 4 }} data: config.yaml: | - {{- toYaml .Values.topology.otel.config | nindent 4 }} + {{- include "nemo-retriever.otel.config" . | nindent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/deployment-otel.yaml b/nemo_retriever/helm/templates/deployment-otel.yaml index f2c908fdd3..8074b7884c 100644 --- a/nemo_retriever/helm/templates/deployment-otel.yaml +++ b/nemo_retriever/helm/templates/deployment-otel.yaml @@ -1,7 +1,11 @@ -{{- if .Values.topology.otel.enabled }} -{{- $fullname := include "nemo-retriever.fullname" . -}} -{{- $otelName := printf "%s-otel" $fullname -}} -{{- $otel := .Values.topology.otel -}} +{{- $otel := .Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if get $otel "enabled" }} +{{- $otelName := include "nemo-retriever.otel.fullname" . -}} +{{- $otelConfigName := include "nemo-retriever.otel.configMapName" . -}} +{{- $otelPorts := include "nemo-retriever.otel.ports" . | fromYaml -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -20,6 +24,7 @@ spec: annotations: checksum/otel-config: {{ include (print $.Template.BasePath "/configmap-otel.yaml") . | sha256sum }} spec: + {{- include "nemo-retriever.imagePullSecrets" . | nindent 6 }} {{- with $otel.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} @@ -40,13 +45,13 @@ spec: - --config=/etc/otelcol/config.yaml ports: - name: otlp-grpc - containerPort: {{ $otel.ports.otlpGrpc }} + containerPort: {{ get $otelPorts "otlpGrpc" }} protocol: TCP - name: otlp-http - containerPort: {{ $otel.ports.otlpHttp }} + containerPort: {{ get $otelPorts "otlpHttp" }} protocol: TCP - name: prometheus - containerPort: {{ $otel.ports.prometheus }} + containerPort: {{ get $otelPorts "prometheus" }} protocol: TCP resources: {{- toYaml $otel.resources | nindent 12 }} @@ -73,5 +78,5 @@ spec: volumes: - name: otel-config configMap: - name: {{ $otelName }}-config + name: {{ $otelConfigName }} {{- end }} diff --git a/nemo_retriever/helm/templates/deployment-zipkin.yaml b/nemo_retriever/helm/templates/deployment-zipkin.yaml new file mode 100644 index 0000000000..96ba38e000 --- /dev/null +++ b/nemo_retriever/helm/templates/deployment-zipkin.yaml @@ -0,0 +1,82 @@ +{{- $otel := .Values.topology.otel | default dict -}} +{{- $zipkin := .Values.topology.zipkin | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if not (kindIs "map" $zipkin) -}} +{{- fail "topology.zipkin must be a map" -}} +{{- end -}} +{{- if and (get $otel "enabled") (get $zipkin "enabled") }} +{{- $startupProbe := $zipkin.startupProbe | default dict -}} +{{- $livenessProbe := $zipkin.livenessProbe | default dict -}} +{{- $readinessProbe := $zipkin.readinessProbe | default dict -}} +{{- $resources := $zipkin.resources | default dict -}} +{{- $zipkinImage := include "nemo-retriever.zipkin.image" . | fromYaml -}} +{{- $zipkinPort := include "nemo-retriever.zipkin.port" . -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "nemo-retriever.zipkin.fullname" . 
}} + labels: + {{- include "nemo-retriever.role.labels" (dict "context" $ "role" "zipkin") | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "nemo-retriever.role.selectorLabels" (dict "context" $ "role" "zipkin") | nindent 6 }} + template: + metadata: + labels: + {{- include "nemo-retriever.role.selectorLabels" (dict "context" $ "role" "zipkin") | nindent 8 }} + spec: + {{- include "nemo-retriever.imagePullSecrets" . | nindent 6 }} + {{- with $zipkin.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $zipkin.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $zipkin.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $zipkin.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: zipkin + image: "{{ get $zipkinImage "repository" }}:{{ get $zipkinImage "tag" }}" + imagePullPolicy: {{ get $zipkinImage "pullPolicy" }} + {{- with $zipkin.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with $zipkin.javaOpts }} + env: + - name: JAVA_OPTS + value: {{ . | quote }} + {{- end }} + ports: + - name: http + containerPort: {{ $zipkinPort }} + protocol: TCP + {{- if get $startupProbe "enabled" }} + startupProbe: + {{- omit $startupProbe "enabled" | toYaml | nindent 12 }} + {{- end }} + {{- if get $livenessProbe "enabled" }} + livenessProbe: + {{- omit $livenessProbe "enabled" | toYaml | nindent 12 }} + {{- end }} + {{- if get $readinessProbe "enabled" }} + readinessProbe: + {{- omit $readinessProbe "enabled" | toYaml | nindent 12 }} + {{- end }} + {{- with $resources }} + resources: + {{- toYaml . 
| nindent 12 }} + {{- end }} +{{- end }} diff --git a/nemo_retriever/helm/templates/deployment.yaml b/nemo_retriever/helm/templates/deployment.yaml index 508ad5eed2..4dd69c9ece 100644 --- a/nemo_retriever/helm/templates/deployment.yaml +++ b/nemo_retriever/helm/templates/deployment.yaml @@ -101,6 +101,7 @@ spec: - name: INSTALL_FFMPEG value: "true" {{- end }} + {{ include "nemo-retriever.serviceOtelEnv" (dict "context" $ "role" "standalone") | nindent 12 }} {{- with $svc.env }} {{- toYaml . | nindent 12 }} {{- end }} @@ -265,6 +266,7 @@ spec: - name: INSTALL_FFMPEG value: "true" {{- end }} + {{ include "nemo-retriever.serviceOtelEnv" (dict "context" $ "role" $role) | nindent 12 }} {{- with $svc.env }} {{- toYaml . | nindent 12 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/audio.yaml b/nemo_retriever/helm/templates/nims/audio.yaml index e179ad1b28..25a0d7dcf6 100644 --- a/nemo_retriever/helm/templates/nims/audio.yaml +++ b/nemo_retriever/helm/templates/nims/audio.yaml @@ -42,5 +42,6 @@ spec: expose: {{ toYaml .Values.nimOperator.audio.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "audio" "name" "audio") | nindent 4 }} {{ toYaml .Values.nimOperator.audio.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/llama-nemotron-embed-vl-1b-v2.yaml b/nemo_retriever/helm/templates/nims/llama-nemotron-embed-vl-1b-v2.yaml index 8a20b78eaa..88133b5c03 100644 --- a/nemo_retriever/helm/templates/nims/llama-nemotron-embed-vl-1b-v2.yaml +++ b/nemo_retriever/helm/templates/nims/llama-nemotron-embed-vl-1b-v2.yaml @@ -43,6 +43,7 @@ spec: expose: {{ toYaml .Values.nimOperator.vlm_embed.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "vlm_embed" "name" $name) | nindent 4 }} {{ toYaml .Values.nimOperator.vlm_embed.env | indent 4 }} - name: NGC_API_KEY valueFrom: diff --git a/nemo_retriever/helm/templates/nims/llama-nemotron-rerank-vl-1b-v2.yaml 
b/nemo_retriever/helm/templates/nims/llama-nemotron-rerank-vl-1b-v2.yaml index 26022d1b9b..68847a25a5 100644 --- a/nemo_retriever/helm/templates/nims/llama-nemotron-rerank-vl-1b-v2.yaml +++ b/nemo_retriever/helm/templates/nims/llama-nemotron-rerank-vl-1b-v2.yaml @@ -42,5 +42,6 @@ spec: expose: {{ toYaml .Values.nimOperator.rerankqa.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "rerankqa" "name" "llama-nemotron-rerank-vl-1b-v2") | nindent 4 }} {{ toYaml .Values.nimOperator.rerankqa.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/nemotron-3-nano-omni-30b-a3b-reasoning.yaml b/nemo_retriever/helm/templates/nims/nemotron-3-nano-omni-30b-a3b-reasoning.yaml index 7f39389d69..dbade9a2ba 100644 --- a/nemo_retriever/helm/templates/nims/nemotron-3-nano-omni-30b-a3b-reasoning.yaml +++ b/nemo_retriever/helm/templates/nims/nemotron-3-nano-omni-30b-a3b-reasoning.yaml @@ -42,5 +42,6 @@ spec: expose: {{ toYaml .Values.nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "nemotron_3_nano_omni_30b_a3b_reasoning" "name" "nemotron-3-nano-omni-30b-a3b-reasoning") | nindent 4 }} {{ toYaml .Values.nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/nemotron-ocr-v1.yaml b/nemo_retriever/helm/templates/nims/nemotron-ocr-v1.yaml index ef3e6726e2..2ec364414a 100644 --- a/nemo_retriever/helm/templates/nims/nemotron-ocr-v1.yaml +++ b/nemo_retriever/helm/templates/nims/nemotron-ocr-v1.yaml @@ -43,5 +43,6 @@ spec: expose: {{ toYaml .Values.nimOperator.ocr.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "ocr" "name" $name) | nindent 4 }} {{ toYaml .Values.nimOperator.ocr.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/nemotron-page-elements-v3.yaml 
b/nemo_retriever/helm/templates/nims/nemotron-page-elements-v3.yaml index dfe66b2e00..7cc0945d71 100644 --- a/nemo_retriever/helm/templates/nims/nemotron-page-elements-v3.yaml +++ b/nemo_retriever/helm/templates/nims/nemotron-page-elements-v3.yaml @@ -44,5 +44,6 @@ spec: expose: {{ toYaml .Values.nimOperator.page_elements.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "page_elements" "name" "nemotron-page-elements-v3") | nindent 4 }} {{ toYaml .Values.nimOperator.page_elements.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/nemotron-parse.yaml b/nemo_retriever/helm/templates/nims/nemotron-parse.yaml index 31033f1af0..2da43a7e3c 100644 --- a/nemo_retriever/helm/templates/nims/nemotron-parse.yaml +++ b/nemo_retriever/helm/templates/nims/nemotron-parse.yaml @@ -42,5 +42,6 @@ spec: expose: {{ toYaml .Values.nimOperator.nemotron_parse.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "nemotron_parse" "name" "nemotron-parse") | nindent 4 }} {{ toYaml .Values.nimOperator.nemotron_parse.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/nims/nemotron-table-structure-v1.yaml b/nemo_retriever/helm/templates/nims/nemotron-table-structure-v1.yaml index 9f75fbead6..e9b5da2c46 100644 --- a/nemo_retriever/helm/templates/nims/nemotron-table-structure-v1.yaml +++ b/nemo_retriever/helm/templates/nims/nemotron-table-structure-v1.yaml @@ -42,5 +42,6 @@ spec: expose: {{ toYaml .Values.nimOperator.table_structure.expose | indent 4 }} env: +{{ include "nemo-retriever.nimServiceOtelEnv" (dict "context" $ "key" "table_structure" "name" "nemotron-table-structure-v1") | nindent 4 }} {{ toYaml .Values.nimOperator.table_structure.env | indent 4 }} {{- end }} diff --git a/nemo_retriever/helm/templates/service-otel.yaml b/nemo_retriever/helm/templates/service-otel.yaml index 83335998c2..1b55ba251e 100644 --- 
a/nemo_retriever/helm/templates/service-otel.yaml +++ b/nemo_retriever/helm/templates/service-otel.yaml @@ -1,10 +1,14 @@ -{{- if .Values.topology.otel.enabled }} -{{- $fullname := include "nemo-retriever.fullname" . -}} -{{- $otel := .Values.topology.otel -}} +{{- $otel := .Values.topology.otel | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if get $otel "enabled" }} +{{- $otelName := include "nemo-retriever.otel.fullname" . -}} +{{- $otelPorts := include "nemo-retriever.otel.ports" . | fromYaml -}} apiVersion: v1 kind: Service metadata: - name: {{ $fullname }}-otel + name: {{ $otelName }} labels: {{- include "nemo-retriever.role.labels" (dict "context" $ "role" "otel") | nindent 4 }} spec: @@ -14,14 +18,14 @@ spec: ports: - name: otlp-grpc protocol: TCP - port: {{ $otel.ports.otlpGrpc }} + port: {{ get $otelPorts "otlpGrpc" }} targetPort: otlp-grpc - name: otlp-http protocol: TCP - port: {{ $otel.ports.otlpHttp }} + port: {{ get $otelPorts "otlpHttp" }} targetPort: otlp-http - name: prometheus protocol: TCP - port: {{ $otel.ports.prometheus }} + port: {{ get $otelPorts "prometheus" }} targetPort: prometheus {{- end }} diff --git a/nemo_retriever/helm/templates/service-zipkin.yaml b/nemo_retriever/helm/templates/service-zipkin.yaml new file mode 100644 index 0000000000..a1439add67 --- /dev/null +++ b/nemo_retriever/helm/templates/service-zipkin.yaml @@ -0,0 +1,26 @@ +{{- $otel := .Values.topology.otel | default dict -}} +{{- $zipkin := .Values.topology.zipkin | default dict -}} +{{- if not (kindIs "map" $otel) -}} +{{- fail "topology.otel must be a map" -}} +{{- end -}} +{{- if not (kindIs "map" $zipkin) -}} +{{- fail "topology.zipkin must be a map" -}} +{{- end -}} +{{- if and (get $otel "enabled") (get $zipkin "enabled") }} +{{- $zipkinPort := include "nemo-retriever.zipkin.port" . -}} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "nemo-retriever.zipkin.fullname" . 
}} + labels: + {{- include "nemo-retriever.role.labels" (dict "context" $ "role" "zipkin") | nindent 4 }} +spec: + type: ClusterIP + selector: + {{- include "nemo-retriever.role.selectorLabels" (dict "context" $ "role" "zipkin") | nindent 4 }} + ports: + - name: http + protocol: TCP + port: {{ $zipkinPort }} + targetPort: http +{{- end }} diff --git a/nemo_retriever/helm/values.yaml b/nemo_retriever/helm/values.yaml index 2486feb49c..b3f41f1dc5 100644 --- a/nemo_retriever/helm/values.yaml +++ b/nemo_retriever/helm/values.yaml @@ -178,6 +178,12 @@ service: # Do not also set INSTALL_FFMPEG manually in service.env. installFfmpeg: true + # OpenTelemetry env defaults for the retriever service container. + otel: + enabled: false + serviceName: nemo-retriever-service + env: {} + # Extra env vars (after the chart-managed ones). Use `envFrom` to pull # whole Secrets/ConfigMaps in. env: [] @@ -443,6 +449,59 @@ topology: processors: [batch] exporters: [debug] + # --------------------------------------------------------------------------- + # Zipkin + # --------------------------------------------------------------------------- + # Enabled by default to preserve managed trace-backend parity with the legacy + # 26.1.2 Helm chart. Set false to avoid deploying chart-owned Zipkin. 
+ zipkin: + enabled: true + image: + repository: openzipkin/zipkin + tag: "3.5.0" + pullPolicy: IfNotPresent + port: 9411 + javaOpts: "-Xms128m -Xmx512m -XX:+ExitOnOutOfMemoryError" + livenessProbe: + enabled: true + tcpSocket: + port: http + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 6 + readinessProbe: + enabled: true + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + startupProbe: + enabled: true + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 60 + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1" + memory: "1Gi" + nodeSelector: {} + tolerations: [] + affinity: {} + podSecurityContext: {} + securityContext: {} + exporter: + enabled: true + endpoint: "" + # ============================================================================= # retriever-service.yaml configuration # ============================================================================= @@ -786,6 +845,17 @@ nimOperator: # ``resources`` overrides the entire block when non-empty. nimServiceGpuLimit: 1 + # OpenTelemetry defaults applied to NIMService env blocks. 
+ otel: + enabled: false + endpoint: "" + tritonPath: "/v1/traces" + env: + NIM_ENABLE_OTEL: "true" + NIM_OTEL_TRACES_EXPORTER: "otlp" + NIM_OTEL_METRICS_EXPORTER: "console" + TRITON_OTEL_RATE: "1" + # --------------------------------------------------------------------------- # Per-NIM defaults # --------------------------------------------------------------------------- @@ -853,6 +923,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_HTTP_API_PORT value: "8000" @@ -897,6 +972,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_HTTP_API_PORT value: "8000" @@ -943,6 +1023,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: OMP_NUM_THREADS value: "8" @@ -986,6 +1071,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_HTTP_API_PORT value: "8000" @@ -1049,6 +1139,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_HTTP_API_PORT value: "8000" @@ -1101,6 +1196,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_HTTP_API_PORT value: "8000" @@ -1159,6 +1259,11 @@ nimOperator: type: ClusterIP port: 8000 grpcPort: 8001 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_HTTP_API_PORT value: "8000" @@ -1195,6 +1300,11 @@ nimOperator: type: ClusterIP port: 9000 grpcPort: 50051 + otel: + enabled: null + serviceName: "" + endpoint: "" + env: {} env: - name: NIM_TAGS_SELECTOR value: "name=parakeet-1-1b-ctc-en-us,mode=ofl,vad=default,diarizer=disabled" diff --git a/nemo_retriever/pyproject.toml 
b/nemo_retriever/pyproject.toml index 577974ffcf..a9982d8da2 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -47,6 +47,9 @@ dependencies = [ "uvicorn[standard]>=0.30.0", "python-multipart>=0.0.9", "prometheus-fastapi-instrumentator>=7.0,<8", + "opentelemetry-api>=1.41.1", + "opentelemetry-sdk>=1.41.1", + "opentelemetry-exporter-otlp-proto-grpc>=1.41.1", # MCP server for AI agent integration "fastmcp>=2.0.0", # HTTP clients diff --git a/nemo_retriever/src/nemo_retriever/api/internal/primitives/nim/nim_client.py b/nemo_retriever/src/nemo_retriever/api/internal/primitives/nim/nim_client.py index 56e51c1465..750eb69dc3 100644 --- a/nemo_retriever/src/nemo_retriever/api/internal/primitives/nim/nim_client.py +++ b/nemo_retriever/src/nemo_retriever/api/internal/primitives/nim/nim_client.py @@ -5,6 +5,7 @@ from __future__ import annotations import hashlib +import inspect import json import logging import re @@ -12,6 +13,7 @@ import time import queue from collections import namedtuple +from contextlib import nullcontext from concurrent.futures import Future, ThreadPoolExecutor, as_completed from typing import Any from typing import Optional @@ -27,6 +29,88 @@ logger = logging.getLogger(__name__) +def _service_tracing() -> Any | None: + try: + from nemo_retriever.service import tracing + except Exception: + logger.debug("Service tracing helper unavailable for NimClient request", exc_info=True) + return None + return tracing + + +def _capture_trace_context() -> dict[str, str] | None: + tracing = _service_tracing() + if tracing is None: + return None + try: + return dict(tracing.inject_trace_context()) + except Exception as exc: + logger.warning("OpenTelemetry trace context capture failed for NimClient request: %s", exc) + return None + + +def _extract_trace_context(trace_context: dict[str, str] | None) -> Any | None: + if not trace_context: + return None + tracing = _service_tracing() + if tracing is None: + return None + try: + return 
tracing.extract_trace_context(trace_context) + except Exception as exc: + logger.warning("OpenTelemetry trace context extraction failed for NimClient request: %s", exc) + return None + + +def _same_trace_context(left: dict[str, str] | None, right: dict[str, str] | None) -> bool: + return dict(left or {}) == dict(right or {}) + + +def _inject_trace_headers() -> dict[str, str]: + tracing = _service_tracing() + if tracing is None: + return {} + try: + return dict(tracing.inject_trace_context()) + except Exception as exc: + logger.warning("OpenTelemetry trace propagation failed for NimClient request: %s", exc) + return {} + + +def _call_infer_with_optional_headers(client: Any, *, headers: dict[str, str], **kwargs: Any) -> Any: + if not headers: + return client.infer(**kwargs) + + if not _callable_accepts_keyword(client.infer, "headers"): + logger.debug("Triton gRPC client does not expose infer(headers=...); skipping trace metadata propagation") + return client.infer(**kwargs) + + try: + return client.infer(headers=headers, **kwargs) + except TypeError as exc: + message = str(exc).lower() + if "headers" in message and "unexpected" in message: + logger.debug("Triton gRPC client rejected infer(headers=...); retrying without trace metadata") + return client.infer(**kwargs) + raise + + +def _callable_accepts_keyword(func: Any, keyword: str) -> bool: + try: + signature = inspect.signature(func) + except (TypeError, ValueError): + return True + for parameter in signature.parameters.values(): + if parameter.kind == inspect.Parameter.VAR_KEYWORD: + return True + if parameter.name == keyword and parameter.kind in { + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + }: + return True + return False + + def _triton_grpc(): """Load Triton gRPC client only when gRPC inference is used (slim installs skip tritonclient).""" import tritonclient.grpc as grpcclient # noqa: PLC0415 @@ -41,7 +125,7 @@ def _triton_grpc(): ) # A simple structure to hold a request's 
data and its Future for the result -InferenceRequest = namedtuple("InferenceRequest", ["data", "future", "model_name", "dims", "kwargs"]) +InferenceRequest = namedtuple("InferenceRequest", ["data", "future", "model_name", "dims", "kwargs", "trace_context"]) class NimClient: @@ -166,7 +250,9 @@ def _fetch_max_batch_size(self, model_name, model_version: str = "") -> int: return self._max_batch_sizes[model_name] - def _process_batch(self, batch_input, *, batch_data, model_name, **kwargs): + def _process_batch( + self, batch_input, *, batch_data, model_name, _trace_context: dict[str, str] | None = None, **kwargs + ): """ Process a single batch input for inference using its corresponding batch_data. @@ -186,21 +272,37 @@ def _process_batch(self, batch_input, *, batch_data, model_name, **kwargs): tuple A tuple (parsed_output, batch_data) for subsequent post-processing. """ - if self.protocol == "grpc": - logger.debug("Performing gRPC inference for a batch...") - response = self._grpc_infer(batch_input, model_name, **kwargs) - logger.debug("gRPC inference received response for a batch") - elif self.protocol == "http": - logger.debug("Performing HTTP inference for a batch...") - response = self._http_infer(batch_input) - logger.debug("HTTP inference received response for a batch") - else: - raise ValueError("Invalid protocol specified. 
Must be 'grpc' or 'http'.") - - parsed_output = self.model_interface.parse_output( - response, protocol=self.protocol, data=batch_data, model_name=model_name, **kwargs + tracing = _service_tracing() + span_parent_context = _extract_trace_context(_trace_context) + span_context = ( + tracing.start_span( + "nim.infer", + context=span_parent_context, + attributes={ + "nim.model": model_name, + "nim.protocol": self.protocol, + "nim.service": self.model_interface.name(), + }, + ) + if tracing is not None + else nullcontext() ) - return parsed_output, batch_data + with span_context: + if self.protocol == "grpc": + logger.debug("Performing gRPC inference for a batch...") + response = self._grpc_infer(batch_input, model_name, **kwargs) + logger.debug("gRPC inference received response for a batch") + elif self.protocol == "http": + logger.debug("Performing HTTP inference for a batch...") + response = self._http_infer(batch_input) + logger.debug("HTTP inference received response for a batch") + else: + raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.") + + parsed_output = self.model_interface.parse_output( + response, protocol=self.protocol, data=batch_data, model_name=model_name, **kwargs + ) + return parsed_output, batch_data def try_set_max_batch_size(self, model_name, model_version: str = ""): """Attempt to set the max batch size for the model if it is not already set, ensuring thread safety.""" @@ -276,12 +378,18 @@ def infer(self, data: dict, model_name: str, **kwargs) -> Any: # 4. Process each batch concurrently using a thread pool. # We enumerate the batches so that we can later reassemble results in order. 
+ trace_context = _capture_trace_context() results = [None] * len(formatted_batches) with ThreadPoolExecutor(max_workers=max_pool_workers) as executor: future_to_idx = {} for idx, (batch, batch_data) in enumerate(zip(formatted_batches, formatted_batch_data)): future = executor.submit( - self._process_batch, batch, batch_data=batch_data, model_name=model_name, **kwargs + self._process_batch, + batch, + batch_data=batch_data, + model_name=model_name, + _trace_context=trace_context, + **kwargs, ) future_to_idx[future] = idx @@ -354,8 +462,13 @@ def _grpc_infer( while attempt < self.max_retries: try: - response = self.client.infer( - model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs + response = _call_infer_with_optional_headers( + self.client, + headers=_inject_trace_headers(), + model_name=model_name, + parameters=parameters, + inputs=input_tensors, + outputs=outputs, ) logger.debug(f"gRPC inference response: {response}") @@ -479,8 +592,16 @@ def _http_infer(self, formatted_input: dict) -> dict: model_name = self.model_interface.name() logger.debug(f"{model_name}: Sending HTTP request with system prompt: '{system_content}'") + request_headers = dict(self.headers) + tracing = _service_tracing() + if tracing is not None: + try: + tracing.inject_trace_context(request_headers) + except Exception as exc: + logger.warning("OpenTelemetry trace propagation failed for NimClient HTTP request: %s", exc) + response = requests.post( - self.endpoint_url, json=formatted_input, headers=self.headers, timeout=self.timeout + self.endpoint_url, json=formatted_input, headers=request_headers, timeout=self.timeout ) status_code = response.status_code @@ -578,6 +699,8 @@ def _batcher_loop(self): next_req_peek = self._request_queue.queue[0] if next_req_peek is None: break + if not _same_trace_context(first_req.trace_context, next_req_peek.trace_context): + break if self._batch_memory_budget_bytes: if not self.model_interface.does_item_fit_in_batch( @@ -621,7 
+744,13 @@ def _process_dynamic_batch(self, requests: list[InferenceRequest]): ) # 2. Perform inference using the existing _process_batch logic - parsed_output, _ = self._process_batch(batch_input, batch_data=batch_data, model_name=model_name, **kwargs) + parsed_output, _ = self._process_batch( + batch_input, + batch_data=batch_data, + model_name=model_name, + _trace_context=first_req.trace_context, + **kwargs, + ) # 3. Process the batched output to get final results all_results = self.model_interface.process_inference_results( @@ -667,7 +796,14 @@ def submit(self, data: Any, model_name: str, dims: Tuple[int, int], **kwargs) -> ) future = Future() - request = InferenceRequest(data=data, future=future, model_name=model_name, dims=dims, kwargs=kwargs) + request = InferenceRequest( + data=data, + future=future, + model_name=model_name, + dims=dims, + kwargs=kwargs, + trace_context=_capture_trace_context(), + ) self._request_queue.put(request) return future diff --git a/nemo_retriever/src/nemo_retriever/nim/nim.py b/nemo_retriever/src/nemo_retriever/nim/nim.py index 00766c02c1..c19ea38112 100644 --- a/nemo_retriever/src/nemo_retriever/nim/nim.py +++ b/nemo_retriever/src/nemo_retriever/nim/nim.py @@ -6,7 +6,9 @@ import logging import time +from contextlib import nullcontext from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import urlsplit, urlunsplit from typing import Any, Dict, List, Optional, Sequence, Tuple import requests @@ -14,6 +16,74 @@ logger = logging.getLogger(__name__) +def _service_tracing() -> Any | None: + try: + from nemo_retriever.service import tracing + except Exception: + logger.debug("Service tracing helper unavailable for NIM request", exc_info=True) + return None + return tracing + + +def _capture_trace_context() -> dict[str, str] | None: + service_tracing = _service_tracing() + if service_tracing is None: + return None + try: + return dict(service_tracing.inject_trace_context()) + except Exception as exc: + 
logger.warning("OpenTelemetry trace context capture failed for NIM request: %s", exc) + return None + + +def _extract_trace_context(service_tracing: Any, trace_context: Dict[str, str] | None) -> Any | None: + if not trace_context: + return None + try: + return service_tracing.extract_trace_context(trace_context) + except Exception as exc: + logger.warning("OpenTelemetry trace context extraction failed for NIM request: %s", exc) + return None + + +def _set_span_attribute(span: Any, key: str, value: Any) -> None: + setter = getattr(span, "set_attribute", None) + if setter is None: + return + try: + setter(key, value) + except Exception: + logger.debug("Ignoring NIM tracing span attribute failure", exc_info=True) + + +def _set_span_error(span: Any, exc_or_status: Any) -> None: + if isinstance(exc_or_status, int): + error_type = f"HTTP {exc_or_status}" + else: + error_type = type(exc_or_status).__name__ + _set_span_attribute(span, "error.type", error_type) + + setter = getattr(span, "set_status", None) + if setter is None: + return + try: + from opentelemetry.trace import Status, StatusCode + + setter(Status(StatusCode.ERROR)) + except Exception: + logger.debug("Ignoring NIM tracing span status failure", exc_info=True) + + +def _safe_endpoint_attribute(invoke_url: str) -> str: + parts = urlsplit(str(invoke_url)) + if not parts.scheme or not parts.netloc: + return str(invoke_url).split("?", 1)[0].split("#", 1)[0] + netloc = parts.hostname or "" + if parts.port is not None: + netloc = f"{netloc}:{parts.port}" + return urlunsplit((parts.scheme, netloc, parts.path, "", "")) + + def _chunk_ranges(total: int, chunk_size: int) -> List[Tuple[int, int]]: if chunk_size <= 0: raise ValueError("chunk_size must be > 0") @@ -66,15 +136,50 @@ def _post_with_retries( timeout_s: float, max_retries: int, max_429_retries: int, + trace_context: Dict[str, str] | None = None, ) -> Any: base_delay = 2.0 attempt = 0 retries_429 = 0 + service_tracing = _service_tracing() + span_parent_context 
= ( + _extract_trace_context(service_tracing, trace_context) if service_tracing is not None else None + ) + while attempt < int(max_retries): + request_headers = dict(headers) try: - response = requests.post(invoke_url, headers=headers, json=payload, timeout=float(timeout_s)) - status_code = response.status_code + span_context = ( + service_tracing.start_span( + "nim.http.post", + context=span_parent_context, + attributes={ + "http.method": "POST", + "nim.endpoint": _safe_endpoint_attribute(invoke_url), + "retry.attempt": attempt, + }, + ) + if service_tracing is not None + else nullcontext() + ) + with span_context as span: + if service_tracing is not None: + try: + service_tracing.inject_trace_context(request_headers) + except Exception as exc: + logger.warning("OpenTelemetry trace propagation failed for NIM request: %s", exc) + try: + response = requests.post( + invoke_url, headers=request_headers, json=payload, timeout=float(timeout_s) + ) + except (requests.Timeout, requests.RequestException) as exc: + _set_span_error(span, exc) + raise + status_code = response.status_code + _set_span_attribute(span, "http.status_code", status_code) + if status_code >= 400: + _set_span_error(span, status_code) if status_code == 429: retries_429 += 1 @@ -204,6 +309,7 @@ def invoke_image_inference_batches( ranges = _chunk_ranges(n, int(max_batch_size)) flattened: List[Optional[Any]] = [None] * n + trace_context = _capture_trace_context() def _invoke_one_batch(start: int, end: int, endpoint_url: str) -> Tuple[int, int, List[Any]]: inputs = [ @@ -223,6 +329,7 @@ def _invoke_one_batch(start: int, end: int, endpoint_url: str) -> Tuple[int, int timeout_s=float(timeout_s), max_retries=int(max_retries), max_429_retries=int(max_429_retries), + trace_context=trace_context, ) per_image = _normalize_batch_response(response_json, end - start) return start, end, per_image @@ -301,6 +408,7 @@ def invoke_chat_completions( invoke_urls = _parse_invoke_urls(invoke_url) results: 
List[Optional[str]] = [None] * len(messages_list) + trace_context = _capture_trace_context() def _invoke_one(idx: int, messages: List[Dict[str, Any]], endpoint_url: str) -> Tuple[int, str]: payload: Dict[str, Any] = { @@ -318,6 +426,7 @@ def _invoke_one(idx: int, messages: List[Dict[str, Any]], endpoint_url: str) -> timeout_s=float(timeout_s), max_retries=int(max_retries), max_429_retries=int(max_429_retries), + trace_context=trace_context, ) return idx, extract_chat_completion_text(response_json) diff --git a/nemo_retriever/src/nemo_retriever/service/app.py b/nemo_retriever/src/nemo_retriever/service/app.py index d10d891733..f577ac672b 100644 --- a/nemo_retriever/src/nemo_retriever/service/app.py +++ b/nemo_retriever/src/nemo_retriever/service/app.py @@ -244,6 +244,10 @@ def create_app(config: ServiceConfig) -> FastAPI: _configure_logging(config) _apply_resource_limits(config) + from nemo_retriever.service.tracing import configure_tracing + + configure_tracing(service_role=config.mode) + app = FastAPI( title="Retriever Service", description="Low-latency document ingestion service powered by nemo-retriever", diff --git a/nemo_retriever/src/nemo_retriever/service/client.py b/nemo_retriever/src/nemo_retriever/service/client.py index e2e7a1053e..10d08f8f26 100644 --- a/nemo_retriever/src/nemo_retriever/service/client.py +++ b/nemo_retriever/src/nemo_retriever/service/client.py @@ -36,7 +36,7 @@ import logging import time from pathlib import Path -from typing import Any, AsyncIterator, Callable +from typing import Any, AsyncIterator, Callable, NamedTuple import httpx from rich.progress import ( @@ -65,6 +65,11 @@ ) +class _CreatedJob(NamedTuple): + job_id: str + trace_id: str | None = None + + # ------------------------------------------------------------------ # Errors # ------------------------------------------------------------------ @@ -217,8 +222,8 @@ async def _create_job( expected_documents: int, label: str | None = None, retain_results: bool = False, - ) -> 
str: - """Open a server-side job aggregate and return the assigned ``job_id``. + ) -> _CreatedJob: + """Open a server-side job aggregate and return its client-visible metadata. Every upload made through this client must reference a job (J3+). We open one job per ``ingest_documents`` / ``aingest_documents_stream`` @@ -256,7 +261,11 @@ async def _create_job( job_id = body.get("job_id") if not job_id: raise RuntimeError(f"Job creation returned no job_id: {body!r}") - return job_id + trace_id = body.get("trace_id") + return _CreatedJob( + job_id=job_id, + trace_id=trace_id if isinstance(trace_id, str) and trace_id else None, + ) # ------------------------------------------------------------------ # Upload @@ -567,7 +576,8 @@ async def ingest_documents( limits=pool_limits, headers=self._auth_headers, ) as client: - job_id = await self._create_job(client, expected_documents=len(files)) + created_job = await self._create_job(client, expected_documents=len(files)) + job_id = created_job.job_id upload_sem = asyncio.Semaphore(self._max_concurrency) upload_failures: list[tuple[str, str]] = [] @@ -670,16 +680,20 @@ async def aingest_documents_stream( limits=pool_limits, headers=self._auth_headers, ) as client: - job_id = await self._create_job( + created_job = await self._create_job( client, expected_documents=len(files), retain_results=retain_results, ) - yield { + job_id = created_job.job_id + event: dict[str, Any] = { "event": "job_created", "job_id": job_id, "expected_documents": len(files), } + if created_job.trace_id is not None: + event["trace_id"] = created_job.trace_id + yield event upload_sem = asyncio.Semaphore(self._max_concurrency) async def _upload_one_file(fpath: Path) -> None: diff --git a/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/job_detail.jsx b/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/job_detail.jsx index 8577662ac5..70882cb45d 100644 --- 
a/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/job_detail.jsx +++ b/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/job_detail.jsx @@ -123,6 +123,7 @@ function JobDetailView({ jobId, onBack }) { started_at: data.started_at || prev.started_at, finalized_at: data.finalized_at || prev.finalized_at, elapsed_s: data.elapsed_s != null ? data.elapsed_s : prev.elapsed_s, + trace_id: data.trace_id != null ? data.trace_id : prev.trace_id, } : prev); setSseStatus('connected'); } @@ -280,6 +281,16 @@ function JobDetailView({ jobId, onBack }) { React.createElement('div', { style: { fontSize: 12, color: 'var(--nv-text-muted)', marginTop: 4 } }, `Created ${fmtTime(job.created_at)} • Started ${fmtTime(job.started_at)} • Finalized ${fmtTime(job.finalized_at)}` ), + React.createElement('div', { + style: { display: 'flex', alignItems: 'center', gap: 6, flexWrap: 'wrap', fontSize: 12, color: 'var(--nv-text-muted)', marginTop: 6 } + }, + React.createElement('span', { style: { fontWeight: 600 } }, 'Trace ID'), + React.createElement('span', { + className: 'mono', + title: job.trace_id || '', + style: { color: 'var(--nv-text)', userSelect: 'text' }, + }, job.trace_id || '—'), + ), ), React.createElement('div', { className: 'mono', diff --git a/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/jobs.jsx b/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/jobs.jsx index 4c595b1532..10c2ac0d16 100644 --- a/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/jobs.jsx +++ b/nemo_retriever/src/nemo_retriever/service/dashboard/static/views/jobs.jsx @@ -105,6 +105,7 @@ function JobsView({ onOpenJob }) { status: ev.status, expected_documents: ev.expected_documents, counts: ev.counts || {}, + trace_id: ev.trace_id != null ? ev.trace_id : (idx >= 0 ? 
prev[idx].trace_id : null), elapsed_s: ev.elapsed_s, started_at: ev.started_at, finalized_at: ev.finalized_at, @@ -266,7 +267,7 @@ function JobsView({ onOpenJob }) { React.createElement('table', null, React.createElement('thead', null, React.createElement('tr', null, - ['Job ID', 'Status', 'Label', 'Progress', 'Docs', 'Created', 'Elapsed'].map(h => + ['Job ID', 'Trace ID', 'Status', 'Label', 'Progress', 'Docs', 'Created', 'Elapsed'].map(h => React.createElement('th', { key: h }, h) ) ) @@ -295,6 +296,11 @@ function JobsView({ onOpenJob }) { style: { color: 'var(--nv-green)', textDecoration: 'none' }, }, (j.job_id || '').substring(0, 12) + '…') ), + React.createElement('td', { + className: 'mono', + style: { fontSize: 11, maxWidth: 180, overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }, + title: j.trace_id || '', + }, j.trace_id ? j.trace_id.substring(0, 16) + '…' : '—'), React.createElement('td', null, statusBadge(j.status)), React.createElement('td', { style: { maxWidth: 220, overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }, diff --git a/nemo_retriever/src/nemo_retriever/service/models/responses.py b/nemo_retriever/src/nemo_retriever/service/models/responses.py index 4e4ed2a26f..b5f7be66f7 100644 --- a/nemo_retriever/src/nemo_retriever/service/models/responses.py +++ b/nemo_retriever/src/nemo_retriever/service/models/responses.py @@ -76,6 +76,7 @@ class JobCreatedResponse(RichModel): status: str created_at: str label: str | None = None + trace_id: str | None = None class JobAggregateResponse(RichModel): @@ -95,6 +96,7 @@ class JobAggregateResponse(RichModel): finalized_at: str | None = None elapsed_s: float | None = None label: str | None = None + trace_id: str | None = None counts: dict[str, int] = Field(default_factory=dict) document_ids: list[str] = Field(default_factory=list) documents: list[dict[str, Any]] | None = None diff --git a/nemo_retriever/src/nemo_retriever/service/routers/dashboard.py 
b/nemo_retriever/src/nemo_retriever/service/routers/dashboard.py index 59f2615006..c4cf1b0014 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/dashboard.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/dashboard.py @@ -264,6 +264,7 @@ def _serialize_job(agg) -> dict: "finalized_at": agg.finalized_at, "elapsed_s": agg.elapsed_s, "label": agg.label, + "trace_id": agg.trace_id, "document_ids": list(agg.document_ids), } diff --git a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py index d9a8ed5909..4148b98231 100644 --- a/nemo_retriever/src/nemo_retriever/service/routers/ingest.py +++ b/nemo_retriever/src/nemo_retriever/service/routers/ingest.py @@ -218,6 +218,8 @@ async def _enqueue_or_reject(pool_type: PoolType, item: WorkItem) -> None: pool = get_pipeline_pool() if pool is None: return + if not item.trace_context: + item.trace_context = _safe_inject_trace_context() if not await pool.submit(pool_type, item): raise HTTPException( status_code=429, @@ -453,6 +455,65 @@ def _parse_backend_json(resp: Response) -> dict: return {} +def _safe_inject_trace_context() -> dict[str, str]: + """Best-effort W3C trace context capture for downstream propagation.""" + from nemo_retriever.service import tracing + + try: + return dict(tracing.inject_trace_context()) + except Exception as exc: + logger.warning("Trace context injection failed; continuing without propagated context: %s", exc) + return {} + + +def _safe_extract_trace_context(carrier: dict[str, str] | None) -> Any | None: + """Best-effort W3C trace context extraction for accept spans.""" + from nemo_retriever.service import tracing + + try: + return tracing.extract_trace_context(carrier) + except Exception as exc: + logger.warning("Trace context extraction failed; continuing without parent context: %s", exc) + return None + + +def _trace_context_from_request_or_job(request: Request, job_id: str | None) -> dict[str, str]: + 
inbound_traceparent = request.headers.get("traceparent") + if inbound_traceparent: + carrier = {"traceparent": inbound_traceparent} + if tracestate := request.headers.get("tracestate"): + carrier["tracestate"] = tracestate + return carrier + + if not job_id: + return {} + tracker = get_job_tracker() + if tracker is None: + return {} + agg = tracker.get_job(job_id) + if agg is None: + return {} + return dict(agg.trace_context) + + +def _start_accept_span(request: Request, job_id: str, name: str): + from opentelemetry.trace import SpanKind + + from nemo_retriever.service import tracing + + carrier = _trace_context_from_request_or_job(request, job_id) + return tracing.start_span( + name, + kind=SpanKind.SERVER, + context=_safe_extract_trace_context(carrier), + attributes={ + "service.role": _role(request), + "job.id": job_id, + "route": request.url.path, + }, + ) + + # ------------------------------------------------------------------ # POST /v1/ingest/job — create a new job aggregate # GET /v1/ingest/job/{job_id} — fetch the aggregate's current state @@ -470,6 +531,7 @@ def _aggregate_to_response(agg, *, documents: list[dict[str, Any]] | None = None finalized_at=agg.finalized_at, elapsed_s=agg.elapsed_s, label=agg.label, + trace_id=agg.trace_id, counts=dict(agg.counts), document_ids=list(agg.document_ids), documents=documents, @@ -482,7 +544,7 @@ def _aggregate_to_response(agg, *, documents: list[dict[str, Any]] | None = None status_code=201, summary="Create a new ingestion job aggregate", ) -async def create_job(request: Request, body: JobCreateRequest) -> JobCreatedResponse: +async def create_job(request: Request, response: Response, body: JobCreateRequest) -> JobCreatedResponse: """Open a job that will receive ``expected_documents`` uploads. The server returns an opaque ``job_id`` the client uses for every @@ -490,23 +552,43 @@ async def create_job(request: Request, body: JobCreateRequest) -> JobCreatedResp call. 
The job is in-memory only; gateway pod restarts erase it (this is intentional — see the J1 design notes). """ + from opentelemetry.trace import SpanKind + + from nemo_retriever.service import tracing from nemo_retriever.service.services.job_tracker import JobTrackerError tracker = get_job_tracker() if tracker is None: raise HTTPException(status_code=503, detail="Job tracker not available") job_id = uuid.uuid4().hex + trace_id: str | None = None + inbound_trace_context = _trace_context_from_request_or_job(request, None) try: - agg = tracker.register_job( - job_id, - expected_documents=body.expected_documents, - label=body.label, - metadata=body.metadata, - retain_results=body.retain_results, - ) + with tracing.start_span( + "ingest.job", + kind=SpanKind.SERVER, + context=_safe_extract_trace_context(inbound_trace_context), + attributes={ + "service.role": _role(request), + "job.expected_documents": body.expected_documents, + }, + ): + trace_context = _safe_inject_trace_context() + trace_id = tracing.current_trace_id_hex() if trace_context else None + agg = tracker.register_job( + job_id, + expected_documents=body.expected_documents, + label=body.label, + metadata=body.metadata, + retain_results=body.retain_results, + trace_id=trace_id, + trace_context=trace_context, + ) except JobTrackerError as exc: raise HTTPException(status_code=getattr(exc, "status_code", 500), detail=str(exc)) from exc + if trace_id: + response.headers[tracing.TRACE_ID_HEADER] = trace_id if (m := get_metrics()) is not None: m.record_request("/v1/ingest/job") m.record_job_created(job_id) @@ -516,6 +598,7 @@ async def create_job(request: Request, body: JobCreateRequest) -> JobCreatedResp status=agg.status.value, created_at=agg.created_at, label=agg.label, + trace_id=agg.trace_id, ) @@ -711,114 +794,115 @@ async def submit_document_to_job( _check_upload_size(file, request) validated_spec = _resolve_pipeline_spec(request, meta) - if _is_gateway(request): + with _start_accept_span(request, job_id, 
"ingest.document.accept"): + if _is_gateway(request): + classification = FileClassifier.classify(file, filename_override=meta.filename or "") + enforce_media_dependencies(classification) + file_size = _file_size_from_upload(file, request) + + file_bytes = await file.read() + route = _route_by_page_count(file_bytes, meta, file_category=classification.category) + + document_id = uuid.uuid4().hex + content_sha256 = hashlib.sha256(file_bytes).hexdigest() + now = datetime.now(timezone.utc).isoformat() + + _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) + tracker = get_job_tracker() + if tracker is not None: + tracker.mark_processing(document_id) + + callback_url = _build_callback_url(request) + extra_headers = { + _GATEWAY_DOC_ID_HEADER: document_id, + _GATEWAY_JOB_ID_HEADER: job_id, + _GATEWAY_CALLBACK_HEADER: callback_url, + **_gateway_retain_results_headers(job_id), + } + if validated_spec is not None: + extra_headers[_GATEWAY_PIPELINE_SPEC_HEADER] = validated_spec.model_dump_json() + resp = await _gateway_forward(request, route, extra_headers=extra_headers) + + if resp.status_code not in (200, 202): + if tracker is not None: + tracker.mark_failed(document_id, f"Worker returned HTTP {resp.status_code}") + return resp + + _record_prometheus(request, "/v1/ingest/job/document", "2xx", file_size=file_size) + if (m := get_metrics()) is not None: + m.record_request("/v1/ingest/job/document") + m.record_document_accepted( + document_id=document_id, + job_id=job_id, + filename=classification.filename, + file_category=classification.category.value, + content_type=classification.content_type, + file_size_bytes=file_size, + endpoint="/v1/ingest/job/document", + ) + + return IngestAccepted( + document_id=document_id, + job_id=job_id, + content_sha256=content_sha256, + status="accepted", + created_at=now, + ) + + # ── worker / standalone ────────────────────────────────────── classification = FileClassifier.classify(file, 
filename_override=meta.filename or "") enforce_media_dependencies(classification) - file_size = _file_size_from_upload(file, request) file_bytes = await file.read() route = _route_by_page_count(file_bytes, meta, file_category=classification.category) - - document_id = uuid.uuid4().hex content_sha256 = hashlib.sha256(file_bytes).hexdigest() now = datetime.now(timezone.utc).isoformat() - _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) - tracker = get_job_tracker() - if tracker is not None: - tracker.mark_processing(document_id) - - callback_url = _build_callback_url(request) - extra_headers = { - _GATEWAY_DOC_ID_HEADER: document_id, - _GATEWAY_JOB_ID_HEADER: job_id, - _GATEWAY_CALLBACK_HEADER: callback_url, - **_gateway_retain_results_headers(job_id), - } - if validated_spec is not None: - extra_headers[_GATEWAY_PIPELINE_SPEC_HEADER] = validated_spec.model_dump_json() - resp = await _gateway_forward(request, route, extra_headers=extra_headers) + gw_doc_id = request.headers.get(_GATEWAY_DOC_ID_HEADER) + gw_callback_url = request.headers.get(_GATEWAY_CALLBACK_HEADER) + gw_job_id = request.headers.get(_GATEWAY_JOB_ID_HEADER) or job_id + document_id = gw_doc_id or uuid.uuid4().hex - if resp.status_code not in (200, 202): - if tracker is not None: - tracker.mark_failed(document_id, f"Worker returned HTTP {resp.status_code}") - return resp + worker_spec = _spec_from_gateway_header(request) if gw_doc_id else validated_spec + + if not gw_callback_url: + _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) + + await _enqueue_or_reject( + route, + WorkItem( + id=document_id, + payload=file_bytes, + filename=file.filename, + callback_url=gw_callback_url, + job_id=gw_job_id, + pipeline_spec=worker_spec.model_dump(mode="json") if worker_spec is not None else None, + retain_results=_work_item_retain_results(request, job_id=gw_job_id), + ), + ) + + _record_prometheus(request, 
"/v1/ingest/job/document", "2xx", file_size=len(file_bytes)) - _record_prometheus(request, "/v1/ingest/job/document", "2xx", file_size=file_size) if (m := get_metrics()) is not None: m.record_request("/v1/ingest/job/document") m.record_document_accepted( document_id=document_id, - job_id=job_id, + job_id=gw_job_id, filename=classification.filename, file_category=classification.category.value, content_type=classification.content_type, - file_size_bytes=file_size, + file_size_bytes=len(file_bytes), endpoint="/v1/ingest/job/document", ) return IngestAccepted( document_id=document_id, - job_id=job_id, + job_id=gw_job_id, content_sha256=content_sha256, status="accepted", created_at=now, ) - # ── worker / standalone ────────────────────────────────────── - classification = FileClassifier.classify(file, filename_override=meta.filename or "") - enforce_media_dependencies(classification) - - file_bytes = await file.read() - route = _route_by_page_count(file_bytes, meta, file_category=classification.category) - content_sha256 = hashlib.sha256(file_bytes).hexdigest() - now = datetime.now(timezone.utc).isoformat() - - gw_doc_id = request.headers.get(_GATEWAY_DOC_ID_HEADER) - gw_callback_url = request.headers.get(_GATEWAY_CALLBACK_HEADER) - gw_job_id = request.headers.get(_GATEWAY_JOB_ID_HEADER) or job_id - document_id = gw_doc_id or uuid.uuid4().hex - - worker_spec = _spec_from_gateway_header(request) if gw_doc_id else validated_spec - - if not gw_callback_url: - _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) - - await _enqueue_or_reject( - route, - WorkItem( - id=document_id, - payload=file_bytes, - filename=file.filename, - callback_url=gw_callback_url, - job_id=gw_job_id, - pipeline_spec=worker_spec.model_dump(mode="json") if worker_spec is not None else None, - retain_results=_work_item_retain_results(request, job_id=gw_job_id), - ), - ) - - _record_prometheus(request, "/v1/ingest/job/document", "2xx", 
file_size=len(file_bytes)) - - if (m := get_metrics()) is not None: - m.record_request("/v1/ingest/job/document") - m.record_document_accepted( - document_id=document_id, - job_id=gw_job_id, - filename=classification.filename, - file_category=classification.category.value, - content_type=classification.content_type, - file_size_bytes=len(file_bytes), - endpoint="/v1/ingest/job/document", - ) - - return IngestAccepted( - document_id=document_id, - job_id=gw_job_id, - content_sha256=content_sha256, - status="accepted", - created_at=now, - ) - @router.post( "/ingest/job/{job_id}/page", @@ -840,44 +924,97 @@ async def submit_page_to_job( _require_job(job_id) _check_upload_size(file, request) - if _is_gateway(request): + with _start_accept_span(request, job_id, "ingest.page.accept"): + if _is_gateway(request): + classification = FileClassifier.classify(file, filename_override=filename) + enforce_media_dependencies(classification) + file_size = _file_size_from_upload(file, request) + + page_id = uuid.uuid4().hex + content_sha256 = hashlib.sha256((await file.read()) or b"").hexdigest() + now = datetime.now(timezone.utc).isoformat() + + _register_document_under_job(document_id=page_id, job_id=job_id, filename=filename or file.filename) + tracker = get_job_tracker() + if tracker is not None: + tracker.mark_processing(page_id) + + callback_url = _build_callback_url(request) + resp = await _gateway_forward( + request, + PoolType.REALTIME, + extra_headers={ + _GATEWAY_DOC_ID_HEADER: page_id, + _GATEWAY_JOB_ID_HEADER: job_id, + _GATEWAY_CALLBACK_HEADER: callback_url, + **_gateway_retain_results_headers(job_id), + }, + ) + + if resp.status_code not in (200, 202): + if tracker is not None: + tracker.mark_failed(page_id, f"Worker returned HTTP {resp.status_code}") + return resp + + _record_prometheus( + request, + "/v1/ingest/job/page", + "2xx", + file_size=file_size, + is_page=True, + ) + if (m := get_metrics()) is not None: + m.record_request("/v1/ingest/job/page") + 
m.record_page_accepted( + page_id=page_id, + document_id=document_id, + endpoint="/v1/ingest/job/page", + page_number=page_number, + file_size_bytes=file_size, + file_category=classification.category.value, + content_type=classification.content_type, + ) + + return PageIngestAccepted( + page_id=page_id, + document_id=document_id, + page_number=page_number, + content_sha256=content_sha256, + status="accepted", + created_at=now, + ) + + # ── worker / standalone ────────────────────────────────────── + dry_run = _is_dry_run(request) classification = FileClassifier.classify(file, filename_override=filename) enforce_media_dependencies(classification) - file_size = _file_size_from_upload(file, request) - page_id = uuid.uuid4().hex - content_sha256 = hashlib.sha256((await file.read()) or b"").hexdigest() + file_bytes = await file.read() + content_sha256 = hashlib.sha256(file_bytes).hexdigest() now = datetime.now(timezone.utc).isoformat() - _register_document_under_job(document_id=page_id, job_id=job_id, filename=filename or file.filename) - tracker = get_job_tracker() - if tracker is not None: - tracker.mark_processing(page_id) - - callback_url = _build_callback_url(request) - resp = await _gateway_forward( - request, - PoolType.REALTIME, - extra_headers={ - _GATEWAY_DOC_ID_HEADER: page_id, - _GATEWAY_JOB_ID_HEADER: job_id, - _GATEWAY_CALLBACK_HEADER: callback_url, - **_gateway_retain_results_headers(job_id), - }, - ) + gw_doc_id = request.headers.get(_GATEWAY_DOC_ID_HEADER) + gw_callback_url = request.headers.get(_GATEWAY_CALLBACK_HEADER) + gw_job_id = request.headers.get(_GATEWAY_JOB_ID_HEADER) or job_id + page_id = gw_doc_id or uuid.uuid4().hex + + if not dry_run: + if not gw_callback_url: + _register_document_under_job(document_id=page_id, job_id=job_id, filename=filename or file.filename) + await _enqueue_or_reject( + PoolType.REALTIME, + WorkItem( + id=page_id, + payload=file_bytes, + filename=file.filename, + callback_url=gw_callback_url, + job_id=gw_job_id, + 
retain_results=_work_item_retain_results(request, job_id=gw_job_id), + ), + ) + + _record_prometheus(request, "/v1/ingest/job/page", "2xx", file_size=len(file_bytes), is_page=True) - if resp.status_code not in (200, 202): - if tracker is not None: - tracker.mark_failed(page_id, f"Worker returned HTTP {resp.status_code}") - return resp - - _record_prometheus( - request, - "/v1/ingest/job/page", - "2xx", - file_size=file_size, - is_page=True, - ) if (m := get_metrics()) is not None: m.record_request("/v1/ingest/job/page") m.record_page_accepted( @@ -885,7 +1022,7 @@ async def submit_page_to_job( document_id=document_id, endpoint="/v1/ingest/job/page", page_number=page_number, - file_size_bytes=file_size, + file_size_bytes=len(file_bytes), file_category=classification.category.value, content_type=classification.content_type, ) @@ -899,58 +1036,6 @@ async def submit_page_to_job( created_at=now, ) - # ── worker / standalone ────────────────────────────────────── - dry_run = _is_dry_run(request) - classification = FileClassifier.classify(file, filename_override=filename) - enforce_media_dependencies(classification) - - file_bytes = await file.read() - content_sha256 = hashlib.sha256(file_bytes).hexdigest() - now = datetime.now(timezone.utc).isoformat() - - gw_doc_id = request.headers.get(_GATEWAY_DOC_ID_HEADER) - gw_callback_url = request.headers.get(_GATEWAY_CALLBACK_HEADER) - gw_job_id = request.headers.get(_GATEWAY_JOB_ID_HEADER) or job_id - page_id = gw_doc_id or uuid.uuid4().hex - - if not dry_run: - if not gw_callback_url: - _register_document_under_job(document_id=page_id, job_id=job_id, filename=filename or file.filename) - await _enqueue_or_reject( - PoolType.REALTIME, - WorkItem( - id=page_id, - payload=file_bytes, - filename=file.filename, - callback_url=gw_callback_url, - job_id=gw_job_id, - retain_results=_work_item_retain_results(request, job_id=gw_job_id), - ), - ) - - _record_prometheus(request, "/v1/ingest/job/page", "2xx", file_size=len(file_bytes), 
is_page=True) - - if (m := get_metrics()) is not None: - m.record_request("/v1/ingest/job/page") - m.record_page_accepted( - page_id=page_id, - document_id=document_id, - endpoint="/v1/ingest/job/page", - page_number=page_number, - file_size_bytes=len(file_bytes), - file_category=classification.category.value, - content_type=classification.content_type, - ) - - return PageIngestAccepted( - page_id=page_id, - document_id=document_id, - page_number=page_number, - content_sha256=content_sha256, - status="accepted", - created_at=now, - ) - @router.post( "/ingest/job/{job_id}/whole", @@ -976,47 +1061,103 @@ async def submit_whole_document_to_job( _check_upload_size(file, request) validated_spec = _resolve_pipeline_spec(request, meta) - if _is_gateway(request): + with _start_accept_span(request, job_id, "ingest.whole.accept"): + if _is_gateway(request): + classification = FileClassifier.classify(file, filename_override=meta.filename or "") + enforce_media_dependencies(classification) + file_size = _file_size_from_upload(file, request) + + document_id = uuid.uuid4().hex + file_bytes = await file.read() + content_sha256 = hashlib.sha256(file_bytes).hexdigest() + now = datetime.now(timezone.utc).isoformat() + + _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) + tracker = get_job_tracker() + if tracker is not None: + tracker.mark_processing(document_id) + + callback_url = _build_callback_url(request) + extra_headers = { + _GATEWAY_DOC_ID_HEADER: document_id, + _GATEWAY_JOB_ID_HEADER: job_id, + _GATEWAY_CALLBACK_HEADER: callback_url, + **_gateway_retain_results_headers(job_id), + } + if validated_spec is not None: + extra_headers[_GATEWAY_PIPELINE_SPEC_HEADER] = validated_spec.model_dump_json() + resp = await _gateway_forward(request, PoolType.BATCH, extra_headers=extra_headers) + + if resp.status_code not in (200, 202): + if tracker is not None: + tracker.mark_failed(document_id, f"Worker returned HTTP {resp.status_code}") + return 
resp + + _record_prometheus(request, "/v1/ingest/job/whole", "2xx", file_size=file_size) + if (m := get_metrics()) is not None: + m.record_request("/v1/ingest/job/whole") + m.record_document_accepted( + document_id=document_id, + job_id=job_id, + filename=classification.filename, + file_category=classification.category.value, + content_type=classification.content_type, + file_size_bytes=file_size, + endpoint="/v1/ingest/job/whole", + ) + + return DocumentIngestAccepted( + document_id=document_id, + filename=classification.filename, + file_size_bytes=len(file_bytes), + content_sha256=content_sha256, + status="accepted", + created_at=now, + ) + + # ── worker / standalone ────────────────────────────────────── + dry_run = _is_dry_run(request) classification = FileClassifier.classify(file, filename_override=meta.filename or "") enforce_media_dependencies(classification) - file_size = _file_size_from_upload(file, request) - document_id = uuid.uuid4().hex file_bytes = await file.read() content_sha256 = hashlib.sha256(file_bytes).hexdigest() now = datetime.now(timezone.utc).isoformat() - _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) - tracker = get_job_tracker() - if tracker is not None: - tracker.mark_processing(document_id) - - callback_url = _build_callback_url(request) - extra_headers = { - _GATEWAY_DOC_ID_HEADER: document_id, - _GATEWAY_JOB_ID_HEADER: job_id, - _GATEWAY_CALLBACK_HEADER: callback_url, - **_gateway_retain_results_headers(job_id), - } - if validated_spec is not None: - extra_headers[_GATEWAY_PIPELINE_SPEC_HEADER] = validated_spec.model_dump_json() - resp = await _gateway_forward(request, PoolType.BATCH, extra_headers=extra_headers) + gw_doc_id = request.headers.get(_GATEWAY_DOC_ID_HEADER) + gw_callback_url = request.headers.get(_GATEWAY_CALLBACK_HEADER) + gw_job_id = request.headers.get(_GATEWAY_JOB_ID_HEADER) or job_id + document_id = gw_doc_id or uuid.uuid4().hex + + worker_spec = 
_spec_from_gateway_header(request) if gw_doc_id else validated_spec + + if not dry_run: + if not gw_callback_url: + _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) + await _enqueue_or_reject( + PoolType.BATCH, + WorkItem( + id=document_id, + payload=file_bytes, + filename=file.filename, + callback_url=gw_callback_url, + job_id=gw_job_id, + pipeline_spec=worker_spec.model_dump(mode="json") if worker_spec is not None else None, + retain_results=_work_item_retain_results(request, job_id=gw_job_id), + ), + ) - if resp.status_code not in (200, 202): - if tracker is not None: - tracker.mark_failed(document_id, f"Worker returned HTTP {resp.status_code}") - return resp + _record_prometheus(request, "/v1/ingest/job/whole", "2xx", file_size=len(file_bytes)) - _record_prometheus(request, "/v1/ingest/job/whole", "2xx", file_size=file_size) if (m := get_metrics()) is not None: m.record_request("/v1/ingest/job/whole") m.record_document_accepted( document_id=document_id, - job_id=job_id, + job_id=gw_job_id, filename=classification.filename, file_category=classification.category.value, content_type=classification.content_type, - file_size_bytes=file_size, + file_size_bytes=len(file_bytes), endpoint="/v1/ingest/job/whole", ) @@ -1029,61 +1170,6 @@ async def submit_whole_document_to_job( created_at=now, ) - # ── worker / standalone ────────────────────────────────────── - dry_run = _is_dry_run(request) - classification = FileClassifier.classify(file, filename_override=meta.filename or "") - enforce_media_dependencies(classification) - - file_bytes = await file.read() - content_sha256 = hashlib.sha256(file_bytes).hexdigest() - now = datetime.now(timezone.utc).isoformat() - - gw_doc_id = request.headers.get(_GATEWAY_DOC_ID_HEADER) - gw_callback_url = request.headers.get(_GATEWAY_CALLBACK_HEADER) - gw_job_id = request.headers.get(_GATEWAY_JOB_ID_HEADER) or job_id - document_id = gw_doc_id or uuid.uuid4().hex - - worker_spec = 
_spec_from_gateway_header(request) if gw_doc_id else validated_spec - - if not dry_run: - if not gw_callback_url: - _register_document_under_job(document_id=document_id, job_id=job_id, filename=file.filename) - await _enqueue_or_reject( - PoolType.BATCH, - WorkItem( - id=document_id, - payload=file_bytes, - filename=file.filename, - callback_url=gw_callback_url, - job_id=gw_job_id, - pipeline_spec=worker_spec.model_dump(mode="json") if worker_spec is not None else None, - retain_results=_work_item_retain_results(request, job_id=gw_job_id), - ), - ) - - _record_prometheus(request, "/v1/ingest/job/whole", "2xx", file_size=len(file_bytes)) - - if (m := get_metrics()) is not None: - m.record_request("/v1/ingest/job/whole") - m.record_document_accepted( - document_id=document_id, - job_id=gw_job_id, - filename=classification.filename, - file_category=classification.category.value, - content_type=classification.content_type, - file_size_bytes=len(file_bytes), - endpoint="/v1/ingest/job/whole", - ) - - return DocumentIngestAccepted( - document_id=document_id, - filename=classification.filename, - file_size_bytes=len(file_bytes), - content_sha256=content_sha256, - status="accepted", - created_at=now, - ) - # ------------------------------------------------------------------ # GET /v1/ingest/status/{item_id} — status for general ingest items diff --git a/nemo_retriever/src/nemo_retriever/service/services/job_tracker.py b/nemo_retriever/src/nemo_retriever/service/services/job_tracker.py index 844779edb6..8740dcfadf 100644 --- a/nemo_retriever/src/nemo_retriever/service/services/job_tracker.py +++ b/nemo_retriever/src/nemo_retriever/service/services/job_tracker.py @@ -46,6 +46,8 @@ from enum import Enum from typing import Any +from pydantic import Field + from nemo_retriever.service.models.base import RichModel logger = logging.getLogger(__name__) @@ -174,6 +176,8 @@ class JobAggregate(RichModel): label: str | None = None """Optional client-supplied tag, e.g. 
``"Q4-2026-corpus"``.""" metadata: dict[str, Any] = {} + trace_id: str | None = None + trace_context: dict[str, str] = Field(default_factory=dict) retain_results: bool = False """When false, :meth:`JobTracker.mark_completed` drops bulky ``result_data``.""" @@ -276,6 +280,8 @@ def register_job( label: str | None = None, metadata: dict[str, Any] | None = None, retain_results: bool = False, + trace_id: str | None = None, + trace_context: dict[str, str] | None = None, ) -> JobAggregate: """Create a new :class:`JobAggregate` in ``pending`` state.""" if expected_documents <= 0: @@ -298,6 +304,8 @@ def register_job( created_at=_utcnow_iso(), label=label, metadata=dict(metadata or {}), + trace_id=trace_id, + trace_context=dict(trace_context or {}), retain_results=retain_results, ) agg.counts[DocumentStatus.PENDING.value] = 0 @@ -675,6 +683,8 @@ def _publish_job_event(self, event_type: str, agg: JobAggregate) -> None: "finalized_at": agg.finalized_at, "label": agg.label, } + if agg.trace_id: + event["trace_id"] = agg.trace_id self._event_bus.publish_sync(event, job_id=agg.job_id) # ── reads ──────────────────────────────────────────────────────── diff --git a/nemo_retriever/src/nemo_retriever/service/services/pipeline_executor.py b/nemo_retriever/src/nemo_retriever/service/services/pipeline_executor.py index ee4d41c490..708621ed4a 100644 --- a/nemo_retriever/src/nemo_retriever/service/services/pipeline_executor.py +++ b/nemo_retriever/src/nemo_retriever/service/services/pipeline_executor.py @@ -90,6 +90,39 @@ def _sanitize_result_data(df: Any) -> list[dict[str, Any]]: return dataframe_to_transport_records(df) +def _pipeline_tracing() -> Any | None: + try: + from nemo_retriever.service import tracing + except Exception: + logger.debug("Service tracing helper unavailable for pipeline execution", exc_info=True) + return None + return tracing + + +def _capture_trace_context_for_pipeline() -> dict[str, str]: + tracing = _pipeline_tracing() + if tracing is None: + return {} + 
try: + return dict(tracing.inject_trace_context()) + except Exception as exc: + logger.warning("OpenTelemetry trace context capture failed for pipeline execution: %s", exc) + return {} + + +def _extract_trace_context_for_pipeline(trace_context: dict[str, str] | None) -> Any | None: + if not trace_context: + return None + tracing = _pipeline_tracing() + if tracing is None: + return None + try: + return tracing.extract_trace_context(trace_context) + except Exception as exc: + logger.warning("OpenTelemetry trace context extraction failed for pipeline execution: %s", exc) + return None + + # ── Process pool registry ──────────────────────────────────────────── _process_executors: list[ProcessPoolExecutor] = [] @@ -498,6 +531,9 @@ def _run_pipeline_in_process( pipeline_spec: dict[str, Any] | None = None, caption_params_dict: dict[str, Any] | None = None, asr_params_dict: dict[str, Any] | None = None, + trace_context: dict[str, str] | None = None, + pool_label: str | None = None, + service_role: str | None = None, ) -> tuple[int, list[dict[str, Any]], float]: """Execute one pipeline run inside a child process. @@ -517,19 +553,32 @@ def _run_pipeline_in_process( When ``pipeline_spec`` is ``None`` (or empty) the behaviour exactly matches the original closure-baked pipeline. 
""" + from nemo_retriever.service import tracing + t0 = time.monotonic() - ingestor, _extraction_mode, has_per_request_vdb = _build_graph_ingestor_from_spec( - filename, - payload, - extract_params_dict, - embed_params_dict, - pipeline_spec, - caption_params_dict, - asr_params_dict, - ) + tracing.configure_tracing(service_role=service_role or "worker-process") + span_context = _extract_trace_context_for_pipeline(trace_context) + span_attributes = { + "pool": (pool_label or "").lower(), + "document.filename": filename, + } + try: + with tracing.start_span("pipeline.ingest", context=span_context, attributes=span_attributes): + ingestor, _extraction_mode, has_per_request_vdb = _build_graph_ingestor_from_spec( + filename, + payload, + extract_params_dict, + embed_params_dict, + pipeline_spec, + caption_params_dict, + asr_params_dict, + ) + + result_df = ingestor.ingest() + finally: + tracing.force_flush(timeout_millis=500) - result_df = ingestor.ingest() elapsed = time.monotonic() - t0 row_count = len(result_df) @@ -718,6 +767,7 @@ async def _work(item: WorkItem) -> tuple[int, list[dict[str, Any]]]: resolved_spec = _resolve_sidecar_in_spec(item.pipeline_spec) try: + trace_context = _capture_trace_context_for_pipeline() row_count, result_data, elapsed = await loop.run_in_executor( executor_ref[0], _run_pipeline_in_process, @@ -729,6 +779,9 @@ async def _work(item: WorkItem) -> tuple[int, list[dict[str, Any]]]: resolved_spec, caption_params_dict, asr_params_dict, + trace_context, + label, + config.mode, ) except BrokenProcessPool: logger.error( diff --git a/nemo_retriever/src/nemo_retriever/service/services/pipeline_pool.py b/nemo_retriever/src/nemo_retriever/service/services/pipeline_pool.py index c097233bee..10c12ca06b 100644 --- a/nemo_retriever/src/nemo_retriever/service/services/pipeline_pool.py +++ b/nemo_retriever/src/nemo_retriever/service/services/pipeline_pool.py @@ -27,9 +27,9 @@ import logging import time from enum import Enum -from typing import Any, 
Callable +from typing import Any, Callable, Mapping -from pydantic import ConfigDict +from pydantic import ConfigDict, Field from nemo_retriever.service.config import PipelinePoolConfig from nemo_retriever.service.models.base import RichModel @@ -55,6 +55,22 @@ class PoolType(str, Enum): BATCH = "batch" +def _safe_extract_trace_context(carrier: Mapping[str, str] | None, *, pool_name: str, item_id: str) -> Any | None: + """Best-effort W3C trace context extraction for worker processing.""" + from nemo_retriever.service import tracing + + try: + return tracing.extract_trace_context(carrier) + except Exception as exc: + logger.warning( + "Pool '%s' trace context extraction failed for item %s; continuing without parent context: %s", + pool_name, + item_id, + exc, + ) + return None + + class WorkItem(RichModel): """A unit of work submitted to a pool.""" @@ -72,6 +88,8 @@ class WorkItem(RichModel): # Validated per-request pipeline overrides (PipelineSpec serialised # to a dict). ``None`` means: run the legacy startup-baked pipeline. pipeline_spec: dict[str, Any] | None = None + trace_context: dict[str, str] = Field(default_factory=dict) + enqueued_at_monotonic_s: float | None = None async def _fire_gateway_callback( @@ -238,6 +256,7 @@ async def _worker_loop(self, worker_id: int) -> None: updating a local job tracker. In standalone mode (no callback), the local tracker is updated directly. """ + from nemo_retriever.service import tracing from nemo_retriever.service.services.job_tracker import get_job_tracker assert self._queue is not None @@ -249,79 +268,96 @@ async def _worker_loop(self, worker_id: int) -> None: if item is None: self._queue.task_done() return - # Per-item timer covers the *useful* work — tracker bookkeeping - # is excluded so the histogram reflects pipeline cost only. 
- t0 = time.monotonic() - outcome = "completed" - try: - tracker = get_job_tracker() - if tracker is not None: - tracker.mark_processing(item.id) - result_rows = 0 - result_data = None - if self._work_fn is not None: - result = self._work_fn(item) - if asyncio.iscoroutine(result): - result = await result - if isinstance(result, tuple) and len(result) == 2: - result_rows, result_data = result - elif isinstance(result, int): - result_rows = result - - retain_results = item.retain_results - if not retain_results and item.job_id: - tracker_lookup = get_job_tracker() - if tracker_lookup is not None: - retain_results = tracker_lookup.should_retain_results(item.job_id) - - if item.callback_url: - if retain_results: - from nemo_retriever.service.services.worker_result_store import store_result_data - - store_result_data(item.id, result_data) - await _fire_gateway_callback( - item.callback_url, - item.id, - "completed", - result_rows=result_rows, - ) - elif tracker is not None: - tracker.mark_completed( - item.id, - result_rows=result_rows, - result_data=result_data if retain_results else None, - ) - self._processed += 1 - except Exception as exc: - outcome = "failed" - if item.callback_url: - await _fire_gateway_callback( - item.callback_url, - item.id, - "failed", - error=f"{type(exc).__name__}: {exc}", + ctx = _safe_extract_trace_context(item.trace_context, pool_name=self._name, item_id=item.id) + with tracing.start_span( + f"pool.{self._name}.process", + context=ctx, + attributes={ + "pool": self._name, + "document.id": item.id, + "job.id": item.job_id or "", + }, + ) as span: + if item.enqueued_at_monotonic_s is not None and hasattr(span, "set_attribute"): + span.set_attribute( + "queue.wait_ms", + (time.monotonic() - item.enqueued_at_monotonic_s) * 1000.0, ) - else: + # Per-item timer covers the *useful* work — tracker bookkeeping + # is excluded so the histogram reflects pipeline cost only. 
+ t0 = time.monotonic() + outcome = "completed" + try: tracker = get_job_tracker() if tracker is not None: - tracker.mark_failed(item.id, f"{type(exc).__name__}: {exc}") - logger.exception("Pool '%s' worker %d failed on item %s", self._name, worker_id, item.id) - finally: - # Always observe; cheaper to keep latency series complete - # than to gate on outcome. Bucketed histogram, so even - # very-failed-fast items show up in the low buckets. - duration_h.observe(time.monotonic() - t0) - if outcome == "completed": - processed_ok.inc() - else: - processed_err.inc() - self._queue.task_done() + tracker.mark_processing(item.id) + result_rows = 0 + result_data = None + if self._work_fn is not None: + result = self._work_fn(item) + if asyncio.iscoroutine(result): + result = await result + if isinstance(result, tuple) and len(result) == 2: + result_rows, result_data = result + elif isinstance(result, int): + result_rows = result + + retain_results = item.retain_results + if not retain_results and item.job_id: + tracker_lookup = get_job_tracker() + if tracker_lookup is not None: + retain_results = tracker_lookup.should_retain_results(item.job_id) + + if item.callback_url: + if retain_results: + from nemo_retriever.service.services.worker_result_store import store_result_data + + store_result_data(item.id, result_data) + await _fire_gateway_callback( + item.callback_url, + item.id, + "completed", + result_rows=result_rows, + ) + elif tracker is not None: + tracker.mark_completed( + item.id, + result_rows=result_rows, + result_data=result_data if retain_results else None, + ) + self._processed += 1 + except Exception as exc: + outcome = "failed" + if item.callback_url: + await _fire_gateway_callback( + item.callback_url, + item.id, + "failed", + error=f"{type(exc).__name__}: {exc}", + ) + else: + tracker = get_job_tracker() + if tracker is not None: + tracker.mark_failed(item.id, f"{type(exc).__name__}: {exc}") + logger.exception("Pool '%s' worker %d failed on item %s", 
self._name, worker_id, item.id) + finally: + # Always observe; cheaper to keep latency series complete + # than to gate on outcome. Bucketed histogram, so even + # very-failed-fast items show up in the low buckets. + duration_h.observe(time.monotonic() - t0) + if outcome == "completed": + processed_ok.inc() + else: + processed_err.inc() + self._queue.task_done() async def submit(self, item: WorkItem) -> bool: """Enqueue a work item. Returns ``False`` if the queue is full.""" if not self._running or self._queue is None: return False try: + if item.enqueued_at_monotonic_s is None: + item.enqueued_at_monotonic_s = time.monotonic() self._queue.put_nowait(item) return True except asyncio.QueueFull: diff --git a/nemo_retriever/src/nemo_retriever/service/services/proxy.py b/nemo_retriever/src/nemo_retriever/service/services/proxy.py index 3073c02a47..46bcf62ebf 100644 --- a/nemo_retriever/src/nemo_retriever/service/services/proxy.py +++ b/nemo_retriever/src/nemo_retriever/service/services/proxy.py @@ -30,10 +30,19 @@ from nemo_retriever.service.config import GatewayConfig from nemo_retriever.service.services.pipeline_pool import PoolType +from nemo_retriever.service import tracing as tracing_module logger = logging.getLogger(__name__) +def _inject_trace_context(headers: dict[str, str]) -> None: + try: + tracing_module.inject_trace_context(headers) + except Exception as exc: + logger.warning("Skipping outbound trace context injection after tracing failure: %s", exc) + logger.debug("Trace context injection failure", exc_info=True) + + def _error_response(status_code: int, detail: str) -> Response: """Build a JSON error response with full diagnostic detail.""" import json as _json @@ -118,6 +127,7 @@ async def forward( fwd_headers = {k: v for k, v in request.headers.items() if k.lower() not in ("host", "transfer-encoding")} if extra_headers: fwd_headers.update(extra_headers) + _inject_trace_context(fwd_headers) try: backend_resp = await client.request( @@ -176,6 +186,7 @@ 
async def forward_get( client = self._client_for(pool_type) backend_label = pool_type.value fwd_headers = {k: v for k, v in request.headers.items() if k.lower() not in ("host",)} + _inject_trace_context(fwd_headers) try: backend_resp = await client.get(path, headers=fwd_headers) diff --git a/nemo_retriever/src/nemo_retriever/service/tracing.py b/nemo_retriever/src/nemo_retriever/service/tracing.py new file mode 100644 index 0000000000..7ed57913ad --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/service/tracing.py @@ -0,0 +1,278 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""OpenTelemetry tracing helpers for retriever service roles.""" + +from __future__ import annotations + +import logging +import os +import re +from contextlib import nullcontext +from typing import Any, Mapping, MutableMapping + +from opentelemetry import trace +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + +try: + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +except Exception: # pragma: no cover - exercised through configure_tracing failure handling. + OTLPSpanExporter = None # type: ignore[assignment] + +try: + from opentelemetry.sdk.resources import Resource + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor +except Exception: # pragma: no cover - exercised through configure_tracing failure handling. 
+ BatchSpanProcessor = None # type: ignore[assignment] + Resource = None # type: ignore[assignment] + TracerProvider = None # type: ignore[assignment] + +TRACE_ID_HEADER = "x-trace-id" +_W3C_TRACE_CONTEXT_HEADER_NAMES = frozenset({"traceparent", "tracestate"}) + +logger = logging.getLogger(__name__) + +_DEFAULT_SERVICE_NAME = "nemo-retriever-service" +_CONFIGURED_PROVIDER: Any | None = None +_TRACE_CONTEXT_PROPAGATOR = TraceContextTextMapPropagator() +_SENSITIVE_ATTRIBUTE_TOKENS = frozenset( + {"authorization", "auth", "token", "password", "secret", "credential", "credentials"} +) +_RAW_CONTENT_TOKENS = frozenset({"body", "payload", "content"}) +_BENIGN_CONTENT_METADATA_TOKENS = frozenset({"count", "encoding", "language", "length", "size", "type"}) + + +def tracing_enabled_from_env(env: Mapping[str, str] | None = None) -> bool: + """Return whether Helm-compatible OpenTelemetry env enables tracing.""" + source = os.environ if env is None else env + if source.get("OTEL_SDK_DISABLED", "").strip().lower() == "true": + return False + + traces_exporter = source.get("OTEL_TRACES_EXPORTER", "").strip() + endpoint = source.get("OTEL_EXPORTER_OTLP_ENDPOINT", "").strip() + return bool(traces_exporter and traces_exporter.lower() == "otlp" and endpoint) + + +def configure_tracing(*, service_role: str, service_name: str | None = None) -> bool: + """Configure process-wide OTLP tracing when enabled by environment. + + Tracing is observability-only. Any setup failure is logged and reported as + ``False`` without preventing service startup. 
+ """ + global _CONFIGURED_PROVIDER + + if _CONFIGURED_PROVIDER is not None: + return True + + if not tracing_enabled_from_env(): + return False + + if _global_tracer_provider_is_already_configured(): + return True + + provider: Any | None = None + exporter: Any | None = None + processor: Any | None = None + processor_added = False + + try: + if OTLPSpanExporter is None or BatchSpanProcessor is None or Resource is None or TracerProvider is None: + raise RuntimeError("OpenTelemetry SDK/exporter packages are not importable") + + resolved_service_name = (service_name or os.environ.get("OTEL_SERVICE_NAME") or _DEFAULT_SERVICE_NAME).strip() + if not resolved_service_name: + resolved_service_name = _DEFAULT_SERVICE_NAME + + resource = Resource.create({"service.name": resolved_service_name, "service.role": service_role}) + provider = TracerProvider(resource=resource) + exporter = OTLPSpanExporter() + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) + processor_added = True + trace.set_tracer_provider(provider) + + if trace.get_tracer_provider() is not provider: + raise RuntimeError("OpenTelemetry tracer provider is already configured") + + _CONFIGURED_PROVIDER = provider + logger.info("OpenTelemetry tracing configured: service=%s role=%s", resolved_service_name, service_role) + return True + except Exception as exc: + _cleanup_partial_tracing_setup( + provider=provider, processor=processor, exporter=exporter, processor_added=processor_added + ) + logger.warning("OpenTelemetry tracing setup failed: %s", exc) + return False + + +def get_tracer(name: str = "nemo_retriever.service") -> Any: + """Return a tracer for service instrumentation.""" + return trace.get_tracer(name) + + +def start_span( + name: str, + *, + kind: Any | None = None, + context: Any | None = None, + attributes: Mapping[str, Any] | None = None, +) -> Any: + """Start a current span after removing sensitive attributes.""" + kwargs: dict[str, Any] = {} + if kind is not None: + 
kwargs["kind"] = kind + if context is not None: + kwargs["context"] = context + sanitized_attributes = _sanitize_span_attributes(attributes) + if sanitized_attributes is not None: + kwargs["attributes"] = sanitized_attributes + try: + span_context = get_tracer().start_as_current_span(name, **kwargs) + except Exception as exc: + logger.warning("OpenTelemetry span setup failed: %s", exc) + return nullcontext() + return _SafeSpanContext(span_context) + + +class _SafeSpanContext: + """Context manager that keeps tracing failures observability-only.""" + + def __init__(self, span_context: Any) -> None: + self._span_context = span_context + self._entered = False + + def __enter__(self) -> Any: + try: + span = self._span_context.__enter__() + except Exception as exc: + logger.warning("OpenTelemetry span setup failed: %s", exc) + return None + self._entered = True + return span + + def __exit__(self, exc_type: Any, exc: Any, traceback: Any) -> bool: + if not self._entered: + return False + try: + return bool(self._span_context.__exit__(exc_type, exc, traceback)) + except Exception as span_exc: + logger.warning("OpenTelemetry span teardown failed: %s", span_exc) + return False + + +def current_trace_id_hex() -> str | None: + """Return the current valid trace id as 32 lowercase hex chars.""" + span = trace.get_current_span() + context = span.get_span_context() + if not context.is_valid: + return None + return f"{context.trace_id:032x}" + + +def inject_trace_context(carrier: MutableMapping[str, str] | None = None) -> MutableMapping[str, str]: + """Inject W3C trace context into a clean or provided mutable carrier.""" + output: MutableMapping[str, str] = {} if carrier is None else carrier + for key in list(output): + if key.lower() in _W3C_TRACE_CONTEXT_HEADER_NAMES: + del output[key] + _TRACE_CONTEXT_PROPAGATOR.inject(output) + return output + + +def extract_trace_context(carrier: Mapping[str, str] | None) -> Any: + """Extract W3C trace context from a carrier mapping.""" + 
def _sanitize_span_attributes(attributes: Mapping[str, Any] | None) -> dict[str, Any] | None:
    """Return a copy of ``attributes`` without sensitive or unusable keys.

    Args:
        attributes: Candidate span attributes, or ``None``.

    Returns:
        ``None`` when ``attributes`` is ``None``; otherwise a new dict that
        keeps only string keys that ``_is_sensitive_span_attribute_name``
        does not flag.

    Non-string keys are dropped instead of being passed to the sensitivity
    check: that check tokenizes the key with ``re`` and would raise
    ``TypeError``. ``start_span`` calls this helper before its ``try``
    block, so such an error would escape into the caller.
    """
    if attributes is None:
        return None

    sanitized: dict[str, Any] = {}
    for key, value in attributes.items():
        if not isinstance(key, str):
            # Not a usable attribute name; skip it rather than fail span setup.
            continue
        if _is_sensitive_span_attribute_name(key):
            continue
        sanitized[key] = value
    return sanitized
getattr(resource, "shutdown", None) + if shutdown is None: + return + try: + shutdown() + except Exception: + logger.debug("Ignoring OpenTelemetry cleanup failure", exc_info=True) + + +def _is_sensitive_span_attribute_name(key: str) -> bool: + tokens = _attribute_name_tokens(key) + token_set = set(tokens) + compact = "".join(tokens) + + if token_set & _SENSITIVE_ATTRIBUTE_TOKENS: + return True + if "api" in token_set and "key" in token_set: + return True + if "apikey" in compact: + return True + if compact in {"body", "payload", "content", "filebytes"}: + return True + if "file" in token_set and "bytes" in token_set: + return True + if "request" in token_set and token_set & _RAW_CONTENT_TOKENS: + return True + if "raw" in token_set and (token_set & _RAW_CONTENT_TOKENS or "bytes" in token_set): + return True + if token_set & _RAW_CONTENT_TOKENS and not token_set & _BENIGN_CONTENT_METADATA_TOKENS: + return True + return False + + +def _attribute_name_tokens(key: str) -> list[str]: + camel_split = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", key) + return [token for token in re.split(r"[^A-Za-z0-9]+", camel_split.lower()) if token] diff --git a/nemo_retriever/src/nemo_retriever/service_ingestor.py b/nemo_retriever/src/nemo_retriever/service_ingestor.py index cd0e3783c8..a3954b7609 100644 --- a/nemo_retriever/src/nemo_retriever/service_ingestor.py +++ b/nemo_retriever/src/nemo_retriever/service_ingestor.py @@ -16,7 +16,7 @@ finished, returns a :class:`ServiceIngestResult` (a ``list`` subclass holding per-document completion events, plus ``job_id`` / ``failures`` / ``document_ids`` / ``document_filenames`` / ``elapsed_s`` / - ``job_status`` / ``dataframe`` + ``job_status`` / ``trace_id`` / ``dataframe`` attributes). By default ``return_results=True`` fetches each completed document's rows from the status endpoint into ``result.dataframe``. Each @@ -131,6 +131,10 @@ class ServiceIngestResult(list): lifecycle event was observed during the run. 
def _source(name: str) -> str:
    """Read the file ``name`` under ``STATIC_VIEWS`` and return it as UTF-8 text."""
    view_path = STATIC_VIEWS / name
    return view_path.read_text(encoding="utf-8")
def _helm_template_cmd(
    extra_sets: list[str] | None = None,
    extra_args: list[str] | None = None,
    release_name: str = RELEASE,
) -> list[str]:
    """Build the ``helm template`` argv used by these rendering tests.

    Args:
        extra_sets: Additional ``key=value`` overrides; each is passed with
            its own ``--set`` after the baseline overrides.
        extra_args: Raw arguments appended verbatim at the end.
        release_name: Release name handed to ``helm template``.

    Returns:
        The argument list, starting with the resolved ``helm`` path.

    Raises:
        SkipTest: When no ``helm`` executable is found on ``PATH``.
    """
    helm_binary = shutil.which("helm")
    if helm_binary is None:
        raise SkipTest("`helm` binary not available in this environment.")

    # Overrides applied to every render, in this order.
    baseline_sets = (
        "ngcImagePullSecret.create=false",
        "ngcApiSecret.create=false",
        "nimOperator.rerankqa.enabled=true",
        "nimOperator.audio.enabled=true",
        "nimOperator.nemotron_parse.enabled=true",
        "nimOperator.nemotron_3_nano_omni_30b_a3b_reasoning.enabled=true",
    )

    chart_dir = _repo_root() / "nemo_retriever/helm"
    command = [helm_binary, "template", release_name, str(chart_dir)]
    for assignment in baseline_sets:
        command += ["--set", assignment]
    command += ["--api-versions", "apps.nvidia.com/v1alpha1"]
    for assignment in extra_sets or []:
        command += ["--set", assignment]
    command += list(extra_args or [])
    return command
_nim_env(doc: dict) -> list[dict]: + return doc["spec"]["env"] + + +def _env_values(env: list[dict]) -> dict[str, str]: + return {item["name"]: item.get("value") for item in env} + + +def _assert_unique_env_names(env: list[dict]) -> None: + names = [item["name"] for item in env] + assert len(names) == len(set(names)) + + +def _container_names(doc: dict) -> set[str]: + return {container.get("name") for container in doc["spec"]["template"]["spec"].get("containers", [])} + + +def _find_deployment_by_container(docs: list[dict], container_name: str) -> dict: + for doc in docs: + if doc.get("kind") == "Deployment" and container_name in _container_names(doc): + return doc + raise AssertionError(f"Rendered Deployment with container {container_name!r} not found.") + + +def _find_service_by_port_name(docs: list[dict], port_name: str) -> dict: + for doc in docs: + if doc.get("kind") != "Service": + continue + port_names = {port.get("name") for port in doc.get("spec", {}).get("ports", [])} + if port_name in port_names: + return doc + raise AssertionError(f"Rendered Service with port {port_name!r} not found.") + + +def _find_configmap_with_key(docs: list[dict], key: str) -> dict: + for doc in docs: + if doc.get("kind") == "ConfigMap" and key in doc.get("data", {}): + return doc + raise AssertionError(f"Rendered ConfigMap with key {key!r} not found.") + + +def _find_by_component(docs: list[dict], kind: str, component: str) -> dict: + for doc in docs: + if ( + doc.get("kind") == kind + and doc.get("metadata", {}).get("labels", {}).get("app.kubernetes.io/component") == component + ): + return doc + raise AssertionError(f"Rendered {kind} with component {component!r} not found.") + + +def test_long_release_preserves_tracing_name_suffixes_and_references() -> None: + long_release = "a" * 53 + long_fullname = "b" * 63 + docs = _helm_template( + SERVICE_OTEL_ENABLED, + release_name=long_release, + extra_args=["--set-string", f"fullnameOverride={long_fullname}"], + ) + + otel_deployment 
= _find_deployment_by_container(docs, "otel-collector") + zipkin_deployment = _find_deployment_by_container(docs, "zipkin") + service_deployment = _find_deployment_by_container(docs, "nemo-retriever") + otel_service = _find_service_by_port_name(docs, "otlp-grpc") + zipkin_service = _find_by_component(docs, "Service", "zipkin") + otel_config = _find_configmap_with_key(docs, "config.yaml") + + deployment_names = [doc["metadata"]["name"] for doc in docs if doc.get("kind") == "Deployment"] + service_names = [doc["metadata"]["name"] for doc in docs if doc.get("kind") == "Service"] + assert len(deployment_names) == len(set(deployment_names)) + assert len(service_names) == len(set(service_names)) + + otel_name = otel_deployment["metadata"]["name"] + zipkin_name = zipkin_deployment["metadata"]["name"] + relevant_names = { + otel_name, + otel_service["metadata"]["name"], + otel_config["metadata"]["name"], + zipkin_name, + zipkin_service["metadata"]["name"], + } + assert all(len(name) <= 63 for name in relevant_names) + assert otel_name.endswith("-otel") + assert otel_service["metadata"]["name"] == otel_name + assert zipkin_name.endswith("-zipkin") + assert zipkin_service["metadata"]["name"] == zipkin_name + + volume_config_name = otel_deployment["spec"]["template"]["spec"]["volumes"][0]["configMap"]["name"] + assert volume_config_name == otel_config["metadata"]["name"] + + service_env = _env_values(_deployment_env(service_deployment)) + assert service_env["OTEL_EXPORTER_OTLP_ENDPOINT"] == f"http://{otel_name}:4317" + + otel_config_data = yaml.safe_load(otel_config["data"]["config.yaml"]) + assert otel_config_data["exporters"]["zipkin"]["endpoint"] == f"http://{zipkin_name}:9411/api/v2/spans" + + +def test_null_otel_env_maps_render_as_empty_maps() -> None: + docs = _helm_template( + POD_OTEL_ENABLED, + extra_args=[ + "--set-json", + "service.otel.env=null", + "--set-json", + "nimOperator.page_elements.otel=null", + "--set-json", + "nimOperator.otel.env=null", + "--set-json", 
def test_default_omits_managed_service_and_nim_otel_env() -> None:
    """Default values must not inject chart-managed OTel env into pods.

    Renders the chart without extra overrides and checks that neither the
    service Deployment nor any rendered NIMService carries the managed
    OpenTelemetry variables.
    """
    docs = _helm_template()
    service_values = _env_values(_deployment_env(_find(docs, "Deployment", FULLNAME)))
    service_managed_names = {
        "OTEL_EXPORTER_OTLP_ENDPOINT",
        "OTEL_SERVICE_NAME",
        "OTEL_TRACES_EXPORTER",
        "OTEL_METRICS_EXPORTER",
        "OTEL_LOGS_EXPORTER",
        "OTEL_PROPAGATORS",
        "OTEL_RESOURCE_ATTRIBUTES",
        "OTEL_PYTHON_EXCLUDED_URLS",
    }
    nim_managed_names = {
        "NIM_ENABLE_OTEL",
        "NIM_OTEL_EXPORTER_OTLP_ENDPOINT",
        "TRITON_OTEL_URL",
    }

    assert service_managed_names.isdisjoint(service_values)

    nim_docs = [doc for doc in docs if doc.get("kind") == "NIMService"]
    # Without this guard the loop below passes vacuously when the render
    # contains no NIMService at all.
    assert nim_docs, "Expected at least one rendered NIMService to check for managed OTel env."
    for doc in nim_docs:
        assert nim_managed_names.isdisjoint(_env_values(_nim_env(doc)))
"imagePullSecrets[0].name=trace-registry"]) + + for deployment_name in (OTEL_NAME, ZIPKIN_NAME): + pod_spec = _find(docs, "Deployment", deployment_name)["spec"]["template"]["spec"] + + assert pod_spec["imagePullSecrets"] == [ + {"name": "ngc-secret"}, + {"name": "trace-registry"}, + ] + + +def test_zipkin_disabled_omits_zipkin_resources_and_exporter() -> None: + docs = _helm_template(["topology.zipkin.enabled=false"]) + + deployment_names = _names_for_kind(docs, "Deployment") + service_names = _names_for_kind(docs, "Service") + + assert ZIPKIN_NAME not in deployment_names + assert ZIPKIN_NAME not in service_names + assert OTEL_NAME in deployment_names + assert OTEL_NAME in service_names + + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + assert "zipkin" not in config["exporters"] + assert "zipkin" not in config["service"]["pipelines"]["traces"]["exporters"] + assert ZIPKIN_NAME not in _find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"] + + +def test_zipkin_disabled_with_external_endpoint_still_exports_to_zipkin() -> None: + external_endpoint = "http://external-zipkin:9411/api/v2/spans" + docs = _helm_template( + ["topology.zipkin.enabled=false"], + extra_args=[ + "--set-string", + f"topology.zipkin.exporter.endpoint={external_endpoint}", + ], + ) + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + + assert ZIPKIN_NAME not in _names_for_kind(docs, "Deployment") + assert config["exporters"]["zipkin"]["endpoint"] == external_endpoint + assert "zipkin" in config["service"]["pipelines"]["traces"]["exporters"] + + +def test_null_otel_config_fails_with_chart_authored_message() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.otel.config=null"]) + + assert proc.returncode != 0 + assert "topology.otel.config must be a map when topology.otel.enabled=true" in proc.stderr + assert "reflect:" not in proc.stderr + assert "nil pointer" not in 
proc.stderr + + +def test_null_otel_ports_fails_with_chart_authored_message() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.otel.ports=null"]) + + assert proc.returncode != 0 + assert "topology.otel.ports must be a map when topology.otel.enabled=true" in proc.stderr + assert "reflect:" not in proc.stderr + assert "nil pointer" not in proc.stderr + + +def test_null_zipkin_subtree_omits_zipkin_resources_and_exporter() -> None: + docs = _helm_template(extra_args=["--set-json", "topology.zipkin=null"]) + + deployment_names = _names_for_kind(docs, "Deployment") + service_names = _names_for_kind(docs, "Service") + config_text = _find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"] + config = yaml.safe_load(config_text) + + assert ZIPKIN_NAME not in deployment_names + assert ZIPKIN_NAME not in service_names + assert OTEL_NAME in deployment_names + assert OTEL_NAME in service_names + assert "zipkin" not in config["exporters"] + assert "zipkin" not in config["service"]["pipelines"]["traces"]["exporters"] + assert ZIPKIN_NAME not in config_text + + +def test_null_zipkin_exporter_omits_exporter_injection() -> None: + docs = _helm_template(extra_args=["--set-json", "topology.zipkin.exporter=null"]) + + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + + assert ZIPKIN_NAME in _names_for_kind(docs, "Deployment") + assert ZIPKIN_NAME in _names_for_kind(docs, "Service") + assert "zipkin" not in config["exporters"] + assert "zipkin" not in config["service"]["pipelines"]["traces"]["exporters"] + + +def test_null_zipkin_image_fails_with_chart_authored_message() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.zipkin.image=null"]) + + assert proc.returncode != 0 + assert "topology.zipkin.image must be a map when topology.zipkin.enabled=true" in proc.stderr + assert "reflect:" not in proc.stderr + assert "nil pointer" not in proc.stderr + + +def 
test_null_zipkin_port_fails_with_chart_authored_message() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.zipkin.port=null"]) + + assert proc.returncode != 0 + assert "topology.zipkin.port is required when topology.zipkin.enabled=true" in proc.stderr + assert "reflect:" not in proc.stderr + assert "nil pointer" not in proc.stderr + assert f"http://{ZIPKIN_NAME}:/api/v2/spans" not in proc.stdout + assert "port: null" not in proc.stdout + + +def test_zipkin_exporter_injection_preserves_custom_exporter_settings() -> None: + external_endpoint = "http://external-zipkin:9411/api/v2/spans" + docs = _helm_template( + extra_args=[ + "--set-string", + "topology.otel.config.exporters.zipkin.endpoint=http://stale-zipkin:9411/api/v2/spans", + "--set-string", + "topology.otel.config.exporters.zipkin.timeout=15s", + "--set", + "topology.otel.config.exporters.zipkin.sending_queue.enabled=true", + "--set-string", + f"topology.zipkin.exporter.endpoint={external_endpoint}", + ] + ) + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + + zipkin_exporter = config["exporters"]["zipkin"] + assert zipkin_exporter["endpoint"] == external_endpoint + assert zipkin_exporter["timeout"] == "15s" + assert zipkin_exporter["sending_queue"]["enabled"] is True + + +def test_zipkin_deployment_renders_security_contexts() -> None: + docs = _helm_template( + extra_args=[ + "--set-json", + 'topology.zipkin.podSecurityContext={"runAsNonRoot":true,"seccompProfile":{"type":"RuntimeDefault"}}', + "--set-json", + ( + 'topology.zipkin.securityContext={"allowPrivilegeEscalation":false,' + '"readOnlyRootFilesystem":true,"capabilities":{"drop":["ALL"]}}' + ), + ] + ) + deployment = _find(docs, "Deployment", ZIPKIN_NAME) + pod_spec = deployment["spec"]["template"]["spec"] + container = pod_spec["containers"][0] + + assert pod_spec["securityContext"] == { + "runAsNonRoot": True, + "seccompProfile": {"type": "RuntimeDefault"}, + } + assert 
container["securityContext"] == { + "allowPrivilegeEscalation": False, + "readOnlyRootFilesystem": True, + "capabilities": {"drop": ["ALL"]}, + } + + +def test_zipkin_deployment_renders_default_tcp_health_probes() -> None: + docs = _helm_template() + deployment = _find(docs, "Deployment", ZIPKIN_NAME) + container = deployment["spec"]["template"]["spec"]["containers"][0] + + for probe_name in ("livenessProbe", "readinessProbe", "startupProbe"): + assert container[probe_name]["tcpSocket"] == {"port": "http"} + + +def test_zipkin_deployment_omits_disabled_probe() -> None: + docs = _helm_template(["topology.zipkin.readinessProbe.enabled=false"]) + deployment = _find(docs, "Deployment", ZIPKIN_NAME) + container = deployment["spec"]["template"]["spec"]["containers"][0] + + assert "readinessProbe" not in container + assert "livenessProbe" in container + assert "startupProbe" in container + + +def test_zipkin_deployment_allows_null_probe_maps_from_values_file( + tmp_path: Path, +) -> None: + values_path = _write_values_file( + tmp_path, + { + "topology": { + "zipkin": { + "startupProbe": None, + "livenessProbe": None, + "readinessProbe": None, + } + } + }, + ) + docs = _helm_template(extra_args=["--values", str(values_path)]) + deployment = _find(docs, "Deployment", ZIPKIN_NAME) + container = deployment["spec"]["template"]["spec"]["containers"][0] + + assert "startupProbe" not in container + assert "livenessProbe" not in container + assert "readinessProbe" not in container + + +def test_zipkin_deployment_omits_null_resources_from_values_file( + tmp_path: Path, +) -> None: + values_path = _write_values_file(tmp_path, {"topology": {"zipkin": {"resources": None}}}) + docs = _helm_template(extra_args=["--values", str(values_path)]) + deployment = _find(docs, "Deployment", ZIPKIN_NAME) + container = deployment["spec"]["template"]["spec"]["containers"][0] + + assert "resources" not in container + + +def test_zipkin_injection_allows_traces_pipeline_without_processors() -> None: + 
docs = _helm_template( + extra_args=[ + "--set-json", + "topology.otel.config.service.pipelines.traces.processors=[]", + ] + ) + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + traces = config["service"]["pipelines"]["traces"] + + assert traces["receivers"] == ["otlp"] + assert traces["processors"] == [] + assert "zipkin" in traces["exporters"] + assert traces["exporters"].count("zipkin") == 1 + + +def test_zipkin_injection_initializes_missing_trace_exporters() -> None: + docs = _helm_template( + extra_args=[ + "--set-json", + "topology.otel.config.service.pipelines.traces.exporters=null", + ] + ) + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + + assert config["service"]["pipelines"]["traces"]["exporters"] == ["zipkin"] + + +def test_zipkin_injection_allows_empty_trace_component_maps() -> None: + docs = _helm_template( + extra_args=[ + "--set-json", + "topology.otel.config.receivers.custom={}", + "--set-json", + "topology.otel.config.processors.custom={}", + "--set-json", + "topology.otel.config.exporters.custom={}", + "--set-json", + 'topology.otel.config.service.pipelines.traces.receivers=["custom"]', + "--set-json", + 'topology.otel.config.service.pipelines.traces.processors=["custom"]', + "--set-json", + 'topology.otel.config.service.pipelines.traces.exporters=["custom"]', + ] + ) + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + traces = config["service"]["pipelines"]["traces"] + + assert config["receivers"]["custom"] == {} + assert config["processors"]["custom"] == {} + assert config["exporters"]["custom"] == {} + assert traces["receivers"] == ["custom"] + assert traces["processors"] == ["custom"] + assert traces["exporters"] == ["custom", "zipkin"] + + +def test_zipkin_injection_requires_existing_traces_pipeline() -> None: + proc = _helm_template_process(extra_args=["--set-json", 
"topology.otel.config.service.pipelines.traces=null"]) + + assert proc.returncode != 0 + assert ( + "topology.zipkin.exporter.enabled requires topology.otel.config.service.pipelines.traces " + "with non-empty receivers; provide that traces pipeline or set topology.zipkin.exporter.enabled=false" + ) in proc.stderr + + +def test_zipkin_injection_requires_non_empty_trace_receivers() -> None: + expected = ( + "topology.zipkin.exporter.enabled requires topology.otel.config.service.pipelines.traces " + "with non-empty receivers; provide that traces pipeline or set topology.zipkin.exporter.enabled=false" + ) + + for receivers_override in ( + "topology.otel.config.service.pipelines.traces.receivers=null", + "topology.otel.config.service.pipelines.traces.receivers=[]", + ): + proc = _helm_template_process(extra_args=["--set-json", receivers_override]) + + assert proc.returncode != 0 + assert expected in proc.stderr + + +def test_zipkin_injection_requires_referenced_receiver_to_exist() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.otel.config.receivers.otlp=null"]) + + assert proc.returncode != 0 + assert 'trace receiver "otlp" is missing or null' in proc.stderr + assert "fix topology.otel.config or set topology.zipkin.exporter.enabled=false" in proc.stderr + + +def test_zipkin_injection_requires_referenced_processor_to_exist() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.otel.config.processors.batch=null"]) + + assert proc.returncode != 0 + assert 'trace processor "batch" is missing or null' in proc.stderr + assert "fix topology.otel.config or set topology.zipkin.exporter.enabled=false" in proc.stderr + + +def test_zipkin_injection_requires_referenced_exporter_to_exist() -> None: + proc = _helm_template_process(extra_args=["--set-json", "topology.otel.config.exporters.debug=null"]) + + assert proc.returncode != 0 + assert 'trace exporter "debug" is missing or null' in proc.stderr + assert "fix 
topology.otel.config or set topology.zipkin.exporter.enabled=false" in proc.stderr + + +def test_otel_config_exports_traces_to_rendered_zipkin_endpoint() -> None: + docs = _helm_template() + config = yaml.safe_load(_find(docs, "ConfigMap", OTEL_CONFIG_NAME)["data"]["config.yaml"]) + + assert config["exporters"]["zipkin"]["endpoint"] == f"http://{ZIPKIN_NAME}:9411/api/v2/spans" + trace_exporters = config["service"]["pipelines"]["traces"]["exporters"] + assert "zipkin" in trace_exporters + assert trace_exporters.count("zipkin") == 1 + + +def test_standalone_service_gets_otel_env_without_duplicate_user_overrides() -> None: + docs = _helm_template( + SERVICE_OTEL_ENABLED + + [ + "service.env[0].name=OTEL_SERVICE_NAME", + "service.env[0].value=user-service-name", + ] + ) + env = _deployment_env(_find(docs, "Deployment", FULLNAME)) + values = _env_values(env) + + _assert_unique_env_names(env) + assert values["OTEL_EXPORTER_OTLP_ENDPOINT"] == f"http://{OTEL_NAME}:4317" + assert values["OTEL_SERVICE_NAME"] == "user-service-name" + assert values["OTEL_TRACES_EXPORTER"] == "otlp" + assert values["OTEL_METRICS_EXPORTER"] == "otlp" + assert values["OTEL_LOGS_EXPORTER"] == "none" + assert values["OTEL_PROPAGATORS"] == "tracecontext,baggage" + assert values["OTEL_RESOURCE_ATTRIBUTES"] == "service.namespace=nemo-retriever,service.role=standalone" + assert values["OTEL_PYTHON_EXCLUDED_URLS"] == "health" + + +def test_split_roles_get_otel_env() -> None: + docs = _helm_template(SERVICE_OTEL_ENABLED + ["topology.mode=split"]) + + for role in ("gateway", "realtime", "batch"): + env = _deployment_env(_find(docs, "Deployment", f"{FULLNAME}-{role}")) + values = _env_values(env) + + _assert_unique_env_names(env) + assert values["OTEL_EXPORTER_OTLP_ENDPOINT"] == f"http://{OTEL_NAME}:4317" + assert values["OTEL_SERVICE_NAME"] == "nemo-retriever-service" + assert values["OTEL_RESOURCE_ATTRIBUTES"] == f"service.namespace=nemo-retriever,service.role={role}" + + +def 
test_all_enabled_nimservices_inherit_otel_env() -> None: + docs = _helm_template(NIM_OTEL_ENABLED) + nimservices = [doc for doc in docs if doc.get("kind") == "NIMService"] + + assert {doc["metadata"]["name"] for doc in nimservices} == NIMSERVICE_NAMES + for doc in nimservices: + name = doc["metadata"]["name"] + env = _nim_env(doc) + values = _env_values(env) + + _assert_unique_env_names(env) + assert values["NIM_ENABLE_OTEL"] == "true" + assert values["NIM_OTEL_SERVICE_NAME"] == name + assert values["NIM_OTEL_TRACES_EXPORTER"] == "otlp" + assert values["NIM_OTEL_METRICS_EXPORTER"] == "console" + assert values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == f"http://{OTEL_NAME}:4318" + assert values["TRITON_OTEL_URL"] == f"http://{OTEL_NAME}:4318/v1/traces" + assert values["TRITON_OTEL_RATE"] == "1" + + +def test_chart_wide_nim_otel_disable_omits_managed_env() -> None: + docs = _helm_template(["nimOperator.otel.enabled=false"]) + nimservices = [doc for doc in docs if doc.get("kind") == "NIMService"] + chart_managed_names = { + "NIM_ENABLE_OTEL", + "NIM_OTEL_EXPORTER_OTLP_ENDPOINT", + "TRITON_OTEL_URL", + } + + assert {doc["metadata"]["name"] for doc in nimservices} == NIMSERVICE_NAMES + for doc in nimservices: + env = _nim_env(doc) + values = _env_values(env) + + _assert_unique_env_names(env) + assert chart_managed_names.isdisjoint(values) + + table_values = _env_values(_nim_env(_find(docs, "NIMService", "nemotron-table-structure-v1"))) + assert table_values["NIM_TRITON_CUDA_MEMORY_POOL_MB"] == "2048" + + +def test_per_nim_otel_endpoint_overrides_chart_endpoint() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + + [ + "nimOperator.otel.endpoint=http://chart-otel:4318", + "nimOperator.rerankqa.otel.endpoint=http://per-nim-otel:4318", + ] + ) + + page_elements = _find(docs, "NIMService", "nemotron-page-elements-v3") + page_values = _env_values(_nim_env(page_elements)) + assert page_values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://chart-otel:4318" + assert 
page_values["TRITON_OTEL_URL"] == "http://chart-otel:4318/v1/traces" + + rerank = _find(docs, "NIMService", "llama-nemotron-rerank-vl-1b-v2") + rerank_values = _env_values(_nim_env(rerank)) + assert rerank_values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://per-nim-otel:4318" + assert rerank_values["TRITON_OTEL_URL"] == "http://per-nim-otel:4318/v1/traces" + + +def test_chart_nim_otel_env_endpoint_drives_triton_url() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + ["nimOperator.otel.env.NIM_OTEL_EXPORTER_OTLP_ENDPOINT=http://env-otel:4318"] + ) + + page_elements = _find(docs, "NIMService", "nemotron-page-elements-v3") + page_values = _env_values(_nim_env(page_elements)) + + assert page_values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://env-otel:4318" + assert page_values["TRITON_OTEL_URL"] == "http://env-otel:4318/v1/traces" + + +def test_per_nim_otel_env_endpoint_drives_triton_url() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + ["nimOperator.rerankqa.otel.env.NIM_OTEL_EXPORTER_OTLP_ENDPOINT=http://per-env-otel:4318"] + ) + + rerank = _find(docs, "NIMService", "llama-nemotron-rerank-vl-1b-v2") + rerank_values = _env_values(_nim_env(rerank)) + + assert rerank_values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://per-env-otel:4318" + assert rerank_values["TRITON_OTEL_URL"] == "http://per-env-otel:4318/v1/traces" + + +def test_nim_otel_env_triton_url_override_is_preserved() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + + [ + "nimOperator.otel.env.NIM_OTEL_EXPORTER_OTLP_ENDPOINT=http://env-otel:4318", + "nimOperator.otel.env.TRITON_OTEL_URL=http://explicit-triton/v1/traces", + ] + ) + + page_elements = _find(docs, "NIMService", "nemotron-page-elements-v3") + page_values = _env_values(_nim_env(page_elements)) + + assert page_values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://env-otel:4318" + assert page_values["TRITON_OTEL_URL"] == "http://explicit-triton/v1/traces" + + +def 
test_existing_nim_env_endpoint_drives_triton_url_without_duplicate_endpoint() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + + [ + "nimOperator.rerankqa.env[0].name=NIM_OTEL_EXPORTER_OTLP_ENDPOINT", + "nimOperator.rerankqa.env[0].value=http://manual-otel:4318", + ] + ) + + rerank = _find(docs, "NIMService", "llama-nemotron-rerank-vl-1b-v2") + env = _nim_env(rerank) + values = _env_values(env) + + _assert_unique_env_names(env) + assert values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://manual-otel:4318" + assert values["TRITON_OTEL_URL"] == "http://manual-otel:4318/v1/traces" + + +def test_existing_nim_env_endpoint_value_from_omits_chart_managed_triton_url() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + + [ + "nimOperator.page_elements.env[0].name=NIM_OTEL_EXPORTER_OTLP_ENDPOINT", + "nimOperator.page_elements.env[0].valueFrom.secretKeyRef.name=otel-endpoint", + "nimOperator.page_elements.env[0].valueFrom.secretKeyRef.key=endpoint", + ] + ) + + page_elements = _find(docs, "NIMService", "nemotron-page-elements-v3") + env = _nim_env(page_elements) + values = _env_values(env) + + _assert_unique_env_names(env) + assert values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] is None + assert next(item for item in env if item["name"] == "NIM_OTEL_EXPORTER_OTLP_ENDPOINT")["valueFrom"] == { + "secretKeyRef": {"name": "otel-endpoint", "key": "endpoint"} + } + assert "TRITON_OTEL_URL" not in values + + +def test_existing_nim_env_triton_url_override_is_preserved() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + + [ + "nimOperator.rerankqa.env[0].name=NIM_OTEL_EXPORTER_OTLP_ENDPOINT", + "nimOperator.rerankqa.env[0].value=http://manual-otel:4318", + "nimOperator.rerankqa.env[1].name=TRITON_OTEL_URL", + "nimOperator.rerankqa.env[1].value=http://manual-triton/v1/traces", + ] + ) + + rerank = _find(docs, "NIMService", "llama-nemotron-rerank-vl-1b-v2") + env = _nim_env(rerank) + values = _env_values(env) + + _assert_unique_env_names(env) + assert 
values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://manual-otel:4318" + assert values["TRITON_OTEL_URL"] == "http://manual-triton/v1/traces" + + +def test_per_nim_otel_opt_out_and_override() -> None: + docs = _helm_template( + NIM_OTEL_ENABLED + + [ + "nimOperator.page_elements.otel.enabled=false", + "nimOperator.otel.endpoint=http://custom-otel:4318", + "nimOperator.rerankqa.otel.serviceName=custom-rerank", + "nimOperator.rerankqa.otel.env.NIM_OTEL_METRICS_EXPORTER=none", + "nimOperator.rerankqa.env[0].name=NIM_OTEL_TRACES_EXPORTER", + "nimOperator.rerankqa.env[0].value=zipkin", + ] + ) + + page_elements = _find(docs, "NIMService", "nemotron-page-elements-v3") + assert "NIM_ENABLE_OTEL" not in _env_values(_nim_env(page_elements)) + + rerank = _find(docs, "NIMService", "llama-nemotron-rerank-vl-1b-v2") + env = _nim_env(rerank) + values = _env_values(env) + + _assert_unique_env_names(env) + assert values["NIM_OTEL_SERVICE_NAME"] == "custom-rerank" + assert values["NIM_OTEL_TRACES_EXPORTER"] == "zipkin" + assert values["NIM_OTEL_METRICS_EXPORTER"] == "none" + assert values["NIM_OTEL_EXPORTER_OTLP_ENDPOINT"] == "http://custom-otel:4318" + assert values["TRITON_OTEL_URL"] == "http://custom-otel:4318/v1/traces" diff --git a/nemo_retriever/tests/test_nim_tracing.py b/nemo_retriever/tests/test_nim_tracing.py new file mode 100644 index 0000000000..a6525772c8 --- /dev/null +++ b/nemo_retriever/tests/test_nim_tracing.py @@ -0,0 +1,492 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for OpenTelemetry propagation in NIM HTTP clients.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest +import requests +from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExportResult +from opentelemetry.trace import StatusCode + +from nemo_retriever.api.internal.primitives.nim.nim_client import NimClient as InternalNimClient +from nemo_retriever.nim.nim import NIMClient as HttpNIMClient +from nemo_retriever.nim.nim import _post_with_retries +from nemo_retriever.service import tracing + + +class _CollectingExporter: + def __init__(self, exported: list[Any]) -> None: + self._exported = exported + + def export(self, spans: Any) -> SpanExportResult: + self._exported.extend(spans) + return SpanExportResult.SUCCESS + + def shutdown(self) -> None: + return None + + def force_flush(self, timeout_millis: int = 30_000) -> bool: + return True + + +@pytest.fixture(autouse=True) +def reset_tracing() -> None: + tracing._reset_tracing_for_tests() + yield + tracing._reset_tracing_for_tests() + + +@pytest.fixture +def exported_spans(monkeypatch: pytest.MonkeyPatch) -> list[Any]: + exported: list[Any] = [] + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + assert tracing.configure_tracing(service_role="standalone") is True + return exported + + +def test_post_with_retries_injects_trace_context_and_emits_safe_span( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + captured_headers: list[dict[str, str]] = [] + + class _Response: + status_code = 200 + text = '{"ok": true}' + + def raise_for_status(self) -> None: + return None + + def 
json(self) -> dict[str, bool]: + return {"ok": True} + + def _post(*args: Any, headers: dict[str, str], **kwargs: Any) -> _Response: + captured_headers.append(dict(headers)) + return _Response() + + monkeypatch.setattr("nemo_retriever.nim.nim.requests.post", _post) + original_headers = { + "Accept": "application/json", + "Authorization": "Bearer secret-token", + } + payload = {"input": [{"secret": "request-body"}]} + + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + result = _post_with_retries( + invoke_url="http://nim.example/v1/infer", + payload=payload, + headers=original_headers, + timeout_s=10, + max_retries=1, + max_429_retries=1, + ) + + assert result == {"ok": True} + assert parent_trace_id is not None + assert captured_headers + assert "traceparent" in captured_headers[0] + assert captured_headers[0]["Authorization"] == "Bearer secret-token" + assert "traceparent" not in original_headers + + spans_by_name = {span.name: span for span in exported_spans} + assert "nim.http.post" in spans_by_name + span = spans_by_name["nim.http.post"] + assert f"{span.context.trace_id:032x}" == parent_trace_id + + attrs = dict(span.attributes) + assert attrs["http.method"] == "POST" + assert attrs["nim.endpoint"] == "http://nim.example/v1/infer" + assert attrs["retry.attempt"] == 0 + assert attrs["http.status_code"] == 200 + assert "Authorization" not in attrs + assert "request.body" not in attrs + assert "payload" not in attrs + + +class _PrimitiveModelInterface: + def name(self) -> str: + return "page-elements" + + def prepare_data_for_inference(self, data: Any) -> Any: + return data + + def format_input(self, data: Any, **kwargs: Any) -> tuple[list[Any], list[dict[str, Any]]]: + return [data], [{"original_image_shapes": None}] + + def parse_output(self, response: Any, **kwargs: Any) -> Any: + return response + + def process_inference_results(self, parsed_output: Any, **kwargs: Any) -> Any: + return parsed_output + + +def 
_span_by_name(exported_spans: list[Any], name: str) -> Any: + return next(span for span in exported_spans if span.name == name) + + +def test_internal_nim_client_http_injects_trace_context_and_emits_infer_span( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + captured_headers: list[dict[str, str]] = [] + + class _Response: + status_code = 200 + reason = "OK" + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, bool]: + return {"ok": True} + + def _post(*args: Any, headers: dict[str, str], **kwargs: Any) -> _Response: + captured_headers.append(dict(headers)) + return _Response() + + monkeypatch.setattr("nemo_retriever.api.internal.primitives.nim.nim_client.requests.post", _post) + + client = InternalNimClient( + model_interface=_PrimitiveModelInterface(), + protocol="http", + endpoints=("", "http://nim.example/v1/infer"), + auth_token="secret-token", + max_retries=1, + max_429_retries=1, + ) + + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + parsed_output, batch_data = client._process_batch( + {"request": "body"}, + batch_data={"batch": 1}, + model_name="detector", + ) + + assert parsed_output == {"ok": True} + assert batch_data == {"batch": 1} + assert parent_trace_id is not None + assert captured_headers + assert "traceparent" in captured_headers[0] + assert captured_headers[0]["Authorization"] == "Bearer secret-token" + assert "traceparent" not in client.headers + + span = _span_by_name(exported_spans, "nim.infer") + assert f"{span.context.trace_id:032x}" == parent_trace_id + assert span.attributes["nim.model"] == "detector" + assert span.attributes["nim.protocol"] == "http" + assert span.attributes["nim.service"] == "page-elements" + assert "Authorization" not in span.attributes + assert "request.body" not in span.attributes + + +def test_internal_nim_client_grpc_passes_trace_context_headers_when_supported( + monkeypatch: pytest.MonkeyPatch, + 
exported_spans: list[Any], +) -> None: + captured_headers: list[dict[str, str]] = [] + + class _InferInput: + def __init__(self, name: str, shape: Any, datatype: str) -> None: + self.name = name + self.shape = shape + self.datatype = datatype + + def set_data_from_numpy(self, value: Any) -> None: + self.value = value + + class _InferRequestedOutput: + def __init__(self, name: str) -> None: + self._name = name + + def name(self) -> str: + return self._name + + class _Response: + def as_numpy(self, name: str) -> np.ndarray: + return np.array([[1.0]], dtype=np.float32) + + class _InferenceServerException(Exception): + def status(self) -> str: + return "StatusCode.UNKNOWN" + + def message(self) -> str: + return str(self) + + class _FakeGrpcClient: + def infer(self, *, headers: dict[str, str] | None = None, **kwargs: Any) -> _Response: + captured_headers.append(dict(headers or {})) + return _Response() + + def close(self) -> None: + return None + + class _GrpcModule: + InferenceServerException = _InferenceServerException + InferInput = _InferInput + InferRequestedOutput = _InferRequestedOutput + + class InferenceServerClient: + def __init__(self, url: str) -> None: + self._client = _FakeGrpcClient() + + def infer(self, **kwargs: Any) -> _Response: + return self._client.infer(**kwargs) + + def close(self) -> None: + return None + + monkeypatch.setattr("nemo_retriever.api.internal.primitives.nim.nim_client._triton_grpc", lambda: _GrpcModule) + + client = InternalNimClient( + model_interface=_PrimitiveModelInterface(), + protocol="grpc", + endpoints=("nim-grpc:8001", ""), + max_retries=1, + max_429_retries=1, + ) + + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + parsed_output, batch_data = client._process_batch( + np.array([[1.0]], dtype=np.float32), + batch_data={"batch": 2}, + model_name="detector-grpc", + ) + + assert isinstance(parsed_output, np.ndarray) + assert batch_data == {"batch": 2} + assert parent_trace_id is not 
None + assert captured_headers + assert "traceparent" in captured_headers[0] + + span = _span_by_name(exported_spans, "nim.infer") + assert f"{span.context.trace_id:032x}" == parent_trace_id + assert span.attributes["nim.model"] == "detector-grpc" + assert span.attributes["nim.protocol"] == "grpc" + assert span.attributes["nim.service"] == "page-elements" + + +def test_public_nim_client_executor_path_preserves_parent_trace_context( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + captured_headers: list[dict[str, str]] = [] + + class _Response: + status_code = 200 + text = "{}" + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, list[dict[str, bool]]]: + return {"data": [{"ok": True}]} + + def _post(*args: Any, headers: dict[str, str], **kwargs: Any) -> _Response: + captured_headers.append(dict(headers)) + return _Response() + + monkeypatch.setattr("nemo_retriever.nim.nim.requests.post", _post) + client = HttpNIMClient(max_pool_workers=1) + try: + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + result = client.invoke_image_inference_batches( + invoke_url="http://nim.example/v1/infer", + image_b64_list=["iVBORw0KGgo="], + max_batch_size=1, + max_retries=1, + max_429_retries=1, + ) + finally: + client.shutdown() + + assert result == [{"ok": True}] + assert parent_trace_id is not None + assert captured_headers + assert "traceparent" in captured_headers[0] + + span = _span_by_name(exported_spans, "nim.http.post") + assert f"{span.context.trace_id:032x}" == parent_trace_id + + +def test_internal_nim_client_infer_executor_path_preserves_parent_trace_context( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + captured_headers: list[dict[str, str]] = [] + + class _Response: + status_code = 200 + reason = "OK" + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, bool]: + return {"ok": True} + + def 
_post(*args: Any, headers: dict[str, str], **kwargs: Any) -> _Response: + captured_headers.append(dict(headers)) + return _Response() + + monkeypatch.setattr("nemo_retriever.api.internal.primitives.nim.nim_client.requests.post", _post) + client = InternalNimClient( + model_interface=_PrimitiveModelInterface(), + protocol="http", + endpoints=("", "http://nim.example/v1/infer"), + max_retries=1, + max_429_retries=1, + ) + + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + result = client.infer({"request": "body"}, model_name="detector", max_pool_workers=1) + + assert result == [{"ok": True}] + assert parent_trace_id is not None + assert captured_headers + assert "traceparent" in captured_headers[0] + + span = _span_by_name(exported_spans, "nim.infer") + assert f"{span.context.trace_id:032x}" == parent_trace_id + + +def test_public_nim_client_chat_executor_path_preserves_parent_trace_context( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + captured_headers: list[dict[str, str]] = [] + + class _Response: + status_code = 200 + text = "{}" + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, list[dict[str, dict[str, str]]]]: + return {"choices": [{"message": {"content": "hello"}}]} + + def _post(*args: Any, headers: dict[str, str], **kwargs: Any) -> _Response: + captured_headers.append(dict(headers)) + return _Response() + + monkeypatch.setattr("nemo_retriever.nim.nim.requests.post", _post) + client = HttpNIMClient(max_pool_workers=1) + try: + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + result = client.invoke_chat_completions( + invoke_url="http://nim.example/v1/chat/completions", + messages_list=[[{"role": "user", "content": "hi"}]], + max_retries=1, + max_429_retries=1, + ) + finally: + client.shutdown() + + assert result == ["hello"] + assert parent_trace_id is not None + assert captured_headers + assert 
"traceparent" in captured_headers[0] + + span = _span_by_name(exported_spans, "nim.http.post") + assert f"{span.context.trace_id:032x}" == parent_trace_id + + +def test_internal_dynamic_batching_keeps_different_trace_contexts_in_separate_batches( + exported_spans: list[Any], +) -> None: + del exported_spans + client = InternalNimClient( + model_interface=_PrimitiveModelInterface(), + protocol="http", + endpoints=("", "http://nim.example/v1/infer"), + enable_dynamic_batching=True, + dynamic_batch_timeout=0.01, + ) + client._batch_size = 2 + captured_batches: list[list[Any]] = [] + + def _capture_batch(requests: list[Any]) -> None: + captured_batches.append(list(requests)) + client._stop_event.set() + + client._process_dynamic_batch = _capture_batch # type: ignore[method-assign] + + with tracing.start_span("test.parent.one"): + first_trace_id = tracing.current_trace_id_hex() + first_future = client.submit("one", "detector", (1, 1)) + with tracing.start_span("test.parent.two"): + second_trace_id = tracing.current_trace_id_hex() + second_future = client.submit("two", "detector", (1, 1)) + + assert first_trace_id is not None and second_trace_id is not None + assert first_trace_id != second_trace_id + + client._batcher_loop() + + assert len(captured_batches) == 1 + assert [request.data for request in captured_batches[0]] == ["one"] + assert client._request_queue.qsize() == 1 + assert first_future.done() is False + assert second_future.done() is False + + +def test_post_with_retries_marks_final_500_span_as_error_without_secrets( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + class _Response: + status_code = 500 + text = "server body with secret-token" + + def raise_for_status(self) -> None: + raise requests.HTTPError("HTTP 500 from upstream", response=self) + + def json(self) -> dict[str, bool]: + return {"ok": False} + + def _post(*args: Any, **kwargs: Any) -> _Response: + return _Response() + + 
monkeypatch.setattr("nemo_retriever.nim.nim.requests.post", _post) + + with pytest.raises(requests.HTTPError): + _post_with_retries( + invoke_url="http://nim.example/v1/infer?api_key=secret-token", + payload={ + "input": [ + {"Authorization": "Bearer secret-token", "secret": "payload"}, + ], + }, + headers={"Authorization": "Bearer secret-token"}, + timeout_s=10, + max_retries=1, + max_429_retries=1, + ) + + span = _span_by_name(exported_spans, "nim.http.post") + attrs = dict(span.attributes) + assert attrs["http.status_code"] == 500 + assert span.status.status_code == StatusCode.ERROR or attrs.get("error.type") == "HTTP 500" + attr_text = repr(attrs) + assert "secret-token" not in attr_text + assert "Bearer" not in attr_text + assert "payload" not in attr_text + assert "server body" not in attr_text diff --git a/nemo_retriever/tests/test_service_client_compat.py b/nemo_retriever/tests/test_service_client_compat.py index 2064ca6b29..b0394653b0 100644 --- a/nemo_retriever/tests/test_service_client_compat.py +++ b/nemo_retriever/tests/test_service_client_compat.py @@ -39,6 +39,7 @@ from __future__ import annotations import asyncio +from types import SimpleNamespace from pathlib import Path from typing import Iterator @@ -201,15 +202,44 @@ def test_create_job_success_returns_job_id() -> None: """Smoke test: a healthy 201 response is unaffected by the compat logic.""" def _handler(request: httpx.Request) -> httpx.Response: - return httpx.Response(201, json={"job_id": "JOB-1"}) + return httpx.Response(201, json={"job_id": "JOB-1", "trace_id": "trace-123"}) rc = RetrieverServiceClient(base_url="http://nrl:7670") - async def _call() -> str: + async def _call(): async with httpx.AsyncClient(transport=_make_transport(_handler)) as client: return await rc._create_job(client, expected_documents=1) - assert _run_async(_call()) == "JOB-1" + created = _run_async(_call()) + assert created.job_id == "JOB-1" + assert created.trace_id == "trace-123" + + +def 
test_stream_job_created_event_includes_trace_id(monkeypatch: pytest.MonkeyPatch) -> None: + """The public streaming path preserves the trace id returned by job creation.""" + + rc = RetrieverServiceClient(base_url="http://nrl:7670") + + async def _create_job(*args, **kwargs): + return SimpleNamespace(job_id="JOB-1", trace_id="trace-123") + + async def _consume_sse(_client, _pending, uploads_done, _tracker, **_kwargs): + uploads_done.set() + + monkeypatch.setattr(rc, "_create_job", _create_job) + monkeypatch.setattr(rc, "_consume_sse", _consume_sse) + + async def _first_event() -> dict[str, object]: + async for event in rc.aingest_documents_stream([]): + return event + raise AssertionError("stream produced no events") + + assert _run_async(_first_event()) == { + "event": "job_created", + "job_id": "JOB-1", + "expected_documents": 0, + "trace_id": "trace-123", + } # ---------------------------------------------------------------------- diff --git a/nemo_retriever/tests/test_service_ingest_async.py b/nemo_retriever/tests/test_service_ingest_async.py index d225ebebb6..aea357e209 100644 --- a/nemo_retriever/tests/test_service_ingest_async.py +++ b/nemo_retriever/tests/test_service_ingest_async.py @@ -33,7 +33,12 @@ def _stub_event_sequence() -> list[dict[str, Any]]: return [ - {"event": "job_created", "job_id": "JOB-1", "expected_documents": 2}, + { + "event": "job_created", + "job_id": "JOB-1", + "expected_documents": 2, + "trace_id": "trace-123", + }, {"event": "upload_complete", "filename": "a.pdf", "document_id": "doc-a"}, {"event": "upload_complete", "filename": "b.pdf", "document_id": "doc-b"}, { @@ -113,6 +118,12 @@ def test_ingest_default_returns_service_ingest_result(stub_ingestor: ServiceInge assert result.dataframe.iloc[0]["text"] == "content-doc-a" +def test_ingest_default_exposes_trace_id_from_job_created(stub_ingestor: ServiceIngestor) -> None: + result = stub_ingestor.ingest() + assert isinstance(result, ServiceIngestResult) + assert result.trace_id == 
"trace-123" + + def test_ingest_return_failures_returns_tuple(stub_ingestor: ServiceIngestor) -> None: result, failures = stub_ingestor.ingest(return_failures=True) assert isinstance(result, ServiceIngestResult) @@ -126,6 +137,7 @@ def test_ingest_return_traces_returns_tuple_with_all_events(stub_ingestor: Servi assert isinstance(result, ServiceIngestResult) assert isinstance(traces, list) assert traces == _stub_event_sequence() + assert traces[0]["trace_id"] == "trace-123" def test_ingest_both_flags_returns_three_tuple(stub_ingestor: ServiceIngestor) -> None: diff --git a/nemo_retriever/tests/test_service_ingest_router.py b/nemo_retriever/tests/test_service_ingest_router.py index 28ff7932eb..fd62aee25f 100644 --- a/nemo_retriever/tests/test_service_ingest_router.py +++ b/nemo_retriever/tests/test_service_ingest_router.py @@ -19,10 +19,12 @@ from __future__ import annotations import json +import re from typing import Any import pytest from fastapi.testclient import TestClient +from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExportResult from nemo_retriever.service.app import create_app from nemo_retriever.service.config import ( @@ -30,6 +32,7 @@ PipelinePoolConfig, ServiceConfig, ) +from nemo_retriever.service import tracing from nemo_retriever.service.services.pipeline_pool import WorkItem from .conftest import create_test_job @@ -39,6 +42,21 @@ def captured_items() -> list[WorkItem]: return [] +class _CollectingExporter: + def __init__(self, exported: list[Any]) -> None: + self._exported = exported + + def export(self, spans: Any) -> SpanExportResult: + self._exported.extend(spans) + return SpanExportResult.SUCCESS + + def shutdown(self) -> None: + return None + + def force_flush(self, timeout_millis: int = 30_000) -> bool: + return True + + @pytest.fixture def app_with_stub_pool(monkeypatch: pytest.MonkeyPatch, captured_items: list[WorkItem]): """Build a standalone-mode app whose pools record items instead of running pipelines.""" @@ -72,11 
+90,92 @@ def _stub_batch(_config: ServiceConfig): yield client +@pytest.fixture +def traced_app_with_stub_pool(monkeypatch: pytest.MonkeyPatch, captured_items: list[WorkItem]): + """Build a traced standalone app with in-memory span export.""" + from nemo_retriever.service.tracing import _reset_tracing_for_tests + + exported: list[Any] = [] + _reset_tracing_for_tests() + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + + async def _stub_work(item: WorkItem) -> tuple[int, list[dict[str, Any]]]: + captured_items.append(item) + return 1, [{"id": item.id, "stub": True}] + + def _stub_realtime(_config: ServiceConfig): + return _stub_work + + def _stub_batch(_config: ServiceConfig): + return _stub_work + + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_realtime_work_fn", + _stub_realtime, + ) + monkeypatch.setattr( + "nemo_retriever.service.services.pipeline_executor.create_batch_work_fn", + _stub_batch, + ) + + cfg = ServiceConfig( + mode="standalone", + pipeline=PipelinePoolConfig(realtime_workers=1, batch_workers=1), + pipeline_overrides=PipelineOverridesConfig(), + ) + try: + app = create_app(cfg) + with TestClient(app) as client: + yield client, exported + finally: + _reset_tracing_for_tests() + + +@pytest.fixture +def traced_gateway_app(monkeypatch: pytest.MonkeyPatch): + """Build a traced gateway app so dashboard job APIs are mounted.""" + from nemo_retriever.service.tracing import _reset_tracing_for_tests + + exported: list[Any] = [] + _reset_tracing_for_tests() + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", 
"http://otel:4317") + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + + cfg = ServiceConfig(mode="gateway") + try: + app = create_app(cfg) + with TestClient(app) as client: + yield client, exported + finally: + _reset_tracing_for_tests() + + def _make_pdf_bytes() -> bytes: """Return a 1-byte non-PDF payload — the worker is stubbed so content doesn't matter.""" return b"%PDF-1.4\n%stub\n" +def _wait_for_items(captured_items: list[WorkItem], count: int) -> None: + import time as _time + + deadline = _time.monotonic() + 5.0 + while len(captured_items) < count and _time.monotonic() < deadline: + _time.sleep(0.05) + + def test_ingest_without_spec_falls_back_to_legacy_pipeline( app_with_stub_pool: TestClient, captured_items: list[WorkItem] ) -> None: @@ -205,6 +304,304 @@ def test_create_job_returns_201_and_aggregate_fields(app_with_stub_pool: TestCli assert body["job_id"] +def test_create_job_succeeds_when_tracing_span_setup_fails( + app_with_stub_pool: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + def _raise(*args: Any, **kwargs: Any) -> Any: + raise RuntimeError("tracer unavailable") + + monkeypatch.setattr("nemo_retriever.service.tracing.get_tracer", _raise) + + resp = app_with_stub_pool.post( + "/v1/ingest/job", + json={"expected_documents": 1, "label": "trace-failure"}, + ) + + assert resp.status_code == 201, resp.text + body = resp.json() + assert body["job_id"] + assert body.get("trace_id") is None + assert "x-trace-id" not in resp.headers + + +def test_create_job_succeeds_when_tracing_span_enter_fails( + app_with_stub_pool: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + class _FailingSpanContext: + def __enter__(self) -> Any: + raise RuntimeError("span enter failed") + + def __exit__(self, exc_type: 
Any, exc: Any, traceback: Any) -> bool: + raise AssertionError("span context should not be entered") + + class _Tracer: + def start_as_current_span(self, name: str, **kwargs: Any) -> Any: + return _FailingSpanContext() + + monkeypatch.setattr("nemo_retriever.service.tracing.get_tracer", lambda: _Tracer()) + + resp = app_with_stub_pool.post( + "/v1/ingest/job", + json={"expected_documents": 1, "label": "trace-enter-failure"}, + ) + + assert resp.status_code == 201, resp.text + body = resp.json() + assert body["job_id"] + assert body.get("trace_id") is None + assert "x-trace-id" not in resp.headers + + +def test_create_job_succeeds_when_trace_context_injection_fails( + app_with_stub_pool: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + def _raise() -> Any: + raise RuntimeError("propagator unavailable") + + monkeypatch.setattr("nemo_retriever.service.tracing.inject_trace_context", _raise) + + resp = app_with_stub_pool.post( + "/v1/ingest/job", + json={"expected_documents": 1, "label": "inject-failure"}, + ) + + assert resp.status_code == 201, resp.text + body = resp.json() + assert body["job_id"] + assert body.get("trace_id") is None + assert "x-trace-id" not in resp.headers + + +def test_document_upload_succeeds_when_accept_trace_context_extraction_fails( + app_with_stub_pool: TestClient, + captured_items: list[WorkItem], + monkeypatch: pytest.MonkeyPatch, +) -> None: + job_id = create_test_job(app_with_stub_pool) + + def _raise(_carrier: dict[str, str]) -> Any: + raise RuntimeError("propagator unavailable") + + monkeypatch.setattr("nemo_retriever.service.tracing.extract_trace_context", _raise) + + resp = app_with_stub_pool.post( + f"/v1/ingest/job/{job_id}/document", + headers={"traceparent": "00-" + "1" * 32 + "-" + "2" * 16 + "-01"}, + files={"file": ("doc.pdf", _make_pdf_bytes(), "application/pdf")}, + data={"metadata": "{}"}, + ) + + assert resp.status_code == 202, resp.text + _wait_for_items(captured_items, 1) + assert captured_items[0].job_id == 
job_id + + +def test_document_upload_succeeds_when_enqueue_trace_context_injection_fails( + app_with_stub_pool: TestClient, + captured_items: list[WorkItem], + monkeypatch: pytest.MonkeyPatch, +) -> None: + job_id = create_test_job(app_with_stub_pool) + + def _raise() -> Any: + raise RuntimeError("propagator unavailable") + + monkeypatch.setattr("nemo_retriever.service.tracing.inject_trace_context", _raise) + + resp = app_with_stub_pool.post( + f"/v1/ingest/job/{job_id}/document", + files={"file": ("doc.pdf", _make_pdf_bytes(), "application/pdf")}, + data={"metadata": "{}"}, + ) + + assert resp.status_code == 202, resp.text + _wait_for_items(captured_items, 1) + assert captured_items[0].job_id == job_id + assert captured_items[0].trace_context == {} + + +def test_create_job_with_tracing_returns_trace_id_body_header_and_snapshot( + traced_app_with_stub_pool: tuple[TestClient, list[Any]], +) -> None: + client, exported_spans = traced_app_with_stub_pool + + resp = client.post( + "/v1/ingest/job", + json={"expected_documents": 2, "label": "trace-smoke"}, + ) + + assert resp.status_code == 201, resp.text + body = resp.json() + trace_id = body["trace_id"] + assert re.fullmatch(r"[0-9a-f]{32}", trace_id) + assert resp.headers["x-trace-id"] == trace_id + + snapshot = client.get(f"/v1/ingest/job/{body['job_id']}") + assert snapshot.status_code == 200, snapshot.text + assert snapshot.json()["trace_id"] == trace_id + assert exported_spans + + +def test_create_job_with_tracing_uses_inbound_traceparent( + traced_app_with_stub_pool: tuple[TestClient, list[Any]], +) -> None: + client, exported_spans = traced_app_with_stub_pool + + with tracing.start_span("client.parent"): + inbound_trace_id = tracing.current_trace_id_hex() + carrier = dict(tracing.inject_trace_context()) + assert inbound_trace_id is not None + + resp = client.post( + "/v1/ingest/job", + headers=carrier, + json={"expected_documents": 1, "label": "trace-parent"}, + ) + + assert resp.status_code == 201, resp.text + 
body = resp.json() + assert body["trace_id"] == inbound_trace_id + assert resp.headers["x-trace-id"] == inbound_trace_id + + ingest_span = next(span for span in exported_spans if span.name == "ingest.job") + assert f"{ingest_span.context.trace_id:032x}" == inbound_trace_id + + snapshot = client.get(f"/v1/ingest/job/{body['job_id']}") + assert snapshot.status_code == 200, snapshot.text + assert snapshot.json()["trace_id"] == inbound_trace_id + + +def test_job_upload_routes_emit_accept_spans( + traced_app_with_stub_pool: tuple[TestClient, list[Any]], + captured_items: list[WorkItem], +) -> None: + client, exported_spans = traced_app_with_stub_pool + job_id = create_test_job(client, expected_documents=3) + + document_resp = client.post( + f"/v1/ingest/job/{job_id}/document", + files={"file": ("doc.pdf", _make_pdf_bytes(), "application/pdf")}, + data={"metadata": "{}"}, + ) + page_resp = client.post( + f"/v1/ingest/job/{job_id}/page", + files={"file": ("page.png", b"page", "image/png")}, + data={"document_id": "source-doc", "page_number": "1", "filename": "source.pdf"}, + ) + whole_resp = client.post( + f"/v1/ingest/job/{job_id}/whole", + files={"file": ("whole.pdf", _make_pdf_bytes(), "application/pdf")}, + data={"metadata": "{}"}, + ) + + assert document_resp.status_code == 202, document_resp.text + assert page_resp.status_code == 202, page_resp.text + assert whole_resp.status_code == 202, whole_resp.text + _wait_for_items(captured_items, 3) + + names = {span.name for span in exported_spans} + assert "ingest.document.accept" in names + assert "ingest.page.accept" in names + assert "ingest.whole.accept" in names + + accept_spans = { + span.name: span + for span in exported_spans + if span.name in {"ingest.document.accept", "ingest.page.accept", "ingest.whole.accept"} + } + for span in accept_spans.values(): + attrs = dict(span.attributes) + assert attrs["service.role"] == "standalone" + assert attrs["job.id"] == job_id + assert 
attrs["route"].startswith("/v1/ingest/job/") + + +def test_job_upload_accept_span_uses_job_trace_when_request_has_no_traceparent( + traced_app_with_stub_pool: tuple[TestClient, list[Any]], + captured_items: list[WorkItem], +) -> None: + client, exported_spans = traced_app_with_stub_pool + create_resp = client.post("/v1/ingest/job", json={"expected_documents": 1}) + assert create_resp.status_code == 201, create_resp.text + job_id = create_resp.json()["job_id"] + job_trace_id = create_resp.json()["trace_id"] + + upload_resp = client.post( + f"/v1/ingest/job/{job_id}/document", + files={"file": ("doc.pdf", _make_pdf_bytes(), "application/pdf")}, + data={"metadata": "{}"}, + ) + + assert upload_resp.status_code == 202, upload_resp.text + _wait_for_items(captured_items, 1) + + accept_span = next(span for span in exported_spans if span.name == "ingest.document.accept") + assert f"{accept_span.context.trace_id:032x}" == job_trace_id + assert captured_items[0].trace_context["traceparent"].split("-")[1] == job_trace_id + + +def test_job_upload_accept_span_prefers_inbound_traceparent_over_job_trace( + traced_app_with_stub_pool: tuple[TestClient, list[Any]], + captured_items: list[WorkItem], +) -> None: + client, exported_spans = traced_app_with_stub_pool + create_resp = client.post("/v1/ingest/job", json={"expected_documents": 1}) + assert create_resp.status_code == 201, create_resp.text + job_id = create_resp.json()["job_id"] + job_trace_id = create_resp.json()["trace_id"] + + with tracing.start_span("client.parent"): + inbound_trace_id = tracing.current_trace_id_hex() + carrier = dict(tracing.inject_trace_context()) + assert inbound_trace_id is not None + assert inbound_trace_id != job_trace_id + + upload_resp = client.post( + f"/v1/ingest/job/{job_id}/document", + headers=carrier, + files={"file": ("doc.pdf", _make_pdf_bytes(), "application/pdf")}, + data={"metadata": "{}"}, + ) + + assert upload_resp.status_code == 202, upload_resp.text + _wait_for_items(captured_items, 
1) + + accept_span = next(span for span in exported_spans if span.name == "ingest.document.accept") + assert f"{accept_span.context.trace_id:032x}" == inbound_trace_id + assert f"{accept_span.context.trace_id:032x}" != job_trace_id + assert captured_items[0].trace_context["traceparent"].split("-")[1] == inbound_trace_id + + +def test_dashboard_job_views_include_trace_id(traced_gateway_app: tuple[TestClient, list[Any]]) -> None: + client, exported_spans = traced_gateway_app + + resp = client.post( + "/v1/ingest/job", + json={"expected_documents": 1, "label": "dashboard-trace"}, + ) + + assert resp.status_code == 201, resp.text + body = resp.json() + job_id = body["job_id"] + trace_id = body["trace_id"] + assert re.fullmatch(r"[0-9a-f]{32}", trace_id) + + snapshot = client.get("/v1/dashboard/api/jobs/snapshot") + assert snapshot.status_code == 200, snapshot.text + snapshot_jobs = {job["job_id"]: job for job in snapshot.json()["jobs"]} + assert snapshot_jobs[job_id]["trace_id"] == trace_id + + listing = client.get("/v1/dashboard/api/jobs/list") + assert listing.status_code == 200, listing.text + listed_jobs = {job["job_id"]: job for job in listing.json()["jobs"]} + assert listed_jobs[job_id]["trace_id"] == trace_id + + detail = client.get(f"/v1/dashboard/api/jobs/{job_id}") + assert detail.status_code == 200, detail.text + assert detail.json()["trace_id"] == trace_id + assert exported_spans + + def test_create_job_retain_results_persisted_on_aggregate(app_with_stub_pool: TestClient) -> None: from nemo_retriever.service.services.job_tracker import get_job_tracker diff --git a/nemo_retriever/tests/test_service_job_tracker.py b/nemo_retriever/tests/test_service_job_tracker.py index a5dd1f15a4..1bd9588216 100644 --- a/nemo_retriever/tests/test_service_job_tracker.py +++ b/nemo_retriever/tests/test_service_job_tracker.py @@ -81,6 +81,36 @@ def test_register_job_creates_pending_aggregate() -> None: assert event["type"] == "job_created" +def 
test_register_job_persists_trace_context_and_events_include_trace_id() -> None: + tracker, bus = _make_tracker_with_bus() + trace_id = "0123456789abcdef0123456789abcdef" + trace_context = { + "traceparent": f"00-{trace_id}-0123456789abcdef-01", + } + + agg = tracker.register_job( + "job-trace", + expected_documents=1, + trace_id=trace_id, + trace_context=trace_context, + ) + + assert agg.trace_id == trace_id + assert agg.trace_context == trace_context + trace_context["traceparent"] = "mutated" + assert agg.trace_context["traceparent"] != "mutated" + + stored = tracker.get_job("job-trace") + assert stored is not None + assert stored.trace_id == trace_id + assert stored.trace_context["traceparent"] != "mutated" + + routing, event = bus.events[0] + assert routing == "job-trace" + assert event["type"] == "job_created" + assert event["trace_id"] == trace_id + + def test_register_job_rejects_nonpositive_expected_count() -> None: tracker = JobTracker() with pytest.raises(ValueError, match="positive"): diff --git a/nemo_retriever/tests/test_service_pipeline_tracing.py b/nemo_retriever/tests/test_service_pipeline_tracing.py new file mode 100644 index 0000000000..1300670ab4 --- /dev/null +++ b/nemo_retriever/tests/test_service_pipeline_tracing.py @@ -0,0 +1,177 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tracing coverage for process-isolated pipeline execution.""" + +from __future__ import annotations + +import asyncio +from concurrent.futures import ThreadPoolExecutor +from types import SimpleNamespace +from typing import Any + +import pandas as pd +import pytest +from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExportResult + +from nemo_retriever.service import tracing +from nemo_retriever.service.services import pipeline_executor + + +class _CollectingExporter: + def __init__(self, exported: list[Any]) -> None: + self._exported = exported + + def export(self, spans: Any) -> SpanExportResult: + self._exported.extend(spans) + return SpanExportResult.SUCCESS + + def shutdown(self) -> None: + return None + + def force_flush(self, timeout_millis: int = 30_000) -> bool: + return True + + +class _FakeIngestor: + def ingest(self) -> pd.DataFrame: + return pd.DataFrame([{"document_id": "doc-1", "text": "chunk"}]) + + +@pytest.fixture +def exported_spans(monkeypatch: pytest.MonkeyPatch) -> list[Any]: + exported: list[Any] = [] + tracing._reset_tracing_for_tests() + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + try: + yield exported + finally: + tracing._reset_tracing_for_tests() + + +def test_run_pipeline_in_process_links_child_span_to_parent_trace( + monkeypatch: pytest.MonkeyPatch, + exported_spans: list[Any], +) -> None: + def _fake_build_graph_ingestor_from_spec(*args: Any, **kwargs: Any) -> tuple[_FakeIngestor, str, bool]: + return _FakeIngestor(), "pdf", False + + monkeypatch.setattr( + pipeline_executor, + "_build_graph_ingestor_from_spec", 
+ _fake_build_graph_ingestor_from_spec, + ) + + tracing.configure_tracing(service_role="parent-test") + with tracing.start_span("parent.request"): + parent_trace_id = tracing.current_trace_id_hex() + carrier = dict(tracing.inject_trace_context()) + + row_count, result_data, _elapsed = pipeline_executor._run_pipeline_in_process( + "contract.pdf", + b"%PDF-1.4\n", + {}, + None, + trace_context=carrier, + pool_label="Realtime", + service_role="standalone", + ) + + assert parent_trace_id is not None + assert row_count == 1 + assert result_data + + pipeline_span = next(span for span in exported_spans if span.name == "pipeline.ingest") + assert f"{pipeline_span.context.trace_id:032x}" == parent_trace_id + assert pipeline_span.attributes["pool"] == "realtime" + assert pipeline_span.attributes["document.filename"] == "contract.pdf" + + +def test_run_pipeline_in_process_continues_when_trace_extraction_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + def _fake_build_graph_ingestor_from_spec(*args: Any, **kwargs: Any) -> tuple[_FakeIngestor, str, bool]: + return _FakeIngestor(), "pdf", False + + def _raise_extract(*args: Any, **kwargs: Any) -> None: + raise RuntimeError("bad carrier") + + monkeypatch.setattr( + pipeline_executor, + "_build_graph_ingestor_from_spec", + _fake_build_graph_ingestor_from_spec, + ) + monkeypatch.setattr(tracing, "extract_trace_context", _raise_extract) + + row_count, result_data, _elapsed = pipeline_executor._run_pipeline_in_process( + "contract.pdf", + b"%PDF-1.4\n", + {}, + None, + trace_context={"traceparent": "invalid"}, + pool_label="Realtime", + service_role="standalone", + ) + + assert row_count == 1 + assert result_data == [{"document_id": "doc-1", "text": "chunk"}] + + +def test_make_work_fn_continues_when_trace_capture_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _Params: + def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]: + return {} + + class _WorkItem: + id = "item-1" + filename = "contract.pdf" 
+ payload = b"%PDF-1.4\n" + pipeline_spec = None + + def _fake_process_pool_executor(*args: Any, **kwargs: Any) -> ThreadPoolExecutor: + return ThreadPoolExecutor(max_workers=1) + + def _raise_inject(*args: Any, **kwargs: Any) -> None: + raise RuntimeError("inject failed") + + def _fake_run_pipeline_in_process(*args: Any, **kwargs: Any) -> tuple[int, list[dict[str, Any]], float]: + return 1, [{"document_id": "doc-1", "text": "chunk"}], 0.01 + + monkeypatch.setattr(pipeline_executor, "build_extract_params", lambda nim: _Params()) + monkeypatch.setattr(pipeline_executor, "build_embed_params", lambda nim: None) + monkeypatch.setattr(pipeline_executor, "build_caption_params", lambda nim: None) + monkeypatch.setattr(pipeline_executor, "build_asr_params", lambda nim: None) + monkeypatch.setattr(pipeline_executor, "ProcessPoolExecutor", _fake_process_pool_executor) + monkeypatch.setattr(pipeline_executor, "_run_pipeline_in_process", _fake_run_pipeline_in_process) + monkeypatch.setattr(tracing, "inject_trace_context", _raise_inject) + + config = SimpleNamespace( + nim_endpoints=SimpleNamespace(model_dump=lambda *args, **kwargs: {}), + vectordb=SimpleNamespace(enabled=False, vectordb_url=None), + pipeline=SimpleNamespace( + realtime_workers=1, + batch_workers=1, + realtime_queue_size=1, + batch_queue_size=1, + ), + mode="standalone", + ) + + work = pipeline_executor._make_work_fn(config, label="Realtime") + try: + row_count, result_data = asyncio.run(work(_WorkItem())) + finally: + pipeline_executor.shutdown_process_executors() + + assert row_count == 1 + assert result_data == [{"document_id": "doc-1", "text": "chunk"}] diff --git a/nemo_retriever/tests/test_service_pool_tracing.py b/nemo_retriever/tests/test_service_pool_tracing.py new file mode 100644 index 0000000000..f42f92219e --- /dev/null +++ b/nemo_retriever/tests/test_service_pool_tracing.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for OpenTelemetry context propagation through pipeline pools.""" + +from __future__ import annotations + +import asyncio +from typing import Any + +import pytest +from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExportResult + +from nemo_retriever.service import tracing +from nemo_retriever.service.services.pipeline_pool import WorkItem, _Pool + + +class _CollectingExporter: + def __init__(self, exported: list[Any]) -> None: + self._exported = exported + + def export(self, spans: Any) -> SpanExportResult: + self._exported.extend(spans) + return SpanExportResult.SUCCESS + + def shutdown(self) -> None: + return None + + def force_flush(self, timeout_millis: int = 30_000) -> bool: + return True + + +def _run(coro: Any) -> Any: + return asyncio.new_event_loop().run_until_complete(coro) + + +@pytest.fixture(autouse=True) +def reset_tracing() -> None: + tracing._reset_tracing_for_tests() + yield + tracing._reset_tracing_for_tests() + + +@pytest.fixture +def exported_spans(monkeypatch: pytest.MonkeyPatch) -> list[Any]: + exported: list[Any] = [] + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + assert tracing.configure_tracing(service_role="standalone") is True + return exported + + +def test_pool_process_span_uses_work_item_trace_context(exported_spans: list[Any]) -> None: + async def body() -> None: + done = asyncio.Event() + + async def work(_item: WorkItem) -> int: + done.set() + return 1 + + with tracing.start_span("test.parent"): + parent_trace_id = tracing.current_trace_id_hex() + carrier = dict(tracing.inject_trace_context()) + + assert parent_trace_id is not None + + pool = 
_Pool(name="rt-trace", num_workers=1, max_queue_size=4, work_fn=work) + pool.start() + try: + assert await pool.submit( + WorkItem( + id="doc-1", + job_id="job-1", + trace_context=carrier, + ) + ) + await asyncio.wait_for(done.wait(), timeout=2.0) + await asyncio.sleep(0.05) + finally: + await pool.shutdown() + + spans_by_name = {span.name: span for span in exported_spans} + assert "pool.rt-trace.process" in spans_by_name + pool_span = spans_by_name["pool.rt-trace.process"] + assert f"{pool_span.context.trace_id:032x}" == parent_trace_id + + attrs = dict(pool_span.attributes) + assert attrs["pool"] == "rt-trace" + assert attrs["document.id"] == "doc-1" + assert attrs["job.id"] == "job-1" + assert attrs["queue.wait_ms"] >= 0.0 + + _run(body()) + + +def test_pool_processes_item_when_trace_context_extraction_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + def _raise(_carrier: dict[str, str]) -> Any: + raise RuntimeError("propagator unavailable") + + monkeypatch.setattr("nemo_retriever.service.tracing.extract_trace_context", _raise) + + async def body() -> None: + done = asyncio.Event() + + async def work(_item: WorkItem) -> int: + done.set() + return 1 + + pool = _Pool(name="rt-trace-fallback", num_workers=1, max_queue_size=4, work_fn=work) + pool.start() + try: + assert await pool.submit( + WorkItem( + id="doc-1", + job_id="job-1", + trace_context={"traceparent": "00-" + "1" * 32 + "-" + "2" * 16 + "-01"}, + ) + ) + await asyncio.wait_for(done.wait(), timeout=2.0) + finally: + await pool.shutdown() + + _run(body()) diff --git a/nemo_retriever/tests/test_service_proxy_tracing.py b/nemo_retriever/tests/test_service_proxy_tracing.py new file mode 100644 index 0000000000..73ddabf46b --- /dev/null +++ b/nemo_retriever/tests/test_service_proxy_tracing.py @@ -0,0 +1,287 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for gateway proxy W3C trace-context propagation.""" + +from __future__ import annotations + +import asyncio +from typing import Any + +import httpx +import pytest +from fastapi import Request +from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExportResult + +from nemo_retriever.service.config import GatewayConfig +from nemo_retriever.service.services.pipeline_pool import PoolType +from nemo_retriever.service.services import proxy as proxy_module +from nemo_retriever.service.services.proxy import GatewayProxy +from nemo_retriever.service.tracing import ( + _reset_tracing_for_tests, + configure_tracing, + current_trace_id_hex, + start_span, +) + + +class _CollectingExporter: + def __init__(self, exported: list[Any]) -> None: + self._exported = exported + + def export(self, spans: Any) -> SpanExportResult: + self._exported.extend(spans) + return SpanExportResult.SUCCESS + + def shutdown(self) -> None: + return None + + def force_flush(self, timeout_millis: int = 30_000) -> bool: + return True + + +class _RecordingClient: + def __init__(self) -> None: + self.calls: list[dict[str, Any]] = [] + + async def request(self, *, method: str, url: str, content: bytes, headers: dict[str, str]) -> httpx.Response: + self.calls.append( + { + "method": method, + "url": url, + "content": content, + "headers": dict(headers), + } + ) + return httpx.Response(202, content=b'{"ok": true}', headers={"content-type": "application/json"}) + + async def get(self, path: str, *, headers: dict[str, str]) -> httpx.Response: + self.calls.append( + { + "method": "GET", + "url": path, + "headers": dict(headers), + } + ) + return httpx.Response(200, content=b'{"config": true}', headers={"content-type": "application/json"}) + + +@pytest.fixture(autouse=True) +def reset_tracing() -> None: + _reset_tracing_for_tests() + yield + _reset_tracing_for_tests() + + +@pytest.fixture +def configured_tracing(monkeypatch: pytest.MonkeyPatch) -> 
list[Any]: + exported: list[Any] = [] + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + + assert configure_tracing(service_role="gateway") is True + return exported + + +def test_forward_injects_traceparent_and_preserves_forwarding_inputs(configured_tracing: list[Any]) -> None: + backend = _RecordingClient() + proxy = _make_proxy(batch=backend) + body = b'{"document": "raw payload"}' + request = _make_request( + method="POST", + path="/v1/ingest/document", + headers={ + "host": "gateway.local", + "transfer-encoding": "chunked", + "content-type": "application/json", + "traceparent": "00-00000000000000000000000000000001-0000000000000001-01", + "x-request-id": "keep-me", + }, + body=body, + ) + + async def _forward_under_span() -> str: + with start_span("proxy.forward.parent"): + trace_id = current_trace_id_hex() + assert trace_id is not None + response = await proxy.forward( + request, + PoolType.BATCH, + extra_headers={"x-extra": "extra-value"}, + ) + assert response.status_code == 202 + return trace_id + + trace_id = asyncio.run(_forward_under_span()) + + assert len(backend.calls) == 1 + call = backend.calls[0] + headers = call["headers"] + assert call["method"] == "POST" + assert call["url"] == "/v1/ingest/document" + assert call["content"] == body + assert headers["content-type"] == "application/json" + assert headers["x-request-id"] == "keep-me" + assert headers["x-extra"] == "extra-value" + assert "host" not in {key.lower() for key in headers} + assert "transfer-encoding" not in {key.lower() for key in headers} + assert headers["traceparent"].split("-")[1] == trace_id + assert headers["traceparent"] 
!= "00-00000000000000000000000000000001-0000000000000001-01" + + +def test_forward_get_injects_traceparent_before_backend_get(configured_tracing: list[Any]) -> None: + backend = _RecordingClient() + proxy = _make_proxy(realtime=backend) + request = _make_request( + method="GET", + path="/v1/ingest/pipeline-config", + headers={ + "host": "gateway.local", + "Traceparent": "00-00000000000000000000000000000002-0000000000000002-01", + "x-request-id": "keep-me", + }, + ) + + async def _forward_under_span() -> str: + with start_span("proxy.forward_get.parent"): + trace_id = current_trace_id_hex() + assert trace_id is not None + response = await proxy.forward_get(request, PoolType.REALTIME, "/v1/ingest/pipeline-config") + assert response.status_code == 200 + return trace_id + + trace_id = asyncio.run(_forward_under_span()) + + assert len(backend.calls) == 1 + call = backend.calls[0] + headers = call["headers"] + assert call["method"] == "GET" + assert call["url"] == "/v1/ingest/pipeline-config" + assert headers["x-request-id"] == "keep-me" + assert "host" not in {key.lower() for key in headers} + assert "Traceparent" not in headers + assert headers["traceparent"].split("-")[1] == trace_id + + +def test_forward_sends_backend_request_when_trace_injection_fails(monkeypatch: pytest.MonkeyPatch) -> None: + backend = _RecordingClient() + proxy = _make_proxy(batch=backend) + body = b'{"document": "raw payload"}' + request = _make_request( + method="POST", + path="/v1/ingest/document", + headers={ + "host": "gateway.local", + "transfer-encoding": "chunked", + "content-type": "application/json", + "x-request-id": "keep-me", + }, + body=body, + ) + + def _raise_injection_error(headers: dict[str, str]) -> None: + raise RuntimeError("propagator unavailable") + + monkeypatch.setattr(proxy_module.tracing_module, "inject_trace_context", _raise_injection_error) + + async def _forward() -> Any: + return await proxy.forward( + request, + PoolType.BATCH, + extra_headers={"x-extra": 
"extra-value"}, + ) + + response = asyncio.run(_forward()) + + assert response.status_code == 202 + assert response.body == b'{"ok": true}' + assert len(backend.calls) == 1 + call = backend.calls[0] + headers = call["headers"] + assert call["method"] == "POST" + assert call["url"] == "/v1/ingest/document" + assert call["content"] == body + assert headers["content-type"] == "application/json" + assert headers["x-request-id"] == "keep-me" + assert headers["x-extra"] == "extra-value" + assert "host" not in {key.lower() for key in headers} + assert "transfer-encoding" not in {key.lower() for key in headers} + + +def test_forward_get_sends_backend_request_when_trace_injection_fails(monkeypatch: pytest.MonkeyPatch) -> None: + backend = _RecordingClient() + proxy = _make_proxy(realtime=backend) + request = _make_request( + method="GET", + path="/v1/ingest/pipeline-config", + headers={ + "host": "gateway.local", + "x-request-id": "keep-me", + }, + ) + + def _raise_injection_error(headers: dict[str, str]) -> None: + raise RuntimeError("propagator unavailable") + + monkeypatch.setattr(proxy_module.tracing_module, "inject_trace_context", _raise_injection_error) + + async def _forward_get() -> Any: + return await proxy.forward_get(request, PoolType.REALTIME, "/v1/ingest/pipeline-config") + + response = asyncio.run(_forward_get()) + + assert response.status_code == 200 + assert response.body == b'{"config": true}' + assert len(backend.calls) == 1 + call = backend.calls[0] + headers = call["headers"] + assert call["method"] == "GET" + assert call["url"] == "/v1/ingest/pipeline-config" + assert headers["x-request-id"] == "keep-me" + assert "host" not in {key.lower() for key in headers} + + +def _make_proxy( + *, + realtime: _RecordingClient | None = None, + batch: _RecordingClient | None = None, +) -> GatewayProxy: + proxy = GatewayProxy.__new__(GatewayProxy) + proxy._config = GatewayConfig( # noqa: SLF001 + realtime_url="http://realtime.test", + batch_url="http://batch.test", + 
) + proxy._realtime = realtime or _RecordingClient() # noqa: SLF001 + proxy._batch = batch or _RecordingClient() # noqa: SLF001 + return proxy + + +def _make_request( + *, + method: str, + path: str, + headers: dict[str, str], + body: bytes = b"", +) -> Request: + async def _receive() -> dict[str, Any]: + return {"type": "http.request", "body": body, "more_body": False} + + scope = { + "type": "http", + "method": method, + "path": path, + "raw_path": path.encode("ascii"), + "query_string": b"", + "headers": [(key.encode("latin-1"), value.encode("latin-1")) for key, value in headers.items()], + "scheme": "http", + "server": ("gateway.local", 80), + "client": ("testclient", 123), + } + return Request(scope, _receive) diff --git a/nemo_retriever/tests/test_service_tracing.py b/nemo_retriever/tests/test_service_tracing.py new file mode 100644 index 0000000000..0f6c68b46b --- /dev/null +++ b/nemo_retriever/tests/test_service_tracing.py @@ -0,0 +1,349 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for service-mode OpenTelemetry tracing helpers.""" + +from __future__ import annotations + +import re +from typing import Any + +import pytest +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExportResult + +from nemo_retriever.service.app import create_app +from nemo_retriever.service.config import ServiceConfig +from nemo_retriever.service.tracing import ( + _reset_tracing_for_tests, + configure_tracing, + current_trace_id_hex, + extract_trace_context, + inject_trace_context, + start_span, + tracing_enabled_from_env, +) + + +class _CollectingExporter: + def __init__(self, exported: list[Any]) -> None: + self._exported = exported + + def export(self, spans: Any) -> SpanExportResult: + self._exported.extend(spans) + return SpanExportResult.SUCCESS + + def shutdown(self) -> None: + return None + + def force_flush(self, timeout_millis: int = 30_000) -> bool: + return True + + +@pytest.fixture(autouse=True) +def reset_tracing() -> None: + _reset_tracing_for_tests() + yield + _reset_tracing_for_tests() + + +@pytest.fixture +def exported_spans(monkeypatch: pytest.MonkeyPatch) -> list[Any]: + exported: list[Any] = [] + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: _CollectingExporter(exported), + ) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", SimpleSpanProcessor) + return exported + + +@pytest.mark.parametrize( + ("env", "expected"), + [ + ({}, False), + ({"OTEL_TRACES_EXPORTER": "otlp"}, False), + ({"OTEL_TRACES_EXPORTER": "OTLP"}, False), + ({"OTEL_TRACES_EXPORTER": "", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, False), + ({"OTEL_TRACES_EXPORTER": "none", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, False), + ({"OTEL_TRACES_EXPORTER": "otlp", "OTEL_EXPORTER_OTLP_ENDPOINT": ""}, False), + ( + { + 
"OTEL_TRACES_EXPORTER": "otlp", + "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317", + "OTEL_SDK_DISABLED": "true", + }, + False, + ), + ({"OTEL_TRACES_EXPORTER": "otlp", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, True), + ({"OTEL_TRACES_EXPORTER": "OTLP", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, True), + ({"OTEL_TRACES_EXPORTER": "jaeger", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, False), + ({"OTEL_TRACES_EXPORTER": "zipkin", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, False), + ({"OTEL_TRACES_EXPORTER": "console", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, False), + ({"OTEL_TRACES_EXPORTER": "custom", "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel:4317"}, False), + ], +) +def test_tracing_enabled_from_env_requires_exporter_and_endpoint(env: dict[str, str], expected: bool) -> None: + assert tracing_enabled_from_env(env) is expected + + +def test_tracing_helper_keeps_span_attribute_sanitizer_private() -> None: + import nemo_retriever.service.tracing as tracing_module + + assert not hasattr(tracing_module, "span_attributes") + + +def test_start_span_is_noop_when_tracer_lookup_fails(monkeypatch: pytest.MonkeyPatch) -> None: + def _raise(*args: Any, **kwargs: Any) -> Any: + raise RuntimeError("tracer unavailable") + + monkeypatch.setattr("nemo_retriever.service.tracing.get_tracer", _raise) + + with start_span("service.unavailable"): + assert current_trace_id_hex() is None + + +def test_start_span_is_noop_when_context_enter_fails(monkeypatch: pytest.MonkeyPatch) -> None: + class _FailingSpanContext: + def __enter__(self) -> Any: + raise RuntimeError("span enter failed") + + def __exit__(self, exc_type: Any, exc: Any, traceback: Any) -> bool: + raise AssertionError("span context should not be entered") + + class _Tracer: + def start_as_current_span(self, name: str, **kwargs: Any) -> Any: + return _FailingSpanContext() + + monkeypatch.setattr("nemo_retriever.service.tracing.get_tracer", lambda: _Tracer()) + + with 
start_span("service.enter-failure"): + assert current_trace_id_hex() is None + + +def test_start_span_does_not_swallow_user_exceptions(monkeypatch: pytest.MonkeyPatch) -> None: + class _SpanContext: + def __enter__(self) -> object: + return object() + + def __exit__(self, exc_type: Any, exc: Any, traceback: Any) -> bool: + return False + + class _Tracer: + def start_as_current_span(self, name: str, **kwargs: Any) -> Any: + return _SpanContext() + + monkeypatch.setattr("nemo_retriever.service.tracing.get_tracer", lambda: _Tracer()) + + with pytest.raises(RuntimeError, match="application failure"): + with start_span("service.user-failure"): + raise RuntimeError("application failure") + + +def test_configure_tracing_creates_spans_with_hex_trace_id( + monkeypatch: pytest.MonkeyPatch, exported_spans: list[Any] +) -> None: + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + + assert configure_tracing(service_role="standalone") is True + + with start_span("service.unit"): + trace_id = current_trace_id_hex() + + assert trace_id is not None + assert re.fullmatch(r"[0-9a-f]{32}", trace_id) + assert exported_spans + + +def test_configure_tracing_treats_existing_global_provider_as_configured( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + external_provider = TracerProvider() + trace.set_tracer_provider(external_provider) + monkeypatch.setattr( + "nemo_retriever.service.tracing.OTLPSpanExporter", + lambda *args, **kwargs: pytest.fail("configure_tracing should not construct an exporter"), + ) + + try: + assert configure_tracing(service_role="standalone") is True + assert configure_tracing(service_role="standalone") is True + assert trace.get_tracer_provider() is external_provider + finally: + external_provider.shutdown() + + +def 
test_configure_tracing_cleans_up_partial_setup_failure(monkeypatch: pytest.MonkeyPatch) -> None: + cleanup_calls: list[str] = [] + + class _FailingProvider: + def __init__(self, *args: Any, **kwargs: Any) -> None: + return None + + def add_span_processor(self, processor: Any) -> None: + raise RuntimeError("processor install failed") + + def shutdown(self) -> None: + cleanup_calls.append("provider") + + class _FakeExporter: + def shutdown(self) -> None: + cleanup_calls.append("exporter") + + class _FakeProcessor: + def __init__(self, exporter: Any) -> None: + self.exporter = exporter + + def shutdown(self) -> None: + cleanup_calls.append("processor") + + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + monkeypatch.setattr("nemo_retriever.service.tracing.TracerProvider", _FailingProvider) + monkeypatch.setattr("nemo_retriever.service.tracing.OTLPSpanExporter", _FakeExporter) + monkeypatch.setattr("nemo_retriever.service.tracing.BatchSpanProcessor", _FakeProcessor) + + assert configure_tracing(service_role="standalone") is False + assert cleanup_calls == ["processor", "exporter", "provider"] + + +def test_trace_context_inject_extract_round_trips_traceparent( + monkeypatch: pytest.MonkeyPatch, exported_spans: list[Any] +) -> None: + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + assert configure_tracing(service_role="standalone") is True + + with start_span("service.parent"): + parent_trace_id = current_trace_id_hex() + carrier = inject_trace_context() + + assert "traceparent" in carrier + assert set(carrier).issubset({"traceparent", "tracestate"}) + + extracted = extract_trace_context(carrier) + with start_span("service.child", context=extracted): + assert current_trace_id_hex() == parent_trace_id + + +def test_trace_context_inject_mutates_existing_carrier_preserving_unrelated_headers( + monkeypatch: 
pytest.MonkeyPatch, exported_spans: list[Any] +) -> None: + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + assert configure_tracing(service_role="standalone") is True + + existing_carrier = { + "traceparent": "00-00000000000000000000000000000001-0000000000000001-01", + "x-required": "keep", + } + with start_span("service.parent"): + carrier = inject_trace_context(existing_carrier) + + assert carrier is existing_carrier + assert carrier["x-required"] == "keep" + assert "traceparent" in carrier + assert carrier["traceparent"] != "00-00000000000000000000000000000001-0000000000000001-01" + + +def test_trace_context_inject_removes_mixed_case_w3c_keys( + monkeypatch: pytest.MonkeyPatch, exported_spans: list[Any] +) -> None: + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + assert configure_tracing(service_role="standalone") is True + + existing_carrier = { + "Traceparent": "00-00000000000000000000000000000001-0000000000000001-01", + "TRACEPARENT": "00-00000000000000000000000000000002-0000000000000002-01", + "Tracestate": "vendor=old", + "x-required": "keep", + } + with start_span("service.parent"): + carrier = inject_trace_context(existing_carrier) + + assert carrier is existing_carrier + assert carrier["x-required"] == "keep" + assert "traceparent" in carrier + assert "Traceparent" not in carrier + assert "TRACEPARENT" not in carrier + assert "Tracestate" not in carrier + assert {key for key in carrier if key.lower() in {"traceparent", "tracestate"}} <= {"traceparent", "tracestate"} + + +def test_start_span_drops_sensitive_keys_and_keeps_benign_metadata( + monkeypatch: pytest.MonkeyPatch, exported_spans: list[Any] +) -> None: + monkeypatch.setenv("OTEL_TRACES_EXPORTER", "otlp") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel:4317") + assert configure_tracing(service_role="standalone") is True + + 
raw_attributes = { + "Authorization": "Bearer abc", + "auth": "abc", + "auth.header": "Bearer abc", + "credential": "abc", + "credentials": {"token": "abc"}, + "user_credentials": "abc", + "x-api-key": "abc", + "API key": "abc", + "apiKey": "abc", + "access_token": "abc", + "password": "abc", + "client_secret": "abc", + "request_body": "{}", + "body": "{}", + "payload": b"raw", + "response_payload": b"raw", + "payload_text": "sensitive", + "file_bytes": b"raw", + "content": "sensitive", + "document_content": "sensitive", + "body_text": "sensitive", + "raw_content": "sensitive", + "content_type": "application/json", + "content_encoding": "gzip", + "content_language": "en", + "content_length": 123, + "payload_size": 456, + "body_length": 789, + "safe.status": "ok", + "document_count": 2, + } + + with start_span("service.sanitize", attributes=raw_attributes): + pass + + attrs = dict(exported_spans[-1].attributes) + assert attrs == { + "content_type": "application/json", + "content_encoding": "gzip", + "content_language": "en", + "content_length": 123, + "payload_size": 456, + "body_length": 789, + "safe.status": "ok", + "document_count": 2, + } + + +def test_create_app_configures_tracing_for_service_role(monkeypatch: pytest.MonkeyPatch) -> None: + calls: list[tuple[str, str | None]] = [] + + def _configure_tracing(*, service_role: str, service_name: str | None = None) -> bool: + calls.append((service_role, service_name)) + return True + + monkeypatch.setattr("nemo_retriever.service.tracing.configure_tracing", _configure_tracing) + monkeypatch.setattr("nemo_retriever.service.app._configure_logging", lambda config: None) + monkeypatch.setattr("nemo_retriever.service.app._apply_resource_limits", lambda config: None) + + app = create_app(ServiceConfig(mode="gateway")) + + assert app.state.config.mode == "gateway" + assert calls == [("gateway", None)] diff --git a/nemo_retriever/uv.lock b/nemo_retriever/uv.lock index 0090d174e8..61ec125db3 100644 --- 
a/nemo_retriever/uv.lock +++ b/nemo_retriever/uv.lock @@ -2446,6 +2446,9 @@ dependencies = [ { name = "nltk" }, { name = "numpy" }, { name = "nvidia-riva-client" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-grpc" }, + { name = "opentelemetry-sdk" }, { name = "pandas" }, { name = "pillow" }, { name = "prometheus-fastapi-instrumentator" }, @@ -2620,6 +2623,9 @@ requires-dist = [ { name = "open-clip-torch", marker = "extra == 'benchmarks'", specifier = "==3.2.0" }, { name = "open-clip-torch", marker = "extra == 'nemotron-parse'", specifier = "==3.2.0" }, { name = "opencv-python-headless", marker = "extra == 'local'", specifier = ">=4.8.0" }, + { name = "opentelemetry-api", specifier = ">=1.41.1" }, + { name = "opentelemetry-exporter-otlp-proto-grpc", specifier = ">=1.41.1" }, + { name = "opentelemetry-sdk", specifier = ">=1.41.1" }, { name = "pandas", specifier = ">=2.0,<3" }, { name = "pillow", specifier = "==12.2.0" }, { name = "prometheus-fastapi-instrumentator", specifier = ">=7.0,<8" }, @@ -3282,7 +3288,7 @@ name = "opentelemetry-exporter-otlp-proto-common" version = "1.41.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "opentelemetry-proto", marker = "sys_platform == 'linux'" }, + { name = "opentelemetry-proto" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ae/fa/f9e3bd3c4d692b3ce9a2880a167d1f79681a1bea11f00d5bf76adc03e6ea/opentelemetry_exporter_otlp_proto_common-1.41.1.tar.gz", hash = "sha256:0e253156ea9c36b0bd3d2440c5c9ba7dd1f3fb64ba7a08fc85fbac536b56e1fb", size = 20409, upload-time = "2026-04-24T13:15:40.924Z" } wheels = [ @@ -3294,13 +3300,13 @@ name = "opentelemetry-exporter-otlp-proto-grpc" version = "1.41.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "googleapis-common-protos", marker = "sys_platform == 'linux'" }, - { name = "grpcio", marker = "sys_platform == 'linux'" }, - { name = "opentelemetry-api", marker = "sys_platform == 
'linux'" }, - { name = "opentelemetry-exporter-otlp-proto-common", marker = "sys_platform == 'linux'" }, - { name = "opentelemetry-proto", marker = "sys_platform == 'linux'" }, - { name = "opentelemetry-sdk", marker = "sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "sys_platform == 'linux'" }, + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/1e/9b/e4503060b8695579dbaad187dc8cef4554188de68748c88060599b77489e/opentelemetry_exporter_otlp_proto_grpc-1.41.1.tar.gz", hash = "sha256:b05df8fa1333dc9a3fda36b676b96b5095ab6016d3f0c3296d430d629ba1443b", size = 25755, upload-time = "2026-04-24T13:15:41.93Z" } wheels = [