diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
index ababc472c9..786fd4956c 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
@@ -92,12 +92,13 @@ def _preflight_judge_endpoint(api_base: str, timeout: float = 5.0) -> None:
)
-def _build_judge(cfg: dict) -> Optional[Any]:
- """Construct an ``LLMJudge`` from ``cfg['judge']`` or return ``None``.
+def _build_judge(cfg: dict, *, manifest_path: Path) -> Optional[Any]:
+ """Construct a ``JudgeContext`` from ``cfg['judge']`` or return ``None``.
- Skips silently (with a console note) when the API key env var is unset, so
- runs work end-to-end without network access. Import is deferred so the
- ``litellm`` extra isn't required when judging is disabled.
+ Skips silently (with a console note) when the API key env var is unset,
+ so runs work end-to-end without network access. Prompt paths default
+ to the manifest's parent directory if not overridden in the config;
+ the file must exist for any mode the run actually needs.
"""
judge_cfg = cfg.get("judge") or {}
if not judge_cfg.get("enabled", True):
@@ -109,25 +110,65 @@ def _build_judge(cfg: dict) -> Optional[Any]:
typer.echo(f"Judge disabled: ${api_key_env} is not set in the environment.")
return None
try:
- from nemo_retriever.llm.clients.judge import LLMJudge
+ from nemo_retriever.llm.clients.litellm import LiteLLMClient
except ImportError as exc:
- typer.echo(f"Judge disabled: failed to import LLMJudge ({exc}). Install nemo-retriever[llm].")
+ typer.echo(f"Judge disabled: failed to import LiteLLMClient ({exc}). Install nemo-retriever[llm].")
return None
+ from nemo_retriever.skill_eval.runner import JudgeContext
+
api_base = judge_cfg.get("api_base")
if api_base:
_preflight_judge_endpoint(str(api_base))
- judge_kwargs: dict[str, Any] = {
- "model": str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")),
- "api_base": api_base,
- "api_key": api_key,
- }
- if judge_cfg.get("temperature") is not None:
- judge_kwargs["temperature"] = float(judge_cfg["temperature"])
- if judge_cfg.get("max_tokens") is not None:
- judge_kwargs["max_tokens"] = int(judge_cfg["max_tokens"])
- judge = LLMJudge.from_kwargs(**judge_kwargs)
- typer.echo(f"Judge enabled: model={judge.model}")
- return judge
+
+ client = LiteLLMClient.from_kwargs(
+ model=str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")),
+ api_base=api_base,
+ api_key=api_key,
+ temperature=float(judge_cfg.get("temperature", 0.1)),
+ max_tokens=int(judge_cfg.get("max_tokens", 4096)),
+ )
+
+ legacy_judge = None
+ if judge_cfg.get("legacy_enabled", True):
+ try:
+ from nemo_retriever.llm.clients.judge import LLMJudge
+ except ImportError as exc:
+ typer.echo(f"Legacy judge disabled: failed to import LLMJudge ({exc}).")
+ else:
+ legacy_judge = LLMJudge.from_kwargs(
+ model=str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")),
+ api_base=api_base,
+ api_key=api_key,
+ temperature=float(judge_cfg.get("temperature", 0.1)),
+ max_tokens=int(judge_cfg.get("max_tokens", 4096)),
+ )
+
+ manifest_dir = Path(manifest_path).expanduser().resolve().parent
+ simple_path = judge_cfg.get("simple_prompt_path")
+ scenario_path = judge_cfg.get("scenario_prompt_path")
+ simple_resolved = (
+ Path(str(simple_path)).expanduser().resolve() if simple_path else manifest_dir / "llm_scorer_prompt.md"
+ )
+ scenario_resolved = (
+ Path(str(scenario_path)).expanduser().resolve()
+ if scenario_path
+ else manifest_dir / "llm_scenario_scorer_prompt.md"
+ )
+ ctx = JudgeContext(
+ client=client,
+ simple_prompt_path=str(simple_resolved) if simple_resolved.is_file() else None,
+ scenario_prompt_path=str(scenario_resolved) if scenario_resolved.is_file() else None,
+ legacy_judge=legacy_judge,
+ )
+ typer.echo(
+ "Judge enabled: model={m} simple_prompt={s} scenario_prompt={c} legacy={legacy}".format(
+ m=client.transport.model,
+ s=ctx.simple_prompt_path or "(missing)",
+ c=ctx.scenario_prompt_path or "(missing)",
+ legacy="on" if ctx.legacy_judge is not None else "off",
+ )
+ )
+ return ctx
def _build_trace_summarizer(cfg: dict) -> Optional[Any]:
@@ -345,6 +386,15 @@ def run_command(
"Defaults to config.query_parallelism, then 1 (linear session)."
),
),
+ limit_queries: Optional[int] = typer.Option(
+ None,
+ "--limit-queries",
+ min=1,
+ help=(
+ "Cap queries per domain to the first N entries (deterministic). "
+ "Useful for smoke-tests; omit for the full sweep."
+ ),
+ ),
) -> None:
"""Run the benchmark across the dataset's domains x selected conditions."""
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
@@ -378,6 +428,10 @@ def run_command(
raise typer.Exit(code=2)
by_domain = {d: by_domain[d] for d in wanted}
+ if limit_queries is not None:
+ by_domain = {d: es[:limit_queries] for d, es in by_domain.items()}
+ typer.echo(f"--limit-queries={limit_queries}: capping each domain to its first {limit_queries} entries.")
+
domain_order = sorted(by_domain.keys())
typer.echo(f"Domains in this run: {domain_order} ({sum(len(v) for v in by_domain.values())} entries total)")
@@ -403,7 +457,7 @@ def run_command(
raise typer.Exit(code=2)
testdata_prefixes = tuple(str(p) for p in testdata_prefixes_raw)
- judge = _build_judge(cfg)
+ judge = _build_judge(cfg, manifest_path=Path(str(manifest_path)).expanduser().resolve())
summarizer = _build_trace_summarizer(cfg)
base_dir = str(artifacts_root) if artifacts_root else None
@@ -474,7 +528,16 @@ def run_command(
for r in results:
save_trial(r, session_dir)
kind = "setup" if r.is_setup else f"entry_id={r.entry_id} query_id={r.query_id}"
- judge_str = "" if r.is_setup or r.judge_score is None else f" judge={r.judge_score}"
+ judge_parts: list[str] = []
+ if not r.is_setup:
+ if r.judge_score is not None:
+ judge_parts.append(f"judge={r.judge_score}")
+ elif any(v is not None for v in r.judge_subscores.values()):
+ n = sum(1 for v in r.judge_subscores.values() if v is not None)
+ judge_parts.append(f"judge_mode={r.judge_mode}/sub_n={n}")
+ if r.legacy_judge_score is not None:
+ judge_parts.append(f"legacy={r.legacy_judge_score}")
+ judge_str = (" " + " ".join(judge_parts)) if judge_parts else ""
cost_str = f"${r.total_cost_usd:.3f}" if r.cost_available else "n/a"
trace_str = f" trace={r.compact_trace_path}" if r.compact_trace_path else ""
typer.echo(
@@ -500,21 +563,34 @@ def run_command(
if judge is not None:
typer.echo("\nLLM-as-judge scores (mean over query turns, 0-5 scale):")
for cond in selected:
- scored: list[int] = []
+ simple_scored: list[int] = []
+ scenario_scored = 0
+ legacy_scored: list[int] = []
errored = 0
for domain in domain_order:
for r in results_by_key.get((agent, cond, domain), []):
if r.is_setup:
continue
+ has_subscores = any(v is not None for v in r.judge_subscores.values())
if r.judge_score is not None:
- scored.append(int(r.judge_score))
+ simple_scored.append(int(r.judge_score))
+ elif has_subscores:
+ scenario_scored += 1
elif r.judge_error:
errored += 1
- if scored:
- mean_score = sum(scored) / len(scored)
- typer.echo(f" {agent}/{cond}: mean={mean_score:.2f} n={len(scored)} errors={errored}")
- else:
- typer.echo(f" {agent}/{cond}: no scores errors={errored} (check judge config / litellm install)")
+ if r.legacy_judge_score is not None:
+ legacy_scored.append(int(r.legacy_judge_score))
+ parts: list[str] = []
+ if simple_scored:
+ parts.append(f"simple mean={sum(simple_scored) / len(simple_scored):.2f} n={len(simple_scored)}")
+ if scenario_scored:
+ parts.append(f"scenario_scored={scenario_scored}")
+ if legacy_scored:
+ parts.append(f"legacy mean={sum(legacy_scored) / len(legacy_scored):.2f} n={len(legacy_scored)}")
+ parts.append(f"errors={errored}")
+ if not simple_scored and not scenario_scored and not legacy_scored:
+ parts.append("(check judge config / litellm install)")
+ typer.echo(f" {agent}/{cond}: " + " ".join(parts))
json_path, md_path = write_summary(
session_dir=session_dir,
@@ -537,8 +613,9 @@ def _needs_rescore(trial: dict[str, Any]) -> bool:
judge_error = trial.get("judge_error") or ""
if judge_error in UNSCORABLE_JUDGE_ERRORS:
return False
- score = trial.get("judge_score")
- if score is None:
+ sub_scores = trial.get("judge_subscores") or {}
+ scored = trial.get("judge_score") is not None or any(v is not None for v in sub_scores.values())
+ if not scored:
return True
if judge_error:
return True
@@ -621,7 +698,7 @@ def rescore_command(
entries = load_eval_manifest(Path(str(manifest_path)).expanduser().resolve())
entries_by_id = {e.entry_id: e for e in entries}
- judge = _build_judge(cfg)
+ judge = _build_judge(cfg, manifest_path=Path(str(manifest_path)).expanduser().resolve())
if judge is None:
typer.echo("Error: judge is not configured (see messages above). Cannot rescore.", err=True)
raise typer.Exit(code=2)
@@ -659,15 +736,24 @@ def rescore_command(
result.judge_score = None
result.judge_reasoning = ""
result.judge_error = ""
+ result.judge_mode = ""
+ result.judge_subscores = {}
+ result.judge_flags = {}
+ result.judge_lists = {}
_apply_judge(judge, entry, result)
raw.update(asdict(result))
path.write_text(json.dumps(raw, indent=2) + "\n", encoding="utf-8")
- if result.judge_score is not None:
+ scored_ok = result.judge_score is not None or any(v is not None for v in result.judge_subscores.values())
+ if scored_ok:
rescored += 1
- typer.echo(f" {path.name}: entry_id={result.entry_id} judge={result.judge_score}")
+ if result.judge_mode == "simple" and result.judge_score is not None:
+ typer.echo(f" {path.name}: entry_id={result.entry_id} judge={result.judge_score}")
+ else:
+ n_sub = sum(1 for v in result.judge_subscores.values() if v is not None)
+ typer.echo(f" {path.name}: entry_id={result.entry_id} mode={result.judge_mode} sub_scores={n_sub}")
elif result.judge_error in UNSCORABLE_JUDGE_ERRORS:
unscorable += 1
typer.echo(f" {path.name}: entry_id={result.entry_id} unscorable ({result.judge_error})")
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
index 6b2a164e29..429e3c87f1 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
@@ -107,6 +107,20 @@ judge:
api_key_env: NVIDIA_API_KEY
temperature: 0.1
max_tokens: 4096
+ # On-disk path to the simple QA judge prompt (batch_1's llm_scorer_prompt.md).
+ # When null, the runner looks for ``llm_scorer_prompt.md`` next to the manifest.
+ # Required only if the manifest contains entries WITHOUT a `scoring_mode` field.
+ simple_prompt_path: null
+ # On-disk path to the scenario-aware judge prompt (batch_2's llm_scenario_scorer_prompt.md).
+ # When null, the runner looks for ``llm_scenario_scorer_prompt.md`` next to the manifest.
+ # Required only if the manifest contains entries WITH a `scoring_mode` field.
+ scenario_prompt_path: null
+ # When true (default), ALSO run the legacy hardcoded LLMJudge on every
+ # scoreable trial (one with a non-empty ground_truth_answer + non-empty
+ # final_answer). Produces an additional 1-5 score directly comparable to
+ # runs that pre-date the scenario-aware judge. Set to false to skip the
+ # second judge call and halve judge LLM cost.
+ legacy_enabled: true
# ---------------------------------------------------------------------------
# Tool-use summarizer
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
index e3c2e89963..3d1dd2526d 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
@@ -40,6 +40,13 @@ class DatasetEntry(BaseModel):
ground_truth_answer: str = ""
domain: str = ""
domain_label: str = ""
+ scoring_mode: str = ""
+ category: str = ""
+ phase: str = ""
+ expected_action: str = ""
+ expected_output_shape: str = ""
+ validation_signal: str = ""
+ raw_answers: list[str] = []
def _select_prompt(candidates: list[dict[str, Any]], selected_variant: int | None) -> str:
@@ -121,6 +128,9 @@ def load_eval_manifest(path: Path) -> list[DatasetEntry]:
continue
pages.append(GroundTruthPage(doc_id=str(doc_id), page_number=int(page), score=int(p.get("score") or 1)))
+ raw_answers = item.get("raw_answers") or []
+ if not isinstance(raw_answers, list):
+ raw_answers = []
entries.append(
DatasetEntry(
entry_id=idx,
@@ -132,6 +142,13 @@ def load_eval_manifest(path: Path) -> list[DatasetEntry]:
ground_truth_answer=str(item.get("answer") or ""),
domain=domain,
domain_label=domain_label,
+ scoring_mode=str(item.get("scoring_mode") or ""),
+ category=str(item.get("category") or ""),
+ phase=str(item.get("phase") or ""),
+ expected_action=str(item.get("expected_action") or ""),
+ expected_output_shape=str(item.get("expected_output_shape") or ""),
+ validation_signal=str(item.get("validation_signal") or ""),
+ raw_answers=[str(x) for x in raw_answers],
)
)
return entries
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/judging.py b/nemo_retriever/src/nemo_retriever/skill_eval/judging.py
new file mode 100644
index 0000000000..179c65c7de
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/judging.py
@@ -0,0 +1,271 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Scenario-aware LLM-as-judge for skill_eval.
+
+Routes each trial through one of two prompts based on whether the
+dataset entry carries a ``scoring_mode``:
+
+- ``simple`` -> batch_1 ``llm_scorer_prompt.md`` style (answer-grading).
+- ``scenario`` -> batch_2 ``llm_scenario_scorer_prompt.md`` style (handles
+ ingest_only / extract_only / refusal / capability_gap / dispatcher_prompt /
+ skip in addition to answerable_retrieval / ingest_plus_answer).
+
+Both prompts live on disk in the SDG run directory that produced the
+manifest, so this module never vendors prompt text. See
+``cli._build_judge`` for how paths are resolved.
+"""
+
+from __future__ import annotations
+
+import functools
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal, Optional, Protocol
+
+if TYPE_CHECKING:
+ from nemo_retriever.skill_eval.dataset import DatasetEntry
+ from nemo_retriever.skill_eval.runner import TrialResult
+
+
+JudgeMode = Literal["simple", "scenario"]
+
+
+SIMPLE_SUB_SCORES: tuple[str, ...] = ("answer_correctness", "citation_quality", "faithfulness")
+SCENARIO_SUB_SCORES: tuple[str, ...] = (
+ "action_correctness",
+ "routing_correctness",
+ "answer_correctness",
+ "citation_quality",
+ "faithfulness",
+ "refusal_correctness",
+)
+SIMPLE_FLAGS: tuple[str, ...] = ("is_answer_correct", "is_citation_correct")
+SCENARIO_FLAGS: tuple[str, ...] = ("is_action_correct", "is_answer_correct", "is_citation_correct")
+LIST_KEYS: tuple[str, ...] = ("major_errors", "missing_facts", "unsupported_claims")
+
+
+@dataclass
+class ScenarioJudgeResult:
+ """Structured judge output, shared between simple and scenario prompts."""
+
+ mode: JudgeMode
+ sub_scores: dict[str, Optional[int]] = field(default_factory=dict)
+ flags: dict[str, Optional[bool]] = field(default_factory=dict)
+ lists: dict[str, list[str]] = field(default_factory=dict)
+ rationale: str = ""
+ error: Optional[str] = None
+
+
+@functools.lru_cache(maxsize=8)
+def _read_prompt(path: str) -> str:
+ return Path(path).read_text(encoding="utf-8")
+
+
+def _select_mode(entry: "DatasetEntry") -> JudgeMode:
+ """Pick the prompt mode for ``entry`` based on its ``scoring_mode`` field."""
+ return "scenario" if entry.scoring_mode else "simple"
+
+
+def _render_pages(entry: "DatasetEntry") -> str:
+ parts = [f"{p.doc_id}:{p.page_number}" for p in entry.ground_truth_pages]
+ return ", ".join(parts)
+
+
+def _render_cited(result: "TrialResult") -> str:
+ parts = [f"{item.get('doc_id')}:{item.get('page_number')}" for item in result.ranked_retrieved]
+ return ", ".join(parts)
+
+
+def _render_raw_answers(entry: "DatasetEntry") -> str:
+ return "; ".join(entry.raw_answers)
+
+
+# Cap ``agent_answer`` at ~20K chars before stuffing into the judge prompt.
+# A handful of trials (notably ``extract_only`` scenarios where the agent
+# dumps the whole corpus into its answer) otherwise blow past the judge
+# model's context window. 20K chars ≈ 5-6K tokens leaves plenty of headroom
+# for the system instructions + prompt template + reference content.
+JUDGE_ANSWER_CHAR_CAP = 20000
+_TRUNCATION_MARKER = f"\n\n... [truncated for judge: original answer exceeded {JUDGE_ANSWER_CHAR_CAP} chars]"
+
+
+def truncate_for_judge(text: str, cap: int = JUDGE_ANSWER_CHAR_CAP) -> str:
+ """Cap ``text`` at ``cap`` chars with a visible marker.
+
+ Both the new judge (via ``_format_prompt``) and the legacy ``LLMJudge``
+ pass through this helper so the cap is applied identically across both
+ scoring paths. The marker makes the truncation visible to the judge so
+ its rationale can flag the gap rather than silently mis-scoring.
+ """
+ if len(text) <= cap:
+ return text
+ head_chars = max(0, cap - len(_TRUNCATION_MARKER))
+ return text[:head_chars] + _TRUNCATION_MARKER
+
+
+def _format_prompt(template: str, entry: "DatasetEntry", result: "TrialResult") -> str:
+ """Substitute the prompt slots for ``entry`` / ``result`` into ``template``.
+
+ Uses literal-token replacement for each known slot, NOT ``str.format_map``.
+ The SDG-shipped judge prompts contain literal JSON-schema blocks like
+ ``{ "action_correctness": <0-5>, ... }`` which a Python format-string
+ parser would misinterpret as substitution slots with invalid format
+ specifiers. With ``str.replace`` those literals pass through verbatim and
+ only our enumerated slot tokens are substituted.
+
+ Slots that the template doesn't reference are simply ignored. Slots the
+ template does reference but that we don't know about are left as literal
+ ``{slot_name}`` in the output — the judge LLM is robust to that, and
+ leaving them visible aids debugging compared to silently rendering empty.
+ """
+ substitutions: list[tuple[str, str]] = [
+ ("{query}", entry.original_query),
+ ("{reference_answer}", entry.ground_truth_answer),
+ ("{raw_answers}", _render_raw_answers(entry)),
+ ("{relevant_pages}", _render_pages(entry)),
+ ("{agent_answer}", truncate_for_judge(result.final_answer)),
+ ("{agent_cited_pages}", _render_cited(result)),
+ ("{cited_evidence}", ""),
+ ("{scenario_prompt}", entry.paraphrased_prompt),
+ ("{category}", entry.category),
+ ("{phase}", entry.phase),
+ ("{scoring_mode}", entry.scoring_mode),
+ ("{expected_action}", entry.expected_action),
+ ("{expected_output_shape}", entry.expected_output_shape),
+ ("{validation_signal}", entry.validation_signal),
+ ]
+ out = template
+ for token, value in substitutions:
+ out = out.replace(token, value)
+ return out
+
+
+class _ChatClient(Protocol):
+ """Minimal slice of LiteLLMClient that this module needs."""
+
+ def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tuple[str, float]: ...
+
+
+_SYSTEM_INSTRUCTION = "Respond with ONLY valid JSON matching the schema in the user message. No prose, no markdown."
+
+
+def _strip_envelope(raw: str) -> str:
+ text = raw.strip()
+ text = re.sub(r".*?", "", text, flags=re.DOTALL).strip()
+ text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
+ text = re.sub(r"\s*```$", "", text, flags=re.MULTILINE)
+ return text.strip()
+
+
+def _validate_subscore(value: Any) -> Optional[int]:
+ if value is None:
+ return None
+ if not isinstance(value, int) or isinstance(value, bool):
+ raise TypeError(f"sub-score must be an integer, got {type(value).__name__}: {value!r}")
+ score = value
+ if not 0 <= score <= 5:
+ raise ValueError(f"sub-score {score} out of range 0-5")
+ return score
+
+
+def _validate_flag(value: Any) -> Optional[bool]:
+ if value is None:
+ return None
+ return bool(value)
+
+
+def _validate_list(value: Any) -> list[str]:
+ if value is None:
+ return []
+ if not isinstance(value, list):
+ return []
+ return [str(x) for x in value]
+
+
+def _parse_response(raw: str, mode: JudgeMode) -> ScenarioJudgeResult:
+ """Parse a judge JSON response into a ``ScenarioJudgeResult``.
+
+ Tolerates ```` blocks and code fences. Returns a result whose
+ ``error`` is set when JSON parsing or sub-score validation fails;
+ sub-scores then all default to ``None``.
+ """
+ sub_keys = SIMPLE_SUB_SCORES if mode == "simple" else SCENARIO_SUB_SCORES
+ flag_keys = SIMPLE_FLAGS if mode == "simple" else SCENARIO_FLAGS
+ empty = ScenarioJudgeResult(
+ mode=mode,
+ sub_scores={k: None for k in sub_keys},
+ flags={k: None for k in flag_keys},
+ lists={k: [] for k in LIST_KEYS},
+ rationale="",
+ )
+
+ try:
+ data = json.loads(_strip_envelope(raw))
+ except json.JSONDecodeError as exc:
+ empty.error = f"parse_failure: {exc}: {raw[:200]!r}"
+ return empty
+ if not isinstance(data, dict):
+ empty.error = f"parse_failure: top-level must be a JSON object, got {type(data).__name__}"
+ return empty
+
+ try:
+ sub_scores = {k: _validate_subscore(data.get(k)) for k in sub_keys}
+ except (TypeError, ValueError) as exc:
+ empty.error = f"parse_failure: {exc}"
+ return empty
+
+ return ScenarioJudgeResult(
+ mode=mode,
+ sub_scores=sub_scores,
+ flags={k: _validate_flag(data.get(k)) for k in flag_keys},
+ lists={k: _validate_list(data.get(k)) for k in LIST_KEYS},
+ rationale=str(data.get("brief_rationale") or ""),
+ )
+
+
+def evaluate_entry(
+ *,
+ client: _ChatClient,
+ entry: "DatasetEntry",
+ result: "TrialResult",
+ simple_prompt_path: Optional[str],
+ scenario_prompt_path: Optional[str],
+) -> ScenarioJudgeResult:
+ """Score ``result`` against ``entry`` using the prompt selected by ``_select_mode``.
+
+ ``simple_prompt_path`` / ``scenario_prompt_path`` are the on-disk
+ locations of the SDG-shipped judge prompts. Only the path for the
+ selected mode is required; the other may be ``None``.
+ """
+ mode = _select_mode(entry)
+ chosen_path = scenario_prompt_path if mode == "scenario" else simple_prompt_path
+ if not chosen_path:
+ raise ValueError(
+ f"judge mode={mode!r} requires {('scenario_prompt_path' if mode == 'scenario' else 'simple_prompt_path')}; "
+ "set judge._prompt_path in the config or co-locate the prompt file with the manifest."
+ )
+
+ template = _read_prompt(chosen_path)
+ user = _format_prompt(template, entry, result)
+ messages = [
+ {"role": "system", "content": _SYSTEM_INSTRUCTION},
+ {"role": "user", "content": user},
+ ]
+ try:
+ raw, _ = client.complete(messages)
+ except Exception as exc:
+ sub_keys = SIMPLE_SUB_SCORES if mode == "simple" else SCENARIO_SUB_SCORES
+ flag_keys = SIMPLE_FLAGS if mode == "simple" else SCENARIO_FLAGS
+ return ScenarioJudgeResult(
+ mode=mode,
+ sub_scores={k: None for k in sub_keys},
+ flags={k: None for k in flag_keys},
+ lists={k: [] for k in LIST_KEYS},
+ rationale="",
+ error=f"judge_api_error: {exc}",
+ )
+ return _parse_response(raw, mode=mode)
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/report.py b/nemo_retriever/src/nemo_retriever/skill_eval/report.py
index 7058d5148b..9a85eabea9 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/report.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/report.py
@@ -110,6 +110,29 @@ def _aggregate(
metrics["judge_score_mean"] = sum(judge_scores) / len(judge_scores)
metrics["judge_score_n"] = len(judge_scores)
+ legacy_scores = [r.legacy_judge_score for r in query_results if r.legacy_judge_score is not None]
+ if legacy_scores:
+ metrics["legacy_judge_score_mean"] = sum(legacy_scores) / len(legacy_scores)
+ metrics["legacy_judge_score_n"] = len(legacy_scores)
+
+ sub_means: dict[str, tuple[float, int]] = {}
+ sub_score_keys: set[str] = set()
+ for r in query_results:
+ sub_score_keys.update(r.judge_subscores.keys())
+ for key in sorted(sub_score_keys):
+ values = [v for r in query_results if (v := r.judge_subscores.get(key)) is not None]
+ if values:
+ sub_means[key] = (sum(values) / len(values), len(values))
+ if sub_means:
+ metrics["judge_sub_means"] = sub_means
+
+ mode_counts: dict[str, int] = defaultdict(int)
+ for r in query_results:
+ if r.judge_mode:
+ mode_counts[r.judge_mode] += 1
+ if mode_counts:
+ metrics["judge_mode_counts"] = dict(mode_counts)
+
tool_use_summary = next((r.tool_use_summary for r in setup_results if r.tool_use_summary), "")
return {
@@ -152,8 +175,13 @@ def _md_cell(value: Any) -> str:
def _md_row(row: dict[str, Any]) -> str:
m = row.get("metrics", {})
judge_cell = f"{m['judge_score_mean']:.2f} (n={m.get('judge_score_n', 0)})" if "judge_score_mean" in m else "-"
+ legacy_cell = (
+ f"{m['legacy_judge_score_mean']:.2f} (n={m.get('legacy_judge_score_n', 0)})"
+ if "legacy_judge_score_mean" in m
+ else "-"
+ )
return (
- "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} "
+ "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} | {legacy} "
"| {ipt:.0f} | {opt:.0f} | {cr:.0f} | {cc:.0f} | {cost} |"
).format(
run=row.get("run_name", "?"),
@@ -163,6 +191,7 @@ def _md_row(row: dict[str, Any]) -> str:
r5=m.get("recall_5", 0.0),
r10=m.get("recall_10", 0.0),
judge=judge_cell,
+ legacy=legacy_cell,
ipt=m.get("input_tokens", 0.0),
opt=m.get("output_tokens", 0.0),
cr=m.get("cache_read_input_tokens", 0.0),
@@ -172,10 +201,56 @@ def _md_row(row: dict[str, Any]) -> str:
_MAIN_TABLE_HEADER = (
- "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | q_input | q_output "
+ "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | legacy | q_input | q_output "
"| q_cache_read | q_cache_create | q_cost |"
)
-_MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|"
+_MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|---|"
+
+
+def _md_subscore_section(overall_rows: list[dict[str, Any]]) -> list[str]:
+ """Emit per-sub-score mean tables; one column per sub-score key seen."""
+ keys: list[str] = []
+ seen: set[str] = set()
+ for row in overall_rows:
+ sub_means = row.get("metrics", {}).get("judge_sub_means") or {}
+ for k in sub_means:
+ if k not in seen:
+ seen.add(k)
+ keys.append(k)
+ if not keys:
+ return []
+ header = "| run | " + " | ".join(keys) + " |"
+ divider = "|---|" + "|".join(["---"] * len(keys)) + "|"
+ lines = ["", "## Judge sub-scores (mean over trials that produced each sub-score, with N)", "", header, divider]
+ for row in overall_rows:
+ sub_means = row.get("metrics", {}).get("judge_sub_means") or {}
+ cells = []
+ for k in keys:
+ entry = sub_means.get(k)
+ cells.append("-" if entry is None else f"{entry[0]:.2f} (n={entry[1]})")
+ lines.append(f"| {row.get('run_name', '?')} | " + " | ".join(cells) + " |")
+ return lines
+
+
+def _md_mode_breakdown(overall_rows: list[dict[str, Any]]) -> list[str]:
+ """Emit a one-row-per-run table showing trial counts per judge mode."""
+ modes: list[str] = []
+ seen: set[str] = set()
+ for row in overall_rows:
+ for k in row.get("metrics", {}).get("judge_mode_counts") or {}:
+ if k not in seen:
+ seen.add(k)
+ modes.append(k)
+ if not modes:
+ return []
+ header = "| run | " + " | ".join(modes) + " |"
+ divider = "|---|" + "|".join(["---"] * len(modes)) + "|"
+ lines = ["", "## Judge mode breakdown (trials per mode)", "", header, divider]
+ for row in overall_rows:
+ counts = row.get("metrics", {}).get("judge_mode_counts") or {}
+ cells = [str(counts.get(m, 0)) for m in modes]
+ lines.append(f"| {row.get('run_name', '?')} | " + " | ".join(cells) + " |")
+ return lines
def write_summary_md(
@@ -273,6 +348,9 @@ def write_summary_md(
)
)
+ lines.extend(_md_subscore_section(overall_rows))
+ lines.extend(_md_mode_breakdown(overall_rows))
+
diag_lines = []
for row in overall_rows:
m = row.get("metrics", {})
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
index 3008dd4b7f..1e6d71ee7b 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
@@ -72,6 +72,13 @@ class TrialResult:
judge_score: int | None = None
judge_reasoning: str = ""
judge_error: str = ""
+ judge_mode: str = ""
+ judge_subscores: dict[str, int | None] = field(default_factory=dict)
+ judge_flags: dict[str, bool | None] = field(default_factory=dict)
+ judge_lists: dict[str, list[str]] = field(default_factory=dict)
+ legacy_judge_score: int | None = None
+ legacy_judge_reasoning: str = ""
+ legacy_judge_error: str = ""
tool_use_summary: str = ""
cost_available: bool = True
execution_mode: str = "linear_session"
@@ -90,6 +97,26 @@ class ConditionRun:
execution_mode: str = "linear_session"
+ANSWERABLE_SCORING_MODES: frozenset[str] = frozenset({"answerable_retrieval", "ingest_plus_answer"})
+ANSWER_REQUIRED_SCORING_MODES: frozenset[str] = frozenset(
+ {"answerable_retrieval", "ingest_plus_answer", "refusal", "capability_gap", "dispatcher_prompt"}
+)
+
+
+@dataclass
+class JudgeContext:
+ """Bag of state passed from CLI into ``_apply_judge``.
+
+ Holds the LiteLLM transport plus the two on-disk prompt paths. ``None``
+ is a valid path when no entries in the run need that mode.
+ """
+
+ client: Any
+ simple_prompt_path: str | None
+ scenario_prompt_path: str | None
+ legacy_judge: Any = None
+
+
def _remap_pdf_paths(text: str, prefixes: tuple[str, ...]) -> str:
"""Rewrite caller-supplied path prefixes in *text* to ``./pdfs/``.
@@ -1153,38 +1180,89 @@ def _run_one_turn(
return result
-UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate"})
+UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate", "scoring_mode_skip"})
-def _apply_judge(judge: Any, entry: DatasetEntry, result: TrialResult) -> None:
- """Score ``result.final_answer`` against ``entry.ground_truth_answer``.
+def _apply_judge(ctx: Any, entry: DatasetEntry, result: TrialResult) -> None:
+ """Score ``result.final_answer`` against ``entry`` via the dispatched judge.
- Missing ground truth and empty candidates are recorded as terminal
- ``judge_error`` values so ``rescore`` can skip intrinsically unscorable
- trials instead of retrying them forever.
+ Behaviour summary:
+ - ``ctx is None`` -> no-op (judge disabled).
+ - ``scoring_mode == "skip"`` -> terminal ``judge_error="scoring_mode_skip"``.
+ - Simple-mode entry with no
+ ground truth -> terminal ``judge_error="no_ground_truth"``.
+ - Empty ``final_answer`` for a
+ mode that requires an answer -> terminal ``judge_error="empty_candidate"``.
+ - Otherwise: call
+ ``judging.evaluate_entry`` and stamp sub-scores onto ``result``.
"""
- if judge is None:
+ if ctx is None:
return
- if not entry.ground_truth_answer:
+ if entry.scoring_mode == "skip":
+ result.judge_error = "scoring_mode_skip"
+ return
+
+ needs_answer_mode = entry.scoring_mode in ANSWER_REQUIRED_SCORING_MODES or entry.scoring_mode == ""
+ answerable_mode = entry.scoring_mode in ANSWERABLE_SCORING_MODES or entry.scoring_mode == ""
+ if answerable_mode and not entry.ground_truth_answer:
result.judge_error = "no_ground_truth"
return
- if not result.final_answer:
+ if needs_answer_mode and not result.final_answer:
result.judge_error = "empty_candidate"
return
+
+ from nemo_retriever.skill_eval.judging import evaluate_entry
+
try:
- verdict = judge.judge(
- query=entry.original_query,
- reference=entry.ground_truth_answer,
- candidate=result.final_answer,
+ verdict = evaluate_entry(
+ client=ctx.client,
+ entry=entry,
+ result=result,
+ simple_prompt_path=ctx.simple_prompt_path,
+ scenario_prompt_path=ctx.scenario_prompt_path,
)
except Exception as exc:
result.judge_error = f"judge_invocation_error: {exc}"
- logger.warning("LLMJudge raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True)
- return
- result.judge_score = verdict.score
- result.judge_reasoning = verdict.reasoning or ""
- if verdict.error:
- result.judge_error = verdict.error
+ logger.warning("evaluate_entry raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True)
+ verdict = None
+
+ if verdict is not None:
+ result.judge_mode = verdict.mode
+ result.judge_subscores = dict(verdict.sub_scores)
+ result.judge_flags = dict(verdict.flags)
+ result.judge_lists = dict(verdict.lists)
+ result.judge_reasoning = verdict.rationale or ""
+ if verdict.mode == "simple":
+ result.judge_score = verdict.sub_scores.get("answer_correctness")
+ if verdict.error:
+ result.judge_error = verdict.error
+
+ # Also run the legacy LLMJudge for cross-run comparability when possible.
+ # This is independent of the new judge: a failure in one does not block the
+ # other, since they use different prompts and may be sensitive to different
+ # inputs (e.g. context-window limits).
+ if ctx.legacy_judge is not None and entry.ground_truth_answer and result.final_answer:
+ from nemo_retriever.skill_eval.judging import truncate_for_judge
+
+ try:
+ legacy_verdict = ctx.legacy_judge.judge(
+ query=entry.original_query,
+ reference=entry.ground_truth_answer,
+ candidate=truncate_for_judge(result.final_answer),
+ )
+ except Exception as exc:
+ result.legacy_judge_error = f"legacy_judge_invocation_error: {exc}"
+ logger.warning(
+ "legacy LLMJudge raised for entry_id=%s: %s",
+ result.entry_id,
+ exc,
+ exc_info=True,
+ )
+ else:
+ result.legacy_judge_score = legacy_verdict.score
+ result.legacy_judge_reasoning = legacy_verdict.reasoning or ""
+ if legacy_verdict.error:
+ result.legacy_judge_error = legacy_verdict.error
def run_condition(