From ba9f4216cc6771adaffcb5432f751166b9860c29 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Wed, 27 May 2026 13:20:46 -0700
Subject: [PATCH] (skill-eval) update llm scorer prompts

---
 .../src/nemo_retriever/skill_eval/cli.py      | 152 +++++++---
 .../skill_eval/configs/skill_eval.yaml        |  14 +
 .../src/nemo_retriever/skill_eval/dataset.py  |  17 ++
 .../src/nemo_retriever/skill_eval/judging.py  | 271 ++++++++++++++++++
 .../src/nemo_retriever/skill_eval/report.py   |  84 +++++-
 .../src/nemo_retriever/skill_eval/runner.py   | 116 ++++++--
 6 files changed, 599 insertions(+), 55 deletions(-)
 create mode 100644 nemo_retriever/src/nemo_retriever/skill_eval/judging.py

diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
index ababc472c9..786fd4956c 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
@@ -92,12 +92,13 @@ def _preflight_judge_endpoint(api_base: str, timeout: float = 5.0) -> None:
         )
 
 
-def _build_judge(cfg: dict) -> Optional[Any]:
-    """Construct an ``LLMJudge`` from ``cfg['judge']`` or return ``None``.
+def _build_judge(cfg: dict, *, manifest_path: Path) -> Optional[Any]:
+    """Construct a ``JudgeContext`` from ``cfg['judge']`` or return ``None``.
 
-    Skips silently (with a console note) when the API key env var is unset, so
-    runs work end-to-end without network access. Import is deferred so the
-    ``litellm`` extra isn't required when judging is disabled.
+    Skips silently (with a console note) when the API key env var is unset,
+    so runs work end-to-end without network access. Prompt paths default
+    to the manifest's parent directory if not overridden in the config;
+    the file must exist for any mode the run actually needs.
     """
     judge_cfg = cfg.get("judge") or {}
     if not judge_cfg.get("enabled", True):
@@ -109,25 +110,65 @@ def _build_judge(cfg: dict) -> Optional[Any]:
         typer.echo(f"Judge disabled: ${api_key_env} is not set in the environment.")
         return None
     try:
-        from nemo_retriever.llm.clients.judge import LLMJudge
+        from nemo_retriever.llm.clients.litellm import LiteLLMClient
     except ImportError as exc:
-        typer.echo(f"Judge disabled: failed to import LLMJudge ({exc}). Install nemo-retriever[llm].")
+        typer.echo(f"Judge disabled: failed to import LiteLLMClient ({exc}). Install nemo-retriever[llm].")
         return None
+    from nemo_retriever.skill_eval.runner import JudgeContext
+
     api_base = judge_cfg.get("api_base")
     if api_base:
         _preflight_judge_endpoint(str(api_base))
-    judge_kwargs: dict[str, Any] = {
-        "model": str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")),
-        "api_base": api_base,
-        "api_key": api_key,
-    }
-    if judge_cfg.get("temperature") is not None:
-        judge_kwargs["temperature"] = float(judge_cfg["temperature"])
-    if judge_cfg.get("max_tokens") is not None:
-        judge_kwargs["max_tokens"] = int(judge_cfg["max_tokens"])
-    judge = LLMJudge.from_kwargs(**judge_kwargs)
-    typer.echo(f"Judge enabled: model={judge.model}")
-    return judge
+
+    client = LiteLLMClient.from_kwargs(
+        model=str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")),
+        api_base=api_base,
+        api_key=api_key,
+        temperature=float(judge_cfg.get("temperature", 0.1)),
+        max_tokens=int(judge_cfg.get("max_tokens", 4096)),
+    )
+
+    legacy_judge = None
+    if judge_cfg.get("legacy_enabled", True):
+        try:
+            from nemo_retriever.llm.clients.judge import LLMJudge
+        except ImportError as exc:
+            typer.echo(f"Legacy judge disabled: failed to import LLMJudge ({exc}).")
+        else:
+            legacy_judge = LLMJudge.from_kwargs(
+                model=str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")),
+                api_base=api_base,
+                api_key=api_key,
+                temperature=float(judge_cfg.get("temperature", 0.1)),
+                max_tokens=int(judge_cfg.get("max_tokens", 4096)),
+            )
+
+    manifest_dir = Path(manifest_path).expanduser().resolve().parent
+    simple_path = judge_cfg.get("simple_prompt_path")
+    scenario_path = judge_cfg.get("scenario_prompt_path")
+    simple_resolved = (
+        Path(str(simple_path)).expanduser().resolve() if simple_path else manifest_dir / "llm_scorer_prompt.md"
+    )
+    scenario_resolved = (
+        Path(str(scenario_path)).expanduser().resolve()
+        if scenario_path
+        else manifest_dir / "llm_scenario_scorer_prompt.md"
+    )
+    ctx = JudgeContext(
+        client=client,
+        simple_prompt_path=str(simple_resolved) if simple_resolved.is_file() else None,
+        scenario_prompt_path=str(scenario_resolved) if scenario_resolved.is_file() else None,
+        legacy_judge=legacy_judge,
+    )
+    typer.echo(
+        "Judge enabled: model={m}  simple_prompt={s}  scenario_prompt={c}  legacy={legacy}".format(
+            m=client.transport.model,
+            s=ctx.simple_prompt_path or "(missing)",
+            c=ctx.scenario_prompt_path or "(missing)",
+            legacy="on" if ctx.legacy_judge is not None else "off",
+        )
+    )
+    return ctx
 
 
 def _build_trace_summarizer(cfg: dict) -> Optional[Any]:
@@ -345,6 +386,15 @@ def run_command(
             "Defaults to config.query_parallelism, then 1 (linear session)."
         ),
     ),
+    limit_queries: Optional[int] = typer.Option(
+        None,
+        "--limit-queries",
+        min=1,
+        help=(
+            "Cap queries per domain to the first N entries (deterministic). "
+            "Useful for smoke-tests; omit for the full sweep."
+        ),
+    ),
 ) -> None:
     """Run the benchmark across the dataset's domains x selected conditions."""
     logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
@@ -378,6 +428,10 @@ def run_command(
             raise typer.Exit(code=2)
         by_domain = {d: by_domain[d] for d in wanted}
 
+    if limit_queries is not None:
+        by_domain = {d: es[:limit_queries] for d, es in by_domain.items()}
+        typer.echo(f"--limit-queries={limit_queries}: capping each domain to its first {limit_queries} entries.")
+
     domain_order = sorted(by_domain.keys())
     typer.echo(f"Domains in this run: {domain_order} ({sum(len(v) for v in by_domain.values())} entries total)")
 
@@ -403,7 +457,7 @@ def run_command(
         raise typer.Exit(code=2)
     testdata_prefixes = tuple(str(p) for p in testdata_prefixes_raw)
 
-    judge = _build_judge(cfg)
+    judge = _build_judge(cfg, manifest_path=Path(str(manifest_path)).expanduser().resolve())
     summarizer = _build_trace_summarizer(cfg)
 
     base_dir = str(artifacts_root) if artifacts_root else None
@@ -474,7 +528,16 @@ def run_command(
             for r in results:
                 save_trial(r, session_dir)
                 kind = "setup" if r.is_setup else f"entry_id={r.entry_id} query_id={r.query_id}"
-                judge_str = "" if r.is_setup or r.judge_score is None else f" judge={r.judge_score}"
+                judge_parts: list[str] = []
+                if not r.is_setup:
+                    if r.judge_score is not None:
+                        judge_parts.append(f"judge={r.judge_score}")
+                    elif any(v is not None for v in r.judge_subscores.values()):
+                        n = sum(1 for v in r.judge_subscores.values() if v is not None)
+                        judge_parts.append(f"judge_mode={r.judge_mode}/sub_n={n}")
+                    if r.legacy_judge_score is not None:
+                        judge_parts.append(f"legacy={r.legacy_judge_score}")
+                judge_str = (" " + " ".join(judge_parts)) if judge_parts else ""
                 cost_str = f"${r.total_cost_usd:.3f}" if r.cost_available else "n/a"
                 trace_str = f" trace={r.compact_trace_path}" if r.compact_trace_path else ""
                 typer.echo(
@@ -500,21 +563,34 @@ def run_command(
     if judge is not None:
         typer.echo("\nLLM-as-judge scores (mean over query turns, 0-5 scale):")
         for cond in selected:
-            scored: list[int] = []
+            simple_scored: list[int] = []
+            scenario_scored = 0
+            legacy_scored: list[int] = []
             errored = 0
             for domain in domain_order:
                 for r in results_by_key.get((agent, cond, domain), []):
                     if r.is_setup:
                         continue
+                    has_subscores = any(v is not None for v in r.judge_subscores.values())
                     if r.judge_score is not None:
-                        scored.append(int(r.judge_score))
+                        simple_scored.append(int(r.judge_score))
+                    elif has_subscores:
+                        scenario_scored += 1
                     elif r.judge_error:
                         errored += 1
-            if scored:
-                mean_score = sum(scored) / len(scored)
-                typer.echo(f"  {agent}/{cond}: mean={mean_score:.2f}  n={len(scored)}  errors={errored}")
-            else:
-                typer.echo(f"  {agent}/{cond}: no scores  errors={errored} (check judge config / litellm install)")
+                    if r.legacy_judge_score is not None:
+                        legacy_scored.append(int(r.legacy_judge_score))
+            parts: list[str] = []
+            if simple_scored:
+                parts.append(f"simple mean={sum(simple_scored) / len(simple_scored):.2f} n={len(simple_scored)}")
+            if scenario_scored:
+                parts.append(f"scenario_scored={scenario_scored}")
+            if legacy_scored:
+                parts.append(f"legacy mean={sum(legacy_scored) / len(legacy_scored):.2f} n={len(legacy_scored)}")
+            parts.append(f"errors={errored}")
+            if not simple_scored and not scenario_scored and not legacy_scored:
+                parts.append("(check judge config / litellm install)")
+            typer.echo(f"  {agent}/{cond}: " + "  ".join(parts))
 
     json_path, md_path = write_summary(
         session_dir=session_dir,
@@ -537,8 +613,9 @@ def _needs_rescore(trial: dict[str, Any]) -> bool:
     judge_error = trial.get("judge_error") or ""
     if judge_error in UNSCORABLE_JUDGE_ERRORS:
         return False
-    score = trial.get("judge_score")
-    if score is None:
+    sub_scores = trial.get("judge_subscores") or {}
+    scored = trial.get("judge_score") is not None or any(v is not None for v in sub_scores.values())
+    if not scored:
         return True
     if judge_error:
         return True
@@ -621,7 +698,7 @@ def rescore_command(
     entries = load_eval_manifest(Path(str(manifest_path)).expanduser().resolve())
     entries_by_id = {e.entry_id: e for e in entries}
 
-    judge = _build_judge(cfg)
+    judge = _build_judge(cfg, manifest_path=Path(str(manifest_path)).expanduser().resolve())
     if judge is None:
         typer.echo("Error: judge is not configured (see messages above). Cannot rescore.", err=True)
         raise typer.Exit(code=2)
@@ -659,15 +736,24 @@ def rescore_command(
         result.judge_score = None
         result.judge_reasoning = ""
         result.judge_error = ""
+        result.judge_mode = ""
+        result.judge_subscores = {}
+        result.judge_flags = {}
+        result.judge_lists = {}
 
         _apply_judge(judge, entry, result)
 
         raw.update(asdict(result))
         path.write_text(json.dumps(raw, indent=2) + "\n", encoding="utf-8")
 
-        if result.judge_score is not None:
+        scored_ok = result.judge_score is not None or any(v is not None for v in result.judge_subscores.values())
+        if scored_ok:
             rescored += 1
-            typer.echo(f"  {path.name}: entry_id={result.entry_id} judge={result.judge_score}")
+            if result.judge_mode == "simple" and result.judge_score is not None:
+                typer.echo(f"  {path.name}: entry_id={result.entry_id} judge={result.judge_score}")
+            else:
+                n_sub = sum(1 for v in result.judge_subscores.values() if v is not None)
+                typer.echo(f"  {path.name}: entry_id={result.entry_id} mode={result.judge_mode} sub_scores={n_sub}")
         elif result.judge_error in UNSCORABLE_JUDGE_ERRORS:
             unscorable += 1
             typer.echo(f"  {path.name}: entry_id={result.entry_id} unscorable ({result.judge_error})")
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
index 6b2a164e29..429e3c87f1 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
@@ -107,6 +107,20 @@ judge:
   api_key_env: NVIDIA_API_KEY
   temperature: 0.1
   max_tokens: 4096
+  # On-disk path to the simple QA judge prompt (batch_1's llm_scorer_prompt.md).
+  # When null, the runner looks for ``llm_scorer_prompt.md`` next to the manifest.
+  # Required only if the manifest contains entries WITHOUT a `scoring_mode` field.
+  simple_prompt_path: null
+  # On-disk path to the scenario-aware judge prompt (batch_2's llm_scenario_scorer_prompt.md).
+  # When null, the runner looks for ``llm_scenario_scorer_prompt.md`` next to the manifest.
+  # Required only if the manifest contains entries WITH a `scoring_mode` field.
+  scenario_prompt_path: null
+  # When true (default), ALSO run the legacy hardcoded LLMJudge on every
+  # scoreable trial (one with a non-empty ground_truth_answer + non-empty
+  # final_answer). Produces an additional 1-5 score directly comparable to
+  # runs that pre-date the scenario-aware judge. Set to false to skip the
+  # second judge call and halve judge LLM cost.
+  legacy_enabled: true
 
 # ---------------------------------------------------------------------------
 # Tool-use summarizer
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
index e3c2e89963..3d1dd2526d 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py
@@ -40,6 +40,13 @@ class DatasetEntry(BaseModel):
     ground_truth_answer: str = ""
     domain: str = ""
     domain_label: str = ""
+    scoring_mode: str = ""
+    category: str = ""
+    phase: str = ""
+    expected_action: str = ""
+    expected_output_shape: str = ""
+    validation_signal: str = ""
+    raw_answers: list[str] = []
 
 
 def _select_prompt(candidates: list[dict[str, Any]], selected_variant: int | None) -> str:
@@ -121,6 +128,9 @@ def load_eval_manifest(path: Path) -> list[DatasetEntry]:
                 continue
             pages.append(GroundTruthPage(doc_id=str(doc_id), page_number=int(page), score=int(p.get("score") or 1)))
 
+        raw_answers = item.get("raw_answers") or []
+        if not isinstance(raw_answers, list):
+            raw_answers = []
         entries.append(
             DatasetEntry(
                 entry_id=idx,
@@ -132,6 +142,13 @@ def load_eval_manifest(path: Path) -> list[DatasetEntry]:
                 ground_truth_answer=str(item.get("answer") or ""),
                 domain=domain,
                 domain_label=domain_label,
+                scoring_mode=str(item.get("scoring_mode") or ""),
+                category=str(item.get("category") or ""),
+                phase=str(item.get("phase") or ""),
+                expected_action=str(item.get("expected_action") or ""),
+                expected_output_shape=str(item.get("expected_output_shape") or ""),
+                validation_signal=str(item.get("validation_signal") or ""),
+                raw_answers=[str(x) for x in raw_answers],
             )
         )
     return entries
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/judging.py b/nemo_retriever/src/nemo_retriever/skill_eval/judging.py
new file mode 100644
index 0000000000..179c65c7de
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/judging.py
@@ -0,0 +1,271 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Scenario-aware LLM-as-judge for skill_eval.
+
+Routes each trial through one of two prompts based on whether the
+dataset entry carries a ``scoring_mode``:
+
+- ``simple``  -> batch_1 ``llm_scorer_prompt.md`` style (answer-grading).
+- ``scenario`` -> batch_2 ``llm_scenario_scorer_prompt.md`` style (handles
+  ingest_only / extract_only / refusal / capability_gap / dispatcher_prompt /
+  skip in addition to answerable_retrieval / ingest_plus_answer).
+
+Both prompts live on disk in the SDG run directory that produced the
+manifest, so this module never vendors prompt text. See
+``cli._build_judge`` for how paths are resolved.
+"""
+
+from __future__ import annotations
+
+import functools
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal, Optional, Protocol
+
+if TYPE_CHECKING:
+    from nemo_retriever.skill_eval.dataset import DatasetEntry
+    from nemo_retriever.skill_eval.runner import TrialResult
+
+
+JudgeMode = Literal["simple", "scenario"]
+
+
+SIMPLE_SUB_SCORES: tuple[str, ...] = ("answer_correctness", "citation_quality", "faithfulness")
+SCENARIO_SUB_SCORES: tuple[str, ...] = (
+    "action_correctness",
+    "routing_correctness",
+    "answer_correctness",
+    "citation_quality",
+    "faithfulness",
+    "refusal_correctness",
+)
+SIMPLE_FLAGS: tuple[str, ...] = ("is_answer_correct", "is_citation_correct")
+SCENARIO_FLAGS: tuple[str, ...] = ("is_action_correct", "is_answer_correct", "is_citation_correct")
+LIST_KEYS: tuple[str, ...] = ("major_errors", "missing_facts", "unsupported_claims")
+
+
+@dataclass
+class ScenarioJudgeResult:
+    """Structured judge output, shared between simple and scenario prompts."""
+
+    mode: JudgeMode
+    sub_scores: dict[str, Optional[int]] = field(default_factory=dict)
+    flags: dict[str, Optional[bool]] = field(default_factory=dict)
+    lists: dict[str, list[str]] = field(default_factory=dict)
+    rationale: str = ""
+    error: Optional[str] = None
+
+
+@functools.lru_cache(maxsize=8)
+def _read_prompt(path: str) -> str:
+    return Path(path).read_text(encoding="utf-8")
+
+
+def _select_mode(entry: "DatasetEntry") -> JudgeMode:
+    """Pick the prompt mode for ``entry`` based on its ``scoring_mode`` field."""
+    return "scenario" if entry.scoring_mode else "simple"
+
+
+def _render_pages(entry: "DatasetEntry") -> str:
+    parts = [f"{p.doc_id}:{p.page_number}" for p in entry.ground_truth_pages]
+    return ", ".join(parts)
+
+
+def _render_cited(result: "TrialResult") -> str:
+    parts = [f"{item.get('doc_id')}:{item.get('page_number')}" for item in result.ranked_retrieved]
+    return ", ".join(parts)
+
+
+def _render_raw_answers(entry: "DatasetEntry") -> str:
+    return "; ".join(entry.raw_answers)
+
+
+# Cap ``agent_answer`` at ~20K chars before stuffing into the judge prompt.
+# A handful of trials (notably ``extract_only`` scenarios where the agent
+# dumps the whole corpus into its answer) otherwise blow past the judge
+# model's context window. 20K chars ≈ 5-6K tokens leaves plenty of headroom
+# for the system instructions + prompt template + reference content.
+JUDGE_ANSWER_CHAR_CAP = 20000
+_TRUNCATION_MARKER = f"\n\n... [truncated for judge: original answer exceeded {JUDGE_ANSWER_CHAR_CAP} chars]"
+
+
+def truncate_for_judge(text: str, cap: int = JUDGE_ANSWER_CHAR_CAP) -> str:
+    """Cap ``text`` at ``cap`` chars with a visible marker.
+
+    Both the new judge (via ``_format_prompt``) and the legacy ``LLMJudge``
+    pass through this helper so the cap is applied identically across both
+    scoring paths. The marker makes the truncation visible to the judge so
+    its rationale can flag the gap rather than silently mis-scoring.
+    """
+    if len(text) <= cap:
+        return text
+    head_chars = max(0, cap - len(_TRUNCATION_MARKER))
+    return text[:head_chars] + _TRUNCATION_MARKER
+
+
+def _format_prompt(template: str, entry: "DatasetEntry", result: "TrialResult") -> str:
+    """Substitute the prompt slots for ``entry`` / ``result`` into ``template``.
+
+    Uses literal-token replacement for each known slot, NOT ``str.format_map``.
+    The SDG-shipped judge prompts contain literal JSON-schema blocks like
+    ``{ "action_correctness": <0-5>, ... }`` which a Python format-string
+    parser would misinterpret as substitution slots with invalid format
+    specifiers. With ``str.replace`` those literals pass through verbatim and
+    only our enumerated slot tokens are substituted.
+
+    Slots that the template doesn't reference are simply ignored. Slots the
+    template does reference but that we don't know about are left as literal
+    ``{slot_name}`` in the output — the judge LLM is robust to that, and
+    leaving them visible aids debugging compared to silently rendering empty.
+    """
+    substitutions: list[tuple[str, str]] = [
+        ("{query}", entry.original_query),
+        ("{reference_answer}", entry.ground_truth_answer),
+        ("{raw_answers}", _render_raw_answers(entry)),
+        ("{relevant_pages}", _render_pages(entry)),
+        ("{agent_answer}", truncate_for_judge(result.final_answer)),
+        ("{agent_cited_pages}", _render_cited(result)),
+        ("{cited_evidence}", ""),
+        ("{scenario_prompt}", entry.paraphrased_prompt),
+        ("{category}", entry.category),
+        ("{phase}", entry.phase),
+        ("{scoring_mode}", entry.scoring_mode),
+        ("{expected_action}", entry.expected_action),
+        ("{expected_output_shape}", entry.expected_output_shape),
+        ("{validation_signal}", entry.validation_signal),
+    ]
+    out = template
+    for token, value in substitutions:
+        out = out.replace(token, value)
+    return out
+
+
+class _ChatClient(Protocol):
+    """Minimal slice of LiteLLMClient that this module needs."""
+
+    def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tuple[str, float]: ...
+
+
+_SYSTEM_INSTRUCTION = "Respond with ONLY valid JSON matching the schema in the user message. No prose, no markdown."
+
+
+def _strip_envelope(raw: str) -> str:
+    text = raw.strip()
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
+    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
+    text = re.sub(r"\s*```$", "", text, flags=re.MULTILINE)
+    return text.strip()
+
+
+def _validate_subscore(value: Any) -> Optional[int]:
+    if value is None:
+        return None
+    if not isinstance(value, int) or isinstance(value, bool):
+        raise TypeError(f"sub-score must be an integer, got {type(value).__name__}: {value!r}")
+    score = value
+    if not 0 <= score <= 5:
+        raise ValueError(f"sub-score {score} out of range 0-5")
+    return score
+
+
+def _validate_flag(value: Any) -> Optional[bool]:
+    if value is None:
+        return None
+    return bool(value)
+
+
+def _validate_list(value: Any) -> list[str]:
+    if value is None:
+        return []
+    if not isinstance(value, list):
+        return []
+    return [str(x) for x in value]
+
+
+def _parse_response(raw: str, mode: JudgeMode) -> ScenarioJudgeResult:
+    """Parse a judge JSON response into a ``ScenarioJudgeResult``.
+
+    Tolerates ``<think>`` blocks and code fences. Returns a result whose
+    ``error`` is set when JSON parsing or sub-score validation fails;
+    sub-scores then all default to ``None``.
+    """
+    sub_keys = SIMPLE_SUB_SCORES if mode == "simple" else SCENARIO_SUB_SCORES
+    flag_keys = SIMPLE_FLAGS if mode == "simple" else SCENARIO_FLAGS
+    empty = ScenarioJudgeResult(
+        mode=mode,
+        sub_scores={k: None for k in sub_keys},
+        flags={k: None for k in flag_keys},
+        lists={k: [] for k in LIST_KEYS},
+        rationale="",
+    )
+
+    try:
+        data = json.loads(_strip_envelope(raw))
+    except json.JSONDecodeError as exc:
+        empty.error = f"parse_failure: {exc}: {raw[:200]!r}"
+        return empty
+    if not isinstance(data, dict):
+        empty.error = f"parse_failure: top-level must be a JSON object, got {type(data).__name__}"
+        return empty
+
+    try:
+        sub_scores = {k: _validate_subscore(data.get(k)) for k in sub_keys}
+    except (TypeError, ValueError) as exc:
+        empty.error = f"parse_failure: {exc}"
+        return empty
+
+    return ScenarioJudgeResult(
+        mode=mode,
+        sub_scores=sub_scores,
+        flags={k: _validate_flag(data.get(k)) for k in flag_keys},
+        lists={k: _validate_list(data.get(k)) for k in LIST_KEYS},
+        rationale=str(data.get("brief_rationale") or ""),
+    )
+
+
+def evaluate_entry(
+    *,
+    client: _ChatClient,
+    entry: "DatasetEntry",
+    result: "TrialResult",
+    simple_prompt_path: Optional[str],
+    scenario_prompt_path: Optional[str],
+) -> ScenarioJudgeResult:
+    """Score ``result`` against ``entry`` using the prompt selected by ``_select_mode``.
+
+    ``simple_prompt_path`` / ``scenario_prompt_path`` are the on-disk
+    locations of the SDG-shipped judge prompts. Only the path for the
+    selected mode is required; the other may be ``None``.
+    """
+    mode = _select_mode(entry)
+    chosen_path = scenario_prompt_path if mode == "scenario" else simple_prompt_path
+    if not chosen_path:
+        raise ValueError(
+            f"judge mode={mode!r} requires {('scenario_prompt_path' if mode == 'scenario' else 'simple_prompt_path')}; "
+            "set judge.<mode>_prompt_path in the config or co-locate the prompt file with the manifest."
+        )
+
+    template = _read_prompt(chosen_path)
+    user = _format_prompt(template, entry, result)
+    messages = [
+        {"role": "system", "content": _SYSTEM_INSTRUCTION},
+        {"role": "user", "content": user},
+    ]
+    try:
+        raw, _ = client.complete(messages)
+    except Exception as exc:
+        sub_keys = SIMPLE_SUB_SCORES if mode == "simple" else SCENARIO_SUB_SCORES
+        flag_keys = SIMPLE_FLAGS if mode == "simple" else SCENARIO_FLAGS
+        return ScenarioJudgeResult(
+            mode=mode,
+            sub_scores={k: None for k in sub_keys},
+            flags={k: None for k in flag_keys},
+            lists={k: [] for k in LIST_KEYS},
+            rationale="",
+            error=f"judge_api_error: {exc}",
+        )
+    return _parse_response(raw, mode=mode)
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/report.py b/nemo_retriever/src/nemo_retriever/skill_eval/report.py
index 7058d5148b..9a85eabea9 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/report.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/report.py
@@ -110,6 +110,29 @@ def _aggregate(
         metrics["judge_score_mean"] = sum(judge_scores) / len(judge_scores)
         metrics["judge_score_n"] = len(judge_scores)
 
+    legacy_scores = [r.legacy_judge_score for r in query_results if r.legacy_judge_score is not None]
+    if legacy_scores:
+        metrics["legacy_judge_score_mean"] = sum(legacy_scores) / len(legacy_scores)
+        metrics["legacy_judge_score_n"] = len(legacy_scores)
+
+    sub_means: dict[str, tuple[float, int]] = {}
+    sub_score_keys: set[str] = set()
+    for r in query_results:
+        sub_score_keys.update(r.judge_subscores.keys())
+    for key in sorted(sub_score_keys):
+        values = [v for r in query_results if (v := r.judge_subscores.get(key)) is not None]
+        if values:
+            sub_means[key] = (sum(values) / len(values), len(values))
+    if sub_means:
+        metrics["judge_sub_means"] = sub_means
+
+    mode_counts: dict[str, int] = defaultdict(int)
+    for r in query_results:
+        if r.judge_mode:
+            mode_counts[r.judge_mode] += 1
+    if mode_counts:
+        metrics["judge_mode_counts"] = dict(mode_counts)
+
     tool_use_summary = next((r.tool_use_summary for r in setup_results if r.tool_use_summary), "")
 
     return {
@@ -152,8 +175,13 @@ def _md_cell(value: Any) -> str:
 def _md_row(row: dict[str, Any]) -> str:
     m = row.get("metrics", {})
     judge_cell = f"{m['judge_score_mean']:.2f} (n={m.get('judge_score_n', 0)})" if "judge_score_mean" in m else "-"
+    legacy_cell = (
+        f"{m['legacy_judge_score_mean']:.2f} (n={m.get('legacy_judge_score_n', 0)})"
+        if "legacy_judge_score_mean" in m
+        else "-"
+    )
     return (
-        "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} "
+        "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} | {legacy} "
         "| {ipt:.0f} | {opt:.0f} | {cr:.0f} | {cc:.0f} | {cost} |"
     ).format(
         run=row.get("run_name", "?"),
@@ -163,6 +191,7 @@ def _md_row(row: dict[str, Any]) -> str:
         r5=m.get("recall_5", 0.0),
         r10=m.get("recall_10", 0.0),
         judge=judge_cell,
+        legacy=legacy_cell,
         ipt=m.get("input_tokens", 0.0),
         opt=m.get("output_tokens", 0.0),
         cr=m.get("cache_read_input_tokens", 0.0),
@@ -172,10 +201,56 @@ def _md_row(row: dict[str, Any]) -> str:
 
 
 _MAIN_TABLE_HEADER = (
-    "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | q_input | q_output "
+    "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | legacy | q_input | q_output "
     "| q_cache_read | q_cache_create | q_cost |"
 )
-_MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|"
+_MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|---|"
+
+
+def _md_subscore_section(overall_rows: list[dict[str, Any]]) -> list[str]:
+    """Emit per-sub-score mean tables; one column per sub-score key seen."""
+    keys: list[str] = []
+    seen: set[str] = set()
+    for row in overall_rows:
+        sub_means = row.get("metrics", {}).get("judge_sub_means") or {}
+        for k in sub_means:
+            if k not in seen:
+                seen.add(k)
+                keys.append(k)
+    if not keys:
+        return []
+    header = "| run | " + " | ".join(keys) + " |"
+    divider = "|---|" + "|".join(["---"] * len(keys)) + "|"
+    lines = ["", "## Judge sub-scores (mean over trials that produced each sub-score, with N)", "", header, divider]
+    for row in overall_rows:
+        sub_means = row.get("metrics", {}).get("judge_sub_means") or {}
+        cells = []
+        for k in keys:
+            entry = sub_means.get(k)
+            cells.append("-" if entry is None else f"{entry[0]:.2f} (n={entry[1]})")
+        lines.append(f"| {row.get('run_name', '?')} | " + " | ".join(cells) + " |")
+    return lines
+
+
+def _md_mode_breakdown(overall_rows: list[dict[str, Any]]) -> list[str]:
+    """Emit a one-row-per-run table showing trial counts per judge mode."""
+    modes: list[str] = []
+    seen: set[str] = set()
+    for row in overall_rows:
+        for k in row.get("metrics", {}).get("judge_mode_counts") or {}:
+            if k not in seen:
+                seen.add(k)
+                modes.append(k)
+    if not modes:
+        return []
+    header = "| run | " + " | ".join(modes) + " |"
+    divider = "|---|" + "|".join(["---"] * len(modes)) + "|"
+    lines = ["", "## Judge mode breakdown (trials per mode)", "", header, divider]
+    for row in overall_rows:
+        counts = row.get("metrics", {}).get("judge_mode_counts") or {}
+        cells = [str(counts.get(m, 0)) for m in modes]
+        lines.append(f"| {row.get('run_name', '?')} | " + " | ".join(cells) + " |")
+    return lines
 
 
 def write_summary_md(
@@ -273,6 +348,9 @@ def write_summary_md(
             )
         )
 
+    lines.extend(_md_subscore_section(overall_rows))
+    lines.extend(_md_mode_breakdown(overall_rows))
+
     diag_lines = []
     for row in overall_rows:
         m = row.get("metrics", {})
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
index 3008dd4b7f..1e6d71ee7b 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
@@ -72,6 +72,13 @@ class TrialResult:
     judge_score: int | None = None
     judge_reasoning: str = ""
     judge_error: str = ""
+    judge_mode: str = ""
+    judge_subscores: dict[str, int | None] = field(default_factory=dict)
+    judge_flags: dict[str, bool | None] = field(default_factory=dict)
+    judge_lists: dict[str, list[str]] = field(default_factory=dict)
+    legacy_judge_score: int | None = None
+    legacy_judge_reasoning: str = ""
+    legacy_judge_error: str = ""
     tool_use_summary: str = ""
     cost_available: bool = True
     execution_mode: str = "linear_session"
@@ -90,6 +97,26 @@ class ConditionRun:
     execution_mode: str = "linear_session"
 
 
+ANSWERABLE_SCORING_MODES: frozenset[str] = frozenset({"answerable_retrieval", "ingest_plus_answer"})
+ANSWER_REQUIRED_SCORING_MODES: frozenset[str] = frozenset(
+    {"answerable_retrieval", "ingest_plus_answer", "refusal", "capability_gap", "dispatcher_prompt"}
+)
+
+
+@dataclass
+class JudgeContext:
+    """Bag of state passed from CLI into ``_apply_judge``.
+
+    Holds the LiteLLM transport plus the two on-disk prompt paths. ``None``
+    is a valid path when no entries in the run need that mode.
+    """
+
+    client: Any
+    simple_prompt_path: str | None
+    scenario_prompt_path: str | None
+    legacy_judge: Any = None
+
+
 def _remap_pdf_paths(text: str, prefixes: tuple[str, ...]) -> str:
     """Rewrite caller-supplied path prefixes in *text* to ``./pdfs/``.
 
@@ -1153,38 +1180,89 @@ def _run_one_turn(
     return result
 
 
-UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate"})
+UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate", "scoring_mode_skip"})
 
 
-def _apply_judge(judge: Any, entry: DatasetEntry, result: TrialResult) -> None:
-    """Score ``result.final_answer`` against ``entry.ground_truth_answer``.
+def _apply_judge(ctx: Any, entry: DatasetEntry, result: TrialResult) -> None:
+    """Score ``result.final_answer`` against ``entry`` via the dispatched judge.
 
-    Missing ground truth and empty candidates are recorded as terminal
-    ``judge_error`` values so ``rescore`` can skip intrinsically unscorable
-    trials instead of retrying them forever.
+    Behaviour summary:
+      - ``ctx is None``               -> no-op (judge disabled).
+      - ``scoring_mode == "skip"``    -> terminal ``judge_error="scoring_mode_skip"``.
+      - Simple-mode entry with no
+        ground truth                  -> terminal ``judge_error="no_ground_truth"``.
+      - Empty ``final_answer`` for a
+        mode that requires an answer  -> terminal ``judge_error="empty_candidate"``.
+      - Otherwise: call
+        ``judging.evaluate_entry`` and stamp sub-scores onto ``result``.
     """
-    if judge is None:
+    if ctx is None:
         return
-    if not entry.ground_truth_answer:
+    if entry.scoring_mode == "skip":
+        result.judge_error = "scoring_mode_skip"
+        return
+
+    needs_answer_mode = entry.scoring_mode in ANSWER_REQUIRED_SCORING_MODES or entry.scoring_mode == ""
+    answerable_mode = entry.scoring_mode in ANSWERABLE_SCORING_MODES or entry.scoring_mode == ""
+    if answerable_mode and not entry.ground_truth_answer:
         result.judge_error = "no_ground_truth"
         return
-    if not result.final_answer:
+    if needs_answer_mode and not result.final_answer:
         result.judge_error = "empty_candidate"
         return
+
+    from nemo_retriever.skill_eval.judging import evaluate_entry
+
     try:
-        verdict = judge.judge(
-            query=entry.original_query,
-            reference=entry.ground_truth_answer,
-            candidate=result.final_answer,
+        verdict = evaluate_entry(
+            client=ctx.client,
+            entry=entry,
+            result=result,
+            simple_prompt_path=ctx.simple_prompt_path,
+            scenario_prompt_path=ctx.scenario_prompt_path,
         )
     except Exception as exc:
         result.judge_error = f"judge_invocation_error: {exc}"
-        logger.warning("LLMJudge raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True)
-        return
-    result.judge_score = verdict.score
-    result.judge_reasoning = verdict.reasoning or ""
-    if verdict.error:
-        result.judge_error = verdict.error
+        logger.warning("evaluate_entry raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True)
+        verdict = None
+
+    if verdict is not None:
+        result.judge_mode = verdict.mode
+        result.judge_subscores = dict(verdict.sub_scores)
+        result.judge_flags = dict(verdict.flags)
+        result.judge_lists = dict(verdict.lists)
+        result.judge_reasoning = verdict.rationale or ""
+        if verdict.mode == "simple":
+            result.judge_score = verdict.sub_scores.get("answer_correctness")
+        if verdict.error:
+            result.judge_error = verdict.error
+
+    # Also run the legacy LLMJudge for cross-run comparability when possible.
+    # This is independent of the new judge: a failure in one does not block the
+    # other, since they use different prompts and may be sensitive to different
+    # inputs (e.g. context-window limits).
+    if ctx.legacy_judge is not None and entry.ground_truth_answer and result.final_answer:
+        from nemo_retriever.skill_eval.judging import truncate_for_judge
+
+        try:
+            legacy_verdict = ctx.legacy_judge.judge(
+                query=entry.original_query,
+                reference=entry.ground_truth_answer,
+                candidate=truncate_for_judge(result.final_answer),
+            )
+        except Exception as exc:
+            result.legacy_judge_error = f"legacy_judge_invocation_error: {exc}"
+            logger.warning(
+                "legacy LLMJudge raised for entry_id=%s: %s",
+                result.entry_id,
+                exc,
+                exc_info=True,
+            )
+        else:
+            result.legacy_judge_score = legacy_verdict.score
+            result.legacy_judge_reasoning = legacy_verdict.reasoning or ""
+            if legacy_verdict.error:
+                result.legacy_judge_error = legacy_verdict.error
 
 
 def run_condition(