From ba9f4216cc6771adaffcb5432f751166b9860c29 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 27 May 2026 13:20:46 -0700 Subject: [PATCH] (skill-eval) update llm scorer prompts --- .../src/nemo_retriever/skill_eval/cli.py | 152 +++++++--- .../skill_eval/configs/skill_eval.yaml | 14 + .../src/nemo_retriever/skill_eval/dataset.py | 17 ++ .../src/nemo_retriever/skill_eval/judging.py | 271 ++++++++++++++++++ .../src/nemo_retriever/skill_eval/report.py | 84 +++++- .../src/nemo_retriever/skill_eval/runner.py | 116 ++++++-- 6 files changed, 599 insertions(+), 55 deletions(-) create mode 100644 nemo_retriever/src/nemo_retriever/skill_eval/judging.py diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py index ababc472c9..786fd4956c 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py @@ -92,12 +92,13 @@ def _preflight_judge_endpoint(api_base: str, timeout: float = 5.0) -> None: ) -def _build_judge(cfg: dict) -> Optional[Any]: - """Construct an ``LLMJudge`` from ``cfg['judge']`` or return ``None``. +def _build_judge(cfg: dict, *, manifest_path: Path) -> Optional[Any]: + """Construct a ``JudgeContext`` from ``cfg['judge']`` or return ``None``. - Skips silently (with a console note) when the API key env var is unset, so - runs work end-to-end without network access. Import is deferred so the - ``litellm`` extra isn't required when judging is disabled. + Skips silently (with a console note) when the API key env var is unset, + so runs work end-to-end without network access. Prompt paths default + to the manifest's parent directory if not overridden in the config; + the file must exist for any mode the run actually needs. """ judge_cfg = cfg.get("judge") or {} if not judge_cfg.get("enabled", True): @@ -109,25 +110,65 @@ def _build_judge(cfg: dict) -> Optional[Any]: typer.echo(f"Judge disabled: ${api_key_env} is not set in the environment.") return None try: - from nemo_retriever.llm.clients.judge import LLMJudge + from nemo_retriever.llm.clients.litellm import LiteLLMClient except ImportError as exc: - typer.echo(f"Judge disabled: failed to import LLMJudge ({exc}). Install nemo-retriever[llm].") + typer.echo(f"Judge disabled: failed to import LiteLLMClient ({exc}). Install nemo-retriever[llm].") return None + from nemo_retriever.skill_eval.runner import JudgeContext + api_base = judge_cfg.get("api_base") if api_base: _preflight_judge_endpoint(str(api_base)) - judge_kwargs: dict[str, Any] = { - "model": str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")), - "api_base": api_base, - "api_key": api_key, - } - if judge_cfg.get("temperature") is not None: - judge_kwargs["temperature"] = float(judge_cfg["temperature"]) - if judge_cfg.get("max_tokens") is not None: - judge_kwargs["max_tokens"] = int(judge_cfg["max_tokens"]) - judge = LLMJudge.from_kwargs(**judge_kwargs) - typer.echo(f"Judge enabled: model={judge.model}") - return judge + + client = LiteLLMClient.from_kwargs( + model=str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")), + api_base=api_base, + api_key=api_key, + temperature=float(judge_cfg.get("temperature", 0.1)), + max_tokens=int(judge_cfg.get("max_tokens", 4096)), + ) + + legacy_judge = None + if judge_cfg.get("legacy_enabled", True): + try: + from nemo_retriever.llm.clients.judge import LLMJudge + except ImportError as exc: + typer.echo(f"Legacy judge disabled: failed to import LLMJudge ({exc}).") + else: + legacy_judge = LLMJudge.from_kwargs( + model=str(judge_cfg.get("model", "nvidia_nim/nvidia/llama-3.3-nemotron-super-49b-v1.5")), + api_base=api_base, + api_key=api_key, + temperature=float(judge_cfg.get("temperature", 0.1)), + max_tokens=int(judge_cfg.get("max_tokens", 4096)), + ) + + manifest_dir = Path(manifest_path).expanduser().resolve().parent + simple_path = judge_cfg.get("simple_prompt_path") + scenario_path = judge_cfg.get("scenario_prompt_path") + simple_resolved = ( + Path(str(simple_path)).expanduser().resolve() if simple_path else manifest_dir / "llm_scorer_prompt.md" + ) + scenario_resolved = ( + Path(str(scenario_path)).expanduser().resolve() + if scenario_path + else manifest_dir / "llm_scenario_scorer_prompt.md" + ) + ctx = JudgeContext( + client=client, + simple_prompt_path=str(simple_resolved) if simple_resolved.is_file() else None, + scenario_prompt_path=str(scenario_resolved) if scenario_resolved.is_file() else None, + legacy_judge=legacy_judge, + ) + typer.echo( + "Judge enabled: model={m} simple_prompt={s} scenario_prompt={c} legacy={legacy}".format( + m=client.transport.model, + s=ctx.simple_prompt_path or "(missing)", + c=ctx.scenario_prompt_path or "(missing)", + legacy="on" if ctx.legacy_judge is not None else "off", + ) + ) + return ctx def _build_trace_summarizer(cfg: dict) -> Optional[Any]: @@ -345,6 +386,15 @@ def run_command( "Defaults to config.query_parallelism, then 1 (linear session)." ), ), + limit_queries: Optional[int] = typer.Option( + None, + "--limit-queries", + min=1, + help=( + "Cap queries per domain to the first N entries (deterministic). " + "Useful for smoke-tests; omit for the full sweep." + ), + ), ) -> None: """Run the benchmark across the dataset's domains x selected conditions.""" logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") @@ -378,6 +428,10 @@ def run_command( raise typer.Exit(code=2) by_domain = {d: by_domain[d] for d in wanted} + if limit_queries is not None: + by_domain = {d: es[:limit_queries] for d, es in by_domain.items()} + typer.echo(f"--limit-queries={limit_queries}: capping each domain to its first {limit_queries} entries.") + domain_order = sorted(by_domain.keys()) typer.echo(f"Domains in this run: {domain_order} ({sum(len(v) for v in by_domain.values())} entries total)") @@ -403,7 +457,7 @@ def run_command( raise typer.Exit(code=2) testdata_prefixes = tuple(str(p) for p in testdata_prefixes_raw) - judge = _build_judge(cfg) + judge = _build_judge(cfg, manifest_path=Path(str(manifest_path)).expanduser().resolve()) summarizer = _build_trace_summarizer(cfg) base_dir = str(artifacts_root) if artifacts_root else None @@ -474,7 +528,16 @@ def run_command( for r in results: save_trial(r, session_dir) kind = "setup" if r.is_setup else f"entry_id={r.entry_id} query_id={r.query_id}" - judge_str = "" if r.is_setup or r.judge_score is None else f" judge={r.judge_score}" + judge_parts: list[str] = [] + if not r.is_setup: + if r.judge_score is not None: + judge_parts.append(f"judge={r.judge_score}") + elif any(v is not None for v in r.judge_subscores.values()): + n = sum(1 for v in r.judge_subscores.values() if v is not None) + judge_parts.append(f"judge_mode={r.judge_mode}/sub_n={n}") + if r.legacy_judge_score is not None: + judge_parts.append(f"legacy={r.legacy_judge_score}") + judge_str = (" " + " ".join(judge_parts)) if judge_parts else "" cost_str = f"${r.total_cost_usd:.3f}" if r.cost_available else "n/a" trace_str = f" trace={r.compact_trace_path}" if r.compact_trace_path else "" typer.echo( @@ -500,21 +563,34 @@ def run_command( if judge is not None: typer.echo("\nLLM-as-judge scores (mean over query turns, 0-5 scale):") for cond in selected: - scored: list[int] = [] + simple_scored: list[int] = [] + scenario_scored = 0 + legacy_scored: list[int] = [] errored = 0 for domain in domain_order: for r in results_by_key.get((agent, cond, domain), []): if r.is_setup: continue + has_subscores = any(v is not None for v in r.judge_subscores.values()) if r.judge_score is not None: - scored.append(int(r.judge_score)) + simple_scored.append(int(r.judge_score)) + elif has_subscores: + scenario_scored += 1 elif r.judge_error: errored += 1 - if scored: - mean_score = sum(scored) / len(scored) - typer.echo(f" {agent}/{cond}: mean={mean_score:.2f} n={len(scored)} errors={errored}") - else: - typer.echo(f" {agent}/{cond}: no scores errors={errored} (check judge config / litellm install)") + if r.legacy_judge_score is not None: + legacy_scored.append(int(r.legacy_judge_score)) + parts: list[str] = [] + if simple_scored: + parts.append(f"simple mean={sum(simple_scored) / len(simple_scored):.2f} n={len(simple_scored)}") + if scenario_scored: + parts.append(f"scenario_scored={scenario_scored}") + if legacy_scored: + parts.append(f"legacy mean={sum(legacy_scored) / len(legacy_scored):.2f} n={len(legacy_scored)}") + parts.append(f"errors={errored}") + if not simple_scored and not scenario_scored and not legacy_scored: + parts.append("(check judge config / litellm install)") + typer.echo(f" {agent}/{cond}: " + " ".join(parts)) json_path, md_path = write_summary( session_dir=session_dir, @@ -537,8 +613,9 @@ def _needs_rescore(trial: dict[str, Any]) -> bool: judge_error = trial.get("judge_error") or "" if judge_error in UNSCORABLE_JUDGE_ERRORS: return False - score = trial.get("judge_score") - if score is None: + sub_scores = trial.get("judge_subscores") or {} + scored = trial.get("judge_score") is not None or any(v is not None for v in sub_scores.values()) + if not scored: return True if judge_error: return True @@ -621,7 +698,7 @@ def rescore_command( entries = load_eval_manifest(Path(str(manifest_path)).expanduser().resolve()) entries_by_id = {e.entry_id: e for e in entries} - judge = _build_judge(cfg) + judge = _build_judge(cfg, manifest_path=Path(str(manifest_path)).expanduser().resolve()) if judge is None: typer.echo("Error: judge is not configured (see messages above). Cannot rescore.", err=True) raise typer.Exit(code=2) @@ -659,15 +736,24 @@ def rescore_command( result.judge_score = None result.judge_reasoning = "" result.judge_error = "" + result.judge_mode = "" + result.judge_subscores = {} + result.judge_flags = {} + result.judge_lists = {} _apply_judge(judge, entry, result) raw.update(asdict(result)) path.write_text(json.dumps(raw, indent=2) + "\n", encoding="utf-8") - if result.judge_score is not None: + scored_ok = result.judge_score is not None or any(v is not None for v in result.judge_subscores.values()) + if scored_ok: rescored += 1 - typer.echo(f" {path.name}: entry_id={result.entry_id} judge={result.judge_score}") + if result.judge_mode == "simple" and result.judge_score is not None: + typer.echo(f" {path.name}: entry_id={result.entry_id} judge={result.judge_score}") + else: + n_sub = sum(1 for v in result.judge_subscores.values() if v is not None) + typer.echo(f" {path.name}: entry_id={result.entry_id} mode={result.judge_mode} sub_scores={n_sub}") elif result.judge_error in UNSCORABLE_JUDGE_ERRORS: unscorable += 1 typer.echo(f" {path.name}: entry_id={result.entry_id} unscorable ({result.judge_error})") diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml index 6b2a164e29..429e3c87f1 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml +++ b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml @@ -107,6 +107,20 @@ judge: api_key_env: NVIDIA_API_KEY temperature: 0.1 max_tokens: 4096 + # On-disk path to the simple QA judge prompt (batch_1's llm_scorer_prompt.md). + # When null, the runner looks for ``llm_scorer_prompt.md`` next to the manifest. + # Required only if the manifest contains entries WITHOUT a `scoring_mode` field. + simple_prompt_path: null + # On-disk path to the scenario-aware judge prompt (batch_2's llm_scenario_scorer_prompt.md). + # When null, the runner looks for ``llm_scenario_scorer_prompt.md`` next to the manifest. + # Required only if the manifest contains entries WITH a `scoring_mode` field. + scenario_prompt_path: null + # When true (default), ALSO run the legacy hardcoded LLMJudge on every + # scoreable trial (one with a non-empty ground_truth_answer + non-empty + # final_answer). Produces an additional 1-5 score directly comparable to + # runs that pre-date the scenario-aware judge. Set to false to skip the + # second judge call and halve judge LLM cost. + legacy_enabled: true # --------------------------------------------------------------------------- # Tool-use summarizer diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py index e3c2e89963..3d1dd2526d 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/dataset.py @@ -40,6 +40,13 @@ class DatasetEntry(BaseModel): ground_truth_answer: str = "" domain: str = "" domain_label: str = "" + scoring_mode: str = "" + category: str = "" + phase: str = "" + expected_action: str = "" + expected_output_shape: str = "" + validation_signal: str = "" + raw_answers: list[str] = [] def _select_prompt(candidates: list[dict[str, Any]], selected_variant: int | None) -> str: @@ -121,6 +128,9 @@ def load_eval_manifest(path: Path) -> list[DatasetEntry]: continue pages.append(GroundTruthPage(doc_id=str(doc_id), page_number=int(page), score=int(p.get("score") or 1))) + raw_answers = item.get("raw_answers") or [] + if not isinstance(raw_answers, list): + raw_answers = [] entries.append( DatasetEntry( entry_id=idx, @@ -132,6 +142,13 @@ def load_eval_manifest(path: Path) -> list[DatasetEntry]: ground_truth_answer=str(item.get("answer") or ""), domain=domain, domain_label=domain_label, + scoring_mode=str(item.get("scoring_mode") or ""), + category=str(item.get("category") or ""), + phase=str(item.get("phase") or ""), + expected_action=str(item.get("expected_action") or ""), + expected_output_shape=str(item.get("expected_output_shape") or ""), + validation_signal=str(item.get("validation_signal") or ""), + raw_answers=[str(x) for x in raw_answers], ) ) return entries diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/judging.py b/nemo_retriever/src/nemo_retriever/skill_eval/judging.py new file mode 100644 index 0000000000..179c65c7de --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/skill_eval/judging.py @@ -0,0 +1,271 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Scenario-aware LLM-as-judge for skill_eval. + +Routes each trial through one of two prompts based on whether the +dataset entry carries a ``scoring_mode``: + +- ``simple`` -> batch_1 ``llm_scorer_prompt.md`` style (answer-grading). +- ``scenario`` -> batch_2 ``llm_scenario_scorer_prompt.md`` style (handles + ingest_only / extract_only / refusal / capability_gap / dispatcher_prompt / + skip in addition to answerable_retrieval / ingest_plus_answer). + +Both prompts live on disk in the SDG run directory that produced the +manifest, so this module never vendors prompt text. See +``cli._build_judge`` for how paths are resolved. +""" + +from __future__ import annotations + +import functools +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, Optional, Protocol + +if TYPE_CHECKING: + from nemo_retriever.skill_eval.dataset import DatasetEntry + from nemo_retriever.skill_eval.runner import TrialResult + + +JudgeMode = Literal["simple", "scenario"] + + +SIMPLE_SUB_SCORES: tuple[str, ...] = ("answer_correctness", "citation_quality", "faithfulness") +SCENARIO_SUB_SCORES: tuple[str, ...] = ( + "action_correctness", + "routing_correctness", + "answer_correctness", + "citation_quality", + "faithfulness", + "refusal_correctness", +) +SIMPLE_FLAGS: tuple[str, ...] = ("is_answer_correct", "is_citation_correct") +SCENARIO_FLAGS: tuple[str, ...] = ("is_action_correct", "is_answer_correct", "is_citation_correct") +LIST_KEYS: tuple[str, ...] = ("major_errors", "missing_facts", "unsupported_claims") + + +@dataclass +class ScenarioJudgeResult: + """Structured judge output, shared between simple and scenario prompts.""" + + mode: JudgeMode + sub_scores: dict[str, Optional[int]] = field(default_factory=dict) + flags: dict[str, Optional[bool]] = field(default_factory=dict) + lists: dict[str, list[str]] = field(default_factory=dict) + rationale: str = "" + error: Optional[str] = None + + +@functools.lru_cache(maxsize=8) +def _read_prompt(path: str) -> str: + return Path(path).read_text(encoding="utf-8") + + +def _select_mode(entry: "DatasetEntry") -> JudgeMode: + """Pick the prompt mode for ``entry`` based on its ``scoring_mode`` field.""" + return "scenario" if entry.scoring_mode else "simple" + + +def _render_pages(entry: "DatasetEntry") -> str: + parts = [f"{p.doc_id}:{p.page_number}" for p in entry.ground_truth_pages] + return ", ".join(parts) + + +def _render_cited(result: "TrialResult") -> str: + parts = [f"{item.get('doc_id')}:{item.get('page_number')}" for item in result.ranked_retrieved] + return ", ".join(parts) + + +def _render_raw_answers(entry: "DatasetEntry") -> str: + return "; ".join(entry.raw_answers) + + +# Cap ``agent_answer`` at ~20K chars before stuffing into the judge prompt. +# A handful of trials (notably ``extract_only`` scenarios where the agent +# dumps the whole corpus into its answer) otherwise blow past the judge +# model's context window. 20K chars ≈ 5-6K tokens leaves plenty of headroom +# for the system instructions + prompt template + reference content. +JUDGE_ANSWER_CHAR_CAP = 20000 +_TRUNCATION_MARKER = f"\n\n... [truncated for judge: original answer exceeded {JUDGE_ANSWER_CHAR_CAP} chars]" + + +def truncate_for_judge(text: str, cap: int = JUDGE_ANSWER_CHAR_CAP) -> str: + """Cap ``text`` at ``cap`` chars with a visible marker. + + Both the new judge (via ``_format_prompt``) and the legacy ``LLMJudge`` + pass through this helper so the cap is applied identically across both + scoring paths. The marker makes the truncation visible to the judge so + its rationale can flag the gap rather than silently mis-scoring. + """ + if len(text) <= cap: + return text + head_chars = max(0, cap - len(_TRUNCATION_MARKER)) + return text[:head_chars] + _TRUNCATION_MARKER + + +def _format_prompt(template: str, entry: "DatasetEntry", result: "TrialResult") -> str: + """Substitute the prompt slots for ``entry`` / ``result`` into ``template``. + + Uses literal-token replacement for each known slot, NOT ``str.format_map``. + The SDG-shipped judge prompts contain literal JSON-schema blocks like + ``{ "action_correctness": <0-5>, ... }`` which a Python format-string + parser would misinterpret as substitution slots with invalid format + specifiers. With ``str.replace`` those literals pass through verbatim and + only our enumerated slot tokens are substituted. + + Slots that the template doesn't reference are simply ignored. Slots the + template does reference but that we don't know about are left as literal + ``{slot_name}`` in the output — the judge LLM is robust to that, and + leaving them visible aids debugging compared to silently rendering empty. + """ + substitutions: list[tuple[str, str]] = [ + ("{query}", entry.original_query), + ("{reference_answer}", entry.ground_truth_answer), + ("{raw_answers}", _render_raw_answers(entry)), + ("{relevant_pages}", _render_pages(entry)), + ("{agent_answer}", truncate_for_judge(result.final_answer)), + ("{agent_cited_pages}", _render_cited(result)), + ("{cited_evidence}", ""), + ("{scenario_prompt}", entry.paraphrased_prompt), + ("{category}", entry.category), + ("{phase}", entry.phase), + ("{scoring_mode}", entry.scoring_mode), + ("{expected_action}", entry.expected_action), + ("{expected_output_shape}", entry.expected_output_shape), + ("{validation_signal}", entry.validation_signal), + ] + out = template + for token, value in substitutions: + out = out.replace(token, value) + return out + + +class _ChatClient(Protocol): + """Minimal slice of LiteLLMClient that this module needs.""" + + def complete(self, messages: list[dict], max_tokens: Optional[int] = None) -> tuple[str, float]: ... + + +_SYSTEM_INSTRUCTION = "Respond with ONLY valid JSON matching the schema in the user message. No prose, no markdown." + + +def _strip_envelope(raw: str) -> str: + text = raw.strip() + text = re.sub(r".*?", "", text, flags=re.DOTALL).strip() + text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE) + text = re.sub(r"\s*```$", "", text, flags=re.MULTILINE) + return text.strip() + + +def _validate_subscore(value: Any) -> Optional[int]: + if value is None: + return None + if not isinstance(value, int) or isinstance(value, bool): + raise TypeError(f"sub-score must be an integer, got {type(value).__name__}: {value!r}") + score = value + if not 0 <= score <= 5: + raise ValueError(f"sub-score {score} out of range 0-5") + return score + + +def _validate_flag(value: Any) -> Optional[bool]: + if value is None: + return None + return bool(value) + + +def _validate_list(value: Any) -> list[str]: + if value is None: + return [] + if not isinstance(value, list): + return [] + return [str(x) for x in value] + + +def _parse_response(raw: str, mode: JudgeMode) -> ScenarioJudgeResult: + """Parse a judge JSON response into a ``ScenarioJudgeResult``. + + Tolerates ```` blocks and code fences. Returns a result whose + ``error`` is set when JSON parsing or sub-score validation fails; + sub-scores then all default to ``None``. + """ + sub_keys = SIMPLE_SUB_SCORES if mode == "simple" else SCENARIO_SUB_SCORES + flag_keys = SIMPLE_FLAGS if mode == "simple" else SCENARIO_FLAGS + empty = ScenarioJudgeResult( + mode=mode, + sub_scores={k: None for k in sub_keys}, + flags={k: None for k in flag_keys}, + lists={k: [] for k in LIST_KEYS}, + rationale="", + ) + + try: + data = json.loads(_strip_envelope(raw)) + except json.JSONDecodeError as exc: + empty.error = f"parse_failure: {exc}: {raw[:200]!r}" + return empty + if not isinstance(data, dict): + empty.error = f"parse_failure: top-level must be a JSON object, got {type(data).__name__}" + return empty + + try: + sub_scores = {k: _validate_subscore(data.get(k)) for k in sub_keys} + except (TypeError, ValueError) as exc: + empty.error = f"parse_failure: {exc}" + return empty + + return ScenarioJudgeResult( + mode=mode, + sub_scores=sub_scores, + flags={k: _validate_flag(data.get(k)) for k in flag_keys}, + lists={k: _validate_list(data.get(k)) for k in LIST_KEYS}, + rationale=str(data.get("brief_rationale") or ""), + ) + + +def evaluate_entry( + *, + client: _ChatClient, + entry: "DatasetEntry", + result: "TrialResult", + simple_prompt_path: Optional[str], + scenario_prompt_path: Optional[str], +) -> ScenarioJudgeResult: + """Score ``result`` against ``entry`` using the prompt selected by ``_select_mode``. + + ``simple_prompt_path`` / ``scenario_prompt_path`` are the on-disk + locations of the SDG-shipped judge prompts. Only the path for the + selected mode is required; the other may be ``None``. + """ + mode = _select_mode(entry) + chosen_path = scenario_prompt_path if mode == "scenario" else simple_prompt_path + if not chosen_path: + raise ValueError( + f"judge mode={mode!r} requires {('scenario_prompt_path' if mode == 'scenario' else 'simple_prompt_path')}; " + "set judge._prompt_path in the config or co-locate the prompt file with the manifest." + ) + + template = _read_prompt(chosen_path) + user = _format_prompt(template, entry, result) + messages = [ + {"role": "system", "content": _SYSTEM_INSTRUCTION}, + {"role": "user", "content": user}, + ] + try: + raw, _ = client.complete(messages) + except Exception as exc: + sub_keys = SIMPLE_SUB_SCORES if mode == "simple" else SCENARIO_SUB_SCORES + flag_keys = SIMPLE_FLAGS if mode == "simple" else SCENARIO_FLAGS + return ScenarioJudgeResult( + mode=mode, + sub_scores={k: None for k in sub_keys}, + flags={k: None for k in flag_keys}, + lists={k: [] for k in LIST_KEYS}, + rationale="", + error=f"judge_api_error: {exc}", + ) + return _parse_response(raw, mode=mode) diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/report.py b/nemo_retriever/src/nemo_retriever/skill_eval/report.py index 7058d5148b..9a85eabea9 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/report.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/report.py @@ -110,6 +110,29 @@ def _aggregate( metrics["judge_score_mean"] = sum(judge_scores) / len(judge_scores) metrics["judge_score_n"] = len(judge_scores) + legacy_scores = [r.legacy_judge_score for r in query_results if r.legacy_judge_score is not None] + if legacy_scores: + metrics["legacy_judge_score_mean"] = sum(legacy_scores) / len(legacy_scores) + metrics["legacy_judge_score_n"] = len(legacy_scores) + + sub_means: dict[str, tuple[float, int]] = {} + sub_score_keys: set[str] = set() + for r in query_results: + sub_score_keys.update(r.judge_subscores.keys()) + for key in sorted(sub_score_keys): + values = [v for r in query_results if (v := r.judge_subscores.get(key)) is not None] + if values: + sub_means[key] = (sum(values) / len(values), len(values)) + if sub_means: + metrics["judge_sub_means"] = sub_means + + mode_counts: dict[str, int] = defaultdict(int) + for r in query_results: + if r.judge_mode: + mode_counts[r.judge_mode] += 1 + if mode_counts: + metrics["judge_mode_counts"] = dict(mode_counts) + tool_use_summary = next((r.tool_use_summary for r in setup_results if r.tool_use_summary), "") return { @@ -152,8 +175,13 @@ def _md_cell(value: Any) -> str: def _md_row(row: dict[str, Any]) -> str: m = row.get("metrics", {}) judge_cell = f"{m['judge_score_mean']:.2f} (n={m.get('judge_score_n', 0)})" if "judge_score_mean" in m else "-" + legacy_cell = ( + f"{m['legacy_judge_score_mean']:.2f} (n={m.get('legacy_judge_score_n', 0)})" + if "legacy_judge_score_mean" in m + else "-" + ) return ( - "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} " + "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} | {legacy} " "| {ipt:.0f} | {opt:.0f} | {cr:.0f} | {cc:.0f} | {cost} |" ).format( run=row.get("run_name", "?"), @@ -163,6 +191,7 @@ def _md_row(row: dict[str, Any]) -> str: r5=m.get("recall_5", 0.0), r10=m.get("recall_10", 0.0), judge=judge_cell, + legacy=legacy_cell, ipt=m.get("input_tokens", 0.0), opt=m.get("output_tokens", 0.0), cr=m.get("cache_read_input_tokens", 0.0), @@ -172,10 +201,56 @@ def _md_row(row: dict[str, Any]) -> str: _MAIN_TABLE_HEADER = ( - "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | q_input | q_output " + "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | legacy | q_input | q_output " "| q_cache_read | q_cache_create | q_cost |" ) -_MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|" +_MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|---|" + + +def _md_subscore_section(overall_rows: list[dict[str, Any]]) -> list[str]: + """Emit per-sub-score mean tables; one column per sub-score key seen.""" + keys: list[str] = [] + seen: set[str] = set() + for row in overall_rows: + sub_means = row.get("metrics", {}).get("judge_sub_means") or {} + for k in sub_means: + if k not in seen: + seen.add(k) + keys.append(k) + if not keys: + return [] + header = "| run | " + " | ".join(keys) + " |" + divider = "|---|" + "|".join(["---"] * len(keys)) + "|" + lines = ["", "## Judge sub-scores (mean over trials that produced each sub-score, with N)", "", header, divider] + for row in overall_rows: + sub_means = row.get("metrics", {}).get("judge_sub_means") or {} + cells = [] + for k in keys: + entry = sub_means.get(k) + cells.append("-" if entry is None else f"{entry[0]:.2f} (n={entry[1]})") + lines.append(f"| {row.get('run_name', '?')} | " + " | ".join(cells) + " |") + return lines + + +def _md_mode_breakdown(overall_rows: list[dict[str, Any]]) -> list[str]: + """Emit a one-row-per-run table showing trial counts per judge mode.""" + modes: list[str] = [] + seen: set[str] = set() + for row in overall_rows: + for k in row.get("metrics", {}).get("judge_mode_counts") or {}: + if k not in seen: + seen.add(k) + modes.append(k) + if not modes: + return [] + header = "| run | " + " | ".join(modes) + " |" + divider = "|---|" + "|".join(["---"] * len(modes)) + "|" + lines = ["", "## Judge mode breakdown (trials per mode)", "", header, divider] + for row in overall_rows: + counts = row.get("metrics", {}).get("judge_mode_counts") or {} + cells = [str(counts.get(m, 0)) for m in modes] + lines.append(f"| {row.get('run_name', '?')} | " + " | ".join(cells) + " |") + return lines def write_summary_md( @@ -273,6 +348,9 @@ def write_summary_md( ) ) + lines.extend(_md_subscore_section(overall_rows)) + lines.extend(_md_mode_breakdown(overall_rows)) + diag_lines = [] for row in overall_rows: m = row.get("metrics", {}) diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py index 3008dd4b7f..1e6d71ee7b 100644 --- a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py +++ b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py @@ -72,6 +72,13 @@ class TrialResult: judge_score: int | None = None judge_reasoning: str = "" judge_error: str = "" + judge_mode: str = "" + judge_subscores: dict[str, int | None] = field(default_factory=dict) + judge_flags: dict[str, bool | None] = field(default_factory=dict) + judge_lists: dict[str, list[str]] = field(default_factory=dict) + legacy_judge_score: int | None = None + legacy_judge_reasoning: str = "" + legacy_judge_error: str = "" tool_use_summary: str = "" cost_available: bool = True execution_mode: str = "linear_session" @@ -90,6 +97,26 @@ class ConditionRun: execution_mode: str = "linear_session" +ANSWERABLE_SCORING_MODES: frozenset[str] = frozenset({"answerable_retrieval", "ingest_plus_answer"}) +ANSWER_REQUIRED_SCORING_MODES: frozenset[str] = frozenset( + {"answerable_retrieval", "ingest_plus_answer", "refusal", "capability_gap", "dispatcher_prompt"} +) + + +@dataclass +class JudgeContext: + """Bag of state passed from CLI into ``_apply_judge``. + + Holds the LiteLLM transport plus the two on-disk prompt paths. ``None`` + is a valid path when no entries in the run need that mode. + """ + + client: Any + simple_prompt_path: str | None + scenario_prompt_path: str | None + legacy_judge: Any = None + + def _remap_pdf_paths(text: str, prefixes: tuple[str, ...]) -> str: """Rewrite caller-supplied path prefixes in *text* to ``./pdfs/``. @@ -1153,38 +1180,89 @@ def _run_one_turn( return result -UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate"}) +UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate", "scoring_mode_skip"}) -def _apply_judge(judge: Any, entry: DatasetEntry, result: TrialResult) -> None: - """Score ``result.final_answer`` against ``entry.ground_truth_answer``. +def _apply_judge(ctx: Any, entry: DatasetEntry, result: TrialResult) -> None: + """Score ``result.final_answer`` against ``entry`` via the dispatched judge. - Missing ground truth and empty candidates are recorded as terminal - ``judge_error`` values so ``rescore`` can skip intrinsically unscorable - trials instead of retrying them forever. + Behaviour summary: + - ``ctx is None`` -> no-op (judge disabled). + - ``scoring_mode == "skip"`` -> terminal ``judge_error="scoring_mode_skip"``. + - Simple-mode entry with no + ground truth -> terminal ``judge_error="no_ground_truth"``. + - Empty ``final_answer`` for a + mode that requires an answer -> terminal ``judge_error="empty_candidate"``. + - Otherwise: call + ``judging.evaluate_entry`` and stamp sub-scores onto ``result``. """ - if judge is None: + if ctx is None: return - if not entry.ground_truth_answer: + if entry.scoring_mode == "skip": + result.judge_error = "scoring_mode_skip" + return + + needs_answer_mode = entry.scoring_mode in ANSWER_REQUIRED_SCORING_MODES or entry.scoring_mode == "" + answerable_mode = entry.scoring_mode in ANSWERABLE_SCORING_MODES or entry.scoring_mode == "" + if answerable_mode and not entry.ground_truth_answer: result.judge_error = "no_ground_truth" return - if not result.final_answer: + if needs_answer_mode and not result.final_answer: result.judge_error = "empty_candidate" return + + from nemo_retriever.skill_eval.judging import evaluate_entry + try: - verdict = judge.judge( - query=entry.original_query, - reference=entry.ground_truth_answer, - candidate=result.final_answer, + verdict = evaluate_entry( + client=ctx.client, + entry=entry, + result=result, + simple_prompt_path=ctx.simple_prompt_path, + scenario_prompt_path=ctx.scenario_prompt_path, ) except Exception as exc: result.judge_error = f"judge_invocation_error: {exc}" - logger.warning("LLMJudge raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True) - return - result.judge_score = verdict.score - result.judge_reasoning = verdict.reasoning or "" - if verdict.error: - result.judge_error = verdict.error + logger.warning("evaluate_entry raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True) + verdict = None + + if verdict is not None: + result.judge_mode = verdict.mode + result.judge_subscores = dict(verdict.sub_scores) + result.judge_flags = dict(verdict.flags) + result.judge_lists = dict(verdict.lists) + result.judge_reasoning = verdict.rationale or "" + if verdict.mode == "simple": + result.judge_score = verdict.sub_scores.get("answer_correctness") + if verdict.error: + result.judge_error = verdict.error + + # Also run the legacy LLMJudge for cross-run comparability when possible. + # This is independent of the new judge: a failure in one does not block the + # other, since they use different prompts and may be sensitive to different + # inputs (e.g. context-window limits). + if ctx.legacy_judge is not None and entry.ground_truth_answer and result.final_answer: + from nemo_retriever.skill_eval.judging import truncate_for_judge + + try: + legacy_verdict = ctx.legacy_judge.judge( + query=entry.original_query, + reference=entry.ground_truth_answer, + candidate=truncate_for_judge(result.final_answer), + ) + except Exception as exc: + result.legacy_judge_error = f"legacy_judge_invocation_error: {exc}" + logger.warning( + "legacy LLMJudge raised for entry_id=%s: %s", + result.entry_id, + exc, + exc_info=True, + ) + else: + result.legacy_judge_score = legacy_verdict.score + result.legacy_judge_reasoning = legacy_verdict.reasoning or "" + if legacy_verdict.error: + result.legacy_judge_error = legacy_verdict.error def run_condition(