diff --git a/src/synth_panel/cli/commands.py b/src/synth_panel/cli/commands.py index d2a260b..4d40700 100644 --- a/src/synth_panel/cli/commands.py +++ b/src/synth_panel/cli/commands.py @@ -98,6 +98,7 @@ persona_system_prompt_from_template, ) from synth_panel.question_budget import QuestionFailureBudget +from synth_panel.response_coercion import is_typed_schema from synth_panel.runtime import AgentRuntime from synth_panel.synthesis import ( STRATEGY_MAP_REDUCE, @@ -874,6 +875,133 @@ def _estimate_output_tokens_per_response(question: Any) -> int: return _DRY_RUN_OUTPUT_TOKENS_TEXT_DEFAULT +def _question_meta_for_save(questions: list[Any]) -> list[dict[str, Any]]: + """Build the per-question metadata persisted alongside a saved result. + + sy-547: ``poll-summary`` / ``analyze`` read the saved result's + ``questions`` field to recover each question's ``response_schema`` and + bucket it as enum/scale. Previously the standard run path never passed + ``questions=`` to :func:`save_panel_result`, so the schema was lost and + every question degraded to ``kind=text``. Emit a minimal + ``{text, response_schema?}`` entry per authored question, preserving the + declared schema verbatim so the downstream ``_detect_kind`` is schema- + driven rather than guessing from response shape. + """ + meta: list[dict[str, Any]] = [] + for q in questions: + if not isinstance(q, dict): + meta.append({"text": str(q)}) + continue + entry: dict[str, Any] = {"text": build_question_prompt(q)} + rs = q.get("response_schema") + if rs is not None: + entry["response_schema"] = rs + meta.append(entry) + return meta + + +def _blend_dropped_models(ensemble: Any) -> list[str]: + """Return blend members that produced zero usable responses (sy-546). + + A model is "dropped" when, across all its panelists, every primary + (non-follow-up) response is an error or empty — i.e. the model + contributed nothing to the blended distributions. 
A bad slug that + 404s on every call lands here, as does a model that died mid-run. + Order follows ``ensemble.models``. + """ + dropped: list[str] = [] + for mr in ensemble.model_results: + usable = 0 + for pr in mr.panelist_results: + for resp in pr.responses: + if not isinstance(resp, dict) or resp.get("follow_up"): + continue + if resp.get("error"): + continue + val = resp.get("response") + if val is None: + continue + if isinstance(val, str) and (not val.strip() or val.strip().startswith("[error:")): + continue + usable += 1 + if usable == 0: + dropped.append(mr.model) + return dropped + + +def _run_model_preflight( + models: list[str], + *, + args: Any, + client: Any | None = None, +) -> int | None: + """Probe *models* for reachability; return an exit code to abort or None. + + sy-546. Aborts (returns 1) when the reachable-model count violates the + configured guard: + + * ``--min-models N`` set → abort if fewer than N models are reachable. + * Otherwise (default, or ``--require-all-models``) → abort if ANY model + is unreachable. + + Unreachable models are always reported. Inconclusive probes (transient / + auth / missing-credential failures) never count against the guard — they + are not a property of the slug. 
+ """ + from synth_panel.preflight import preflight_models + + print( + f"Pre-flight: probing {len(set(models))} model(s) for reachability...", + file=sys.stderr, + ) + report = preflight_models(models, client=client) + + reachable = [p.model for p in report.probes if p.status == "reachable"] + inconclusive = [p for p in report.probes if p.status == "inconclusive"] + for p in inconclusive: + print( + f"Pre-flight: could not verify {p.model} (inconclusive: {p.detail or 'unknown error'}); " + "proceeding — it will surface at call time if truly broken.", + file=sys.stderr, + ) + + min_models = getattr(args, "min_models", None) + require_all = getattr(args, "require_all_models", False) + + if min_models is not None and not require_all: + # Relaxed guard: only abort if too few models are reachable. Treat + # inconclusive as potentially-usable so a flaky probe doesn't trip + # the floor. + usable = len(reachable) + len(inconclusive) + if usable < min_models: + if report.unreachable: + print(report.failure_message(), file=sys.stderr) + print( + f"Pre-flight failed: only {usable} model(s) reachable, --min-models requires {min_models}.", + file=sys.stderr, + ) + return 1 + if report.unreachable: + bad = ", ".join(p.model for p in report.unreachable) + print( + f"Pre-flight: WARNING — {len(report.unreachable)} model(s) unreachable ({bad}); " + f"proceeding with {len(reachable)} reachable model(s) per --min-models={min_models}.", + file=sys.stderr, + ) + return None + + # Default / --require-all-models: any unreachable slug is fatal. + if report.unreachable: + print(report.failure_message(), file=sys.stderr) + return 1 + + print( + f"Pre-flight: OK — all {len(reachable)} model(s) reachable.", + file=sys.stderr, + ) + return None + + def _iter_instrument_attachments(instrument: Instrument): """Yield every attachment dict the instrument would send to the model. 
@@ -943,6 +1071,11 @@ def _emit_dry_run_preview( question_count = len(questions) llm_calls = persona_count * question_count + # sy-547 (d): count questions with an enforceable typed response_schema + # (enum/scale) so the preview can be explicit that these are coerced + # post-hoc, not constrained at generation. + typed_schema_count = sum(1 for q in questions if isinstance(q, dict) and is_typed_schema(q.get("response_schema"))) + system_prompt_chars = sum(len(system_prompt_fn(p)) for p in personas) question_chars = sum(len(build_question_prompt(q)) for q in questions) follow_up_chars = 0 @@ -1037,7 +1170,20 @@ def _emit_dry_run_preview( if vision_warning: print(f"Validation: WARNING — {vision_warning}", file=sys.stderr) else: - print("Validation: OK", file=sys.stderr) + print("Validation: OK (instrument spec is structurally valid)", file=sys.stderr) + # sy-547 (d): be explicit that a typed response_schema is NOT + # enforced at generation. The real run coerces free-text answers to + # the nearest enum option / in-range integer post-hoc and flags any + # that can't be mapped, but "Validation: OK" must not read as + # "output is guaranteed to be one of the options". + if typed_schema_count: + print( + f"Note: {typed_schema_count} question(s) declare an enum/scale response_schema. " + "These are NOT constrained at generation — the run coerces each free-text answer " + "to the nearest option / in-range integer post-hoc and flags unmappable answers. " + "Dry-run does not perform that coercion.", + file=sys.stderr, + ) return preview: dict[str, Any] = { @@ -1051,6 +1197,11 @@ def _emit_dry_run_preview( "estimated_cost_usd": round(cost.total_cost, 6), "cost_is_estimated": pricing_is_estimated, "validation": "warning" if vision_warning else "ok", + # sy-547 (d): advertise that typed response_schemas are coerced + # post-hoc, not enforced at generation, so JSON consumers don't + # treat "validation: ok" as "output guaranteed constrained". 
+ "typed_schema_question_count": typed_schema_count, + "typed_schema_enforced": False, "rounds": [ { "name": r.name, @@ -1533,6 +1684,25 @@ def system_prompt_fn(persona: dict) -> str: if persona_models: print(format_assignment_breakdown(persona_models), file=sys.stderr) + # ── sy-546: model reachability pre-flight ──────────────────────────── + # For multi-model runs (--models weighted / ensemble / --blend), probe + # every distinct slug with a 1-token call before spending. A bogus slug + # (e.g. a 404'ing OpenRouter id) deterministically fails every call and + # silently shrinks an ensemble/blend; catch it here and fail fast naming + # the bad slug(s). Runs on BOTH --dry-run and real runs so a dry-run's + # "OK" actually means the spec is runnable. Bypassable with + # --skip-preflight. Transient/auth/credential failures are inconclusive + # and never block the run. + preflight_models_list: list[str] = [] + if model_spec is not None and len(model_spec) > 1: + preflight_models_list = [m for m, _w in model_spec] + elif persona_models: + preflight_models_list = sorted(set(persona_models.values())) + if len(set(preflight_models_list)) > 1 and not getattr(args, "skip_preflight", False): + rc = _run_model_preflight(preflight_models_list, args=args, client=None) + if rc is not None: + return rc + # ── sp-x8g: --dry-run preview ──────────────────────────────────────── # Short-circuit before any LLM-invoking code (variant expansion, # ensemble, blend, orchestrator). 
Shows the user what each question @@ -1940,6 +2110,7 @@ def system_prompt_fn(persona: dict) -> str: # Run all panelists in parallel via the orchestrator blend_result = None # populated only when --blend is active + blend_drop_warning: str | None = None # sy-546: set when a blend member dropped # ── sp-hsk3: checkpoint + resume wiring ──────────────────────────── # We snapshot progress every K completed panelists so a crashed or @@ -2098,6 +2269,24 @@ def system_prompt_fn(persona: dict) -> str: blend_weights = {m: w for m, w in model_spec} blend_result = blend_distributions(ensemble, weights=blend_weights, questions=questions) + # sy-546: detect blend members that contributed ZERO usable responses + # (e.g. a slug that 404'd on every call slipped past pre-flight, or a + # member died mid-run). The blend silently degrades to the survivors; + # emit a loud, top-level warning stating the new N so the operator + # isn't fooled into treating it as a full-strength blend. + dropped_models = _blend_dropped_models(ensemble) + if dropped_models: + surviving = [m for m in ensemble_models if m not in dropped_models] + blend_drop_warning = ( + f"BLEND DEGRADED: {len(dropped_models)} of {len(ensemble_models)} model(s) " + f"produced no usable responses ({', '.join(dropped_models)}). " + f"The blend dropped to {len(surviving)} model(s): {', '.join(surviving) or 'none'}. " + "Distributions and synthesis reflect only the surviving model(s). " + "Re-run with corrected --models (or --require-all-models to abort instead)." 
+ ) + logger.warning(blend_drop_warning) + print(f"\nWarning: {blend_drop_warning}\n", file=sys.stderr) + # Flatten all panelist results across models for output + synthesis panelist_results = [pr for mr in ensemble.model_results for pr in mr.panelist_results] else: @@ -2582,6 +2771,12 @@ def _on_complete(pr: PanelistResult) -> None: persona_count=len(personas), question_count=len(questions), instrument_name=inst_name, + # sy-547: persist the authored question defs (text + + # response_schema) so poll-summary / analyze recognize enum/scale + # questions instead of falling back to kind=text. Without this the + # saved result carried only the question text echoed on each + # response, dropping the schema kind. + questions=_question_meta_for_save(questions), models=all_models, synthesis=synthesis_dict, metadata=metadata, @@ -2852,6 +3047,14 @@ def _cli_panelist_formatter(pr: PanelistResult, panel_model: str) -> dict[str, A warnings_list = extra.setdefault("warnings", []) if isinstance(warnings_list, list): warnings_list.extend(assignment_warnings) + # sy-546: carry the blend-degraded warning into the JSON envelope so + # CI / MCP consumers detect the dropped member(s) without scraping + # stderr, and expose the count explicitly. + if blend_drop_warning is not None: + warnings_list = extra.setdefault("warnings", []) + if isinstance(warnings_list, list): + warnings_list.append(blend_drop_warning) + extra["blend_degraded"] = True # sp-g270: surface --personas-merge name-collision drops so JSON # consumers can assert panel size matches expectations. Always # present (as []) when --personas-merge was used so downstream diff --git a/src/synth_panel/cli/parser.py b/src/synth_panel/cli/parser.py index 83def24..f731139 100644 --- a/src/synth_panel/cli/parser.py +++ b/src/synth_panel/cli/parser.py @@ -391,6 +391,42 @@ def build_parser() -> argparse.ArgumentParser: "can be passed to 'synthpanel analyze '." 
), ) + # sy-546: model reachability pre-flight guards for multi-model runs. + panel_run_parser.add_argument( + "--skip-preflight", + action="store_true", + default=False, + help=( + "Skip the model reachability pre-flight. By default, runs with " + "multiple models (--models, ensemble, or --blend) probe each slug " + "with a 1-token call before spending and abort if any slug is " + "unreachable (e.g. a bad OpenRouter model id that 404s)." + ), + ) + panel_run_parser.add_argument( + "--require-all-models", + action="store_true", + default=False, + help=( + "Abort the run if ANY model in --models is unreachable in " + "pre-flight, even when the others would still satisfy --min-models. " + "This is the default for multi-model runs; the flag is accepted " + "for explicitness and to override a relaxed --min-models." + ), + ) + panel_run_parser.add_argument( + "--min-models", + type=int, + default=None, + metavar="N", + help=( + "Minimum number of reachable models required to proceed with a " + "multi-model run. If pre-flight finds fewer than N reachable " + "models, the run aborts. Without this flag (and without " + "--require-all-models), the run still aborts on ANY unreachable " + "slug — set --min-models to deliberately allow a degraded run." 
+ ), + ) panel_run_parser.add_argument( "--dry-run", action="store_true", diff --git a/src/synth_panel/orchestrator.py b/src/synth_panel/orchestrator.py index 6e6752c..ab742c0 100644 --- a/src/synth_panel/orchestrator.py +++ b/src/synth_panel/orchestrator.py @@ -32,6 +32,7 @@ from synth_panel.persistence import Session from synth_panel.prompts import build_question_blocks from synth_panel.question_budget import QuestionFailureBudget +from synth_panel.response_coercion import coerce_response, is_typed_schema from synth_panel.routing import route_round from synth_panel.runtime import AgentRuntime, TurnSummary from synth_panel.structured.output import StructuredOutputConfig, StructuredOutputEngine @@ -1180,6 +1181,34 @@ def _run_panelist( } tracker.record_turn(summary.usage) + # sy-547: when the question declares a typed + # ``response_schema`` (enum/scale), coerce the free-text + # answer to the nearest declared option / in-range integer + # and persist BOTH the raw text (``response``, untouched) + # and the typed value (``response_typed``). The schema kind + # is stamped on the response so poll-summary/analyze bucket + # the question as enum/scale instead of falling back to + # ``kind=text``. Unmappable answers set ``schema_unmapped`` + # so the caller can tally a run-level failure count and emit + # a per-response warning. + q_schema = question.get("response_schema") if isinstance(question, dict) else None + if is_typed_schema(q_schema): + resp_dict["response_schema"] = q_schema + coerced = coerce_response(q_schema, response_text) + if coerced is not None: + if coerced.mapped: + resp_dict["response_typed"] = coerced.value + else: + resp_dict["schema_unmapped"] = True + logger.warning( + "panelist %s q%d: answer %r could not be mapped to " + "declared %s response_schema; stored as raw text only", + name, + qi, + response_text[:80], + coerced.kind, + ) + # Extraction pass: extract structured data from the # free-text response (--extract-schema). 
if extract_engine and extract_config: diff --git a/src/synth_panel/poll_summary.py b/src/synth_panel/poll_summary.py index 1fbe8ed..78a94b2 100644 --- a/src/synth_panel/poll_summary.py +++ b/src/synth_panel/poll_summary.py @@ -786,6 +786,26 @@ def _extract_first_choice( else: keys = _FIRST_CHOICE_KEYS + # sy-547: the orchestrator coerces enum answers against the declared + # response_schema and persists the canonical option under + # ``response_typed``. Trust it first — it's already mapped to a valid + # option (e.g. ``"Blue."`` → ``"blue"``), so it wins over re-parsing + # the raw free text. + typed = response_dict.get("response_typed") + if isinstance(typed, str) and typed.strip(): + return typed.strip() + if isinstance(typed, (int, float)) and not isinstance(typed, bool): + return str(typed) + + # sy-547: when the response carries an enum response_schema but no typed + # value, the orchestrator already determined the free text could not be + # mapped to a declared option (``schema_unmapped``). Do NOT fall back to + # the raw string — counting it as a choice would resurrect exactly the + # off-schema free-text the coercion layer rejected. Report unparseable. + resp_schema = response_dict.get("response_schema") + if isinstance(resp_schema, dict) and resp_schema.get("type") == "enum": + return None + extraction = response_dict.get("extraction") if isinstance(extraction, dict): for key in keys: @@ -845,6 +865,12 @@ def _extract_score( else: keys = _SCORE_KEYS + # sy-547: prefer the schema-coerced integer the orchestrator persisted + # for scale questions (already validated to be inside [min, max]). + typed = response_dict.get("response_typed") + if isinstance(typed, (int, float)) and not isinstance(typed, bool): + return float(typed) + raw = response_dict.get("response") # Top-level numeric response wins when the whole response is a # scalar (free-text scale schemas). 
For dict responses we look diff --git a/src/synth_panel/preflight.py b/src/synth_panel/preflight.py new file mode 100644 index 0000000..25fce8c --- /dev/null +++ b/src/synth_panel/preflight.py @@ -0,0 +1,155 @@ +"""Model reachability pre-flight (sy-546). + +An ensemble / weighted / blend run lists several models via ``--models``. +When one slug is bogus (e.g. ``openrouter/google/gemini-2.0-flash-001``, +which 404s on every OpenRouter call), the historical behaviour was a +silent degrade: every call to that member errored, the run completed on +the survivors, and a ``--blend`` quietly became a 2-model blend. The 404 +is deterministic and knowable before any persona work, so this module +probes each distinct model once with a 1-token completion and classifies +the result: + +* reachable — the probe succeeded. +* unreachable — the provider rejected the model itself (404 / "no + endpoints" / model-not-found style ``BAD_REQUEST``). This is the + fail-fast signal. +* inconclusive — a transient or environmental failure (rate limit, + server error, transport, or missing credentials). We do NOT block the + run on these: they are not a property of the slug, and a flaky probe + must not abort an otherwise-valid panel. + +:func:`preflight_models` runs the probes (concurrently, via the shared +``LLMClient`` throttles) and returns a :class:`PreflightReport`. The CLI +calls it before spending — on the real run AND on ``--dry-run`` — and +aborts with a message naming the bad slug(s) when any model is +unreachable. 
@dataclass(frozen=True)
class ModelProbe:
    """Outcome of probing a single model slug."""

    model: str
    # One of "reachable", "unreachable" or "inconclusive".
    status: str
    # Truncated provider error text; ``_probe_one`` leaves it ``None`` when
    # the probe succeeds.
    detail: str | None = None

    @property
    def unreachable(self) -> bool:
        """True iff the provider rejected the slug itself."""
        return self.status == "unreachable"


@dataclass
class PreflightReport:
    """Aggregate result of probing every model in a run."""

    probes: list[ModelProbe] = field(default_factory=list)

    @property
    def unreachable(self) -> list[ModelProbe]:
        """Probes whose slug was rejected, in report order."""
        return [p for p in self.probes if p.unreachable]

    @property
    def ok(self) -> bool:
        """True iff no model was proven unreachable."""
        return not self.unreachable

    def failure_message(self) -> str:
        """Render the actionable abort message naming the bad slug(s)."""
        bad = self.unreachable
        lines = [
            f"Pre-flight failed: {len(bad)} model(s) in --models are unreachable "
            "(the provider rejected the slug, not a transient error):",
        ]
        for p in bad:
            detail = f" — {p.detail}" if p.detail else ""
            lines.append(f" - {p.model}: model not found / no endpoints{detail}")
        lines.append(
            "Fix the slug(s) above (check the provider's model catalog) or drop "
            "them from --models. Pass --skip-preflight to bypass this check."
        )
        return "\n".join(lines)


# Lower-cased fragments that providers put in "this model does not exist"
# errors. Used to tell a bad slug apart from other client-side rejections
# that share the BAD_REQUEST category.
_SLUG_REJECTION_MARKERS = (
    "no endpoints",
    "not a valid model",
    "model not found",
    "model_not_found",
    "unknown model",
    "invalid model",
    "no such model",
    "does not exist",
)


def _is_slug_rejection(status_code: int | None, message: str) -> bool:
    """Return True when an error looks like "the model itself is unknown".

    A 404 is accepted outright. Any other status (or a missing one) only
    qualifies when the message contains a known missing-model marker, so a
    402 "insufficient credits" or a 400 about an unsupported request
    parameter is not mistaken for a bad slug.
    """
    if status_code == 404:
        return True
    lowered = (message or "").lower()
    return any(marker in lowered for marker in _SLUG_REJECTION_MARKERS)


def _probe_one(client: LLMClient, model: str) -> ModelProbe:
    """Probe a single model with a minimal 1-token completion.

    Returns a ``reachable`` probe when ``client.send`` succeeds, an
    ``unreachable`` probe when the error is in ``_UNREACHABLE_CATEGORIES``
    *and* looks like a missing-model rejection, and an ``inconclusive``
    probe for every other failure. ``detail`` holds the first 200
    characters of the error text.
    """
    request = CompletionRequest(
        model=model,
        max_tokens=1,
        messages=[InputMessage(role="user", content=[TextBlock(text="ping")])],
        temperature=0.0,
        cache_enabled=False,
    )
    try:
        client.send(request)
    except LLMError as exc:
        text = str(exc)
        # BAD_REQUEST covers most 4xx responses, so the category alone does
        # not prove the slug is wrong: the request may have been rejected
        # for the probe's own parameters or for billing reasons. Only a
        # model-not-found style rejection is a property of the slug.
        slug_rejected = exc.category in _UNREACHABLE_CATEGORIES and _is_slug_rejection(
            getattr(exc, "status_code", None), text
        )
        if slug_rejected:
            logger.info("preflight: %s unreachable (%s)", model, exc.category.value)
            return ModelProbe(model=model, status="unreachable", detail=text[:200])
        # Anything else is inconclusive — don't block the run on a flake or
        # an environment problem the operator can see in the real error.
        logger.info("preflight: %s inconclusive (%s)", model, exc.category.value)
        return ModelProbe(model=model, status="inconclusive", detail=text[:200])
    except Exception as exc:  # pragma: no cover - defensive
        logger.info("preflight: %s inconclusive (%s)", model, type(exc).__name__)
        return ModelProbe(model=model, status="inconclusive", detail=str(exc)[:200])
    return ModelProbe(model=model, status="reachable")


def preflight_models(
    models: list[str],
    *,
    client: LLMClient | None = None,
) -> PreflightReport:
    """Probe each distinct slug in *models* once and return a report.

    Falsy entries are ignored and duplicates are probed once. Order in the
    report follows first appearance in *models*. Probes run concurrently on
    up to 8 worker threads. An empty input returns an empty report without
    constructing a client.
    """
    distinct: list[str] = list(dict.fromkeys(m for m in models if m))
    if not distinct:
        return PreflightReport()

    # ``is not None`` rather than truthiness: an injected client must be
    # honoured even if it happens to evaluate falsy.
    probe_client = client if client is not None else LLMClient()
    with ThreadPoolExecutor(max_workers=min(len(distinct), 8)) as pool:
        # ``map`` yields results in submission order, matching ``distinct``.
        probes = list(pool.map(lambda slug: _probe_one(probe_client, slug), distinct))

    return PreflightReport(probes=probes)
@dataclass(frozen=True)
class CoercionResult:
    """Outcome of coercing one free-text answer against a typed schema.

    ``kind`` is the declared schema type (``"enum"`` / ``"scale"``).
    ``value`` is the typed result (the canonical option string for enum,
    an ``int`` for scale) or ``None`` when the answer could not be mapped.
    ``mapped`` is ``True`` iff a value was recovered. ``raw`` echoes the
    original free-text answer for persistence.
    """

    kind: str
    raw: str
    value: Any | None
    mapped: bool


# Punctuation/whitespace stripped from both the answer and each option
# before comparison. Keeps internal characters (e.g. "price-band-a") so
# hyphenated options still match.
_EDGE_PUNCT = " \t\r\n.,;:!?\"'`()[]{}*_-—–"  # noqa: RUF001 - em/en dashes are intentional strip chars

# First numeric token of a scale answer, with an optional fractional part
# so "7.5" is seen as one token instead of the integer 7 followed by ".5".
_NUMBER_TOKEN = re.compile(r"-?\d+(?:\.\d+)?")

# Non-word placeholder written over a matched option so a shorter option
# nested inside it cannot match the same span again.
_MASK = "\x00"


def _normalize(text: str) -> str:
    """Lowercase, collapse internal whitespace, strip edge punctuation."""
    collapsed = re.sub(r"\s+", " ", text).strip()
    return collapsed.strip(_EDGE_PUNCT).lower()


def is_typed_schema(response_schema: Any) -> bool:
    """True iff ``response_schema`` declares an enforceable typed shape.

    Only ``enum`` and ``scale`` are enforceable here; ``text`` and
    ``tagged_themes`` (and legacy inline JSON Schemas) are not coerced.
    """
    if not isinstance(response_schema, dict):
        return False
    return response_schema.get("type") in {"enum", "scale"}


def _token_pattern(needle: str) -> re.Pattern[str]:
    """Compile a pattern matching *needle* only on word boundaries."""
    return re.compile(rf"(?<!\w){re.escape(needle)}(?!\w)")


def coerce_enum(raw: str, options: list[str]) -> CoercionResult:
    """Map a free-text answer to one of *options* (case/punctuation-insensitive).

    Resolution order:
      1. Exact normalized equality with an option.
      2. Unique token-bounded hit — exactly one option appears in the
         normalized answer (``"I'd pick green"`` → ``"green"``). Options
         are tried longest-first and each matched span is masked, so an
         option nested inside another (``"agree"`` in ``"strongly
         agree"``) is not counted a second time for the same span.
         Answers that still name two options do NOT map, so a hedging
         answer is flagged rather than arbitrarily resolved.

    Options that normalize to the same text are treated as one option; the
    first spelling listed is the canonical value returned.
    """
    answer = _normalize(raw)

    # normalized form -> first canonical spelling; blank options are ignored.
    canonical_by_norm: dict[str, str] = {}
    for option in options:
        norm = _normalize(option)
        if norm:
            canonical_by_norm.setdefault(norm, option)

    # 1. Exact normalized match.
    exact = canonical_by_norm.get(answer)
    if exact is not None:
        return CoercionResult(kind="enum", raw=raw, value=exact, mapped=True)

    # 2. Unique token-bounded match, longest option first.
    remaining = answer
    hits: list[str] = []
    for norm in sorted(canonical_by_norm, key=len, reverse=True):
        pattern = _token_pattern(norm)
        if pattern.search(remaining):
            hits.append(canonical_by_norm[norm])
            remaining = pattern.sub(_MASK, remaining)
    if len(hits) == 1:
        return CoercionResult(kind="enum", raw=raw, value=hits[0], mapped=True)

    return CoercionResult(kind="enum", raw=raw, value=None, mapped=False)


def _contains_token(haystack: str, needle: str) -> bool:
    """True iff *needle* appears in *haystack* on word boundaries.

    Both are expected to be normalized already (lowercased, edge-stripped).
    Word boundaries avoid ``"red"`` matching inside ``"predisposed"`` while
    still catching multi-word options like ``"price band a"``.
    """
    if needle == haystack:
        return True
    return _token_pattern(needle).search(haystack) is not None


def coerce_scale(raw: str, lo: int, hi: int) -> CoercionResult:
    """Map a free-text answer to an integer in ``[lo, hi]``.

    Takes the first numeric token of the answer and accepts it only when
    it is a whole number inside the declared range. ``"7"``, ``"7.0"`` and
    ``"I'd say 7 out of 10"`` → ``7``. ``"eleven"``, ``"42"`` (out of
    range) and ``"7.5"`` (not an integer — truncating it would record a
    value the panelist never gave) → unmapped.
    """
    unmapped = CoercionResult(kind="scale", raw=raw, value=None, mapped=False)
    found = _NUMBER_TOKEN.search(raw)
    if found is None:
        return unmapped
    whole, _, fraction = found.group().partition(".")
    if fraction.strip("0"):
        # Non-zero fractional part: not an integer answer.
        return unmapped
    n = int(whole)
    if lo <= n <= hi:
        return CoercionResult(kind="scale", raw=raw, value=n, mapped=True)
    return unmapped


def coerce_response(response_schema: Any, raw: Any) -> CoercionResult | None:
    """Coerce *raw* against a declared typed ``response_schema``.

    Returns ``None`` when the schema is not an enforceable typed shape
    (``is_typed_schema`` is false), when *raw* is not a non-blank string,
    or when the schema lacks usable ``options`` / integer ``min`` and
    ``max`` — the caller leaves the response untouched in those cases.
    Otherwise returns a :class:`CoercionResult` (``mapped`` may be
    ``False``).
    """
    if not is_typed_schema(response_schema):
        return None
    if not isinstance(raw, str) or not raw.strip():
        return None

    kind = response_schema["type"]
    if kind == "enum":
        options = response_schema.get("options")
        if not isinstance(options, list) or not options:
            return None
        return coerce_enum(raw, [o for o in options if isinstance(o, str)])
    if kind == "scale":
        lo = response_schema.get("min")
        hi = response_schema.get("max")
        if not isinstance(lo, int) or not isinstance(hi, int):
            return None
        return coerce_scale(raw, lo, hi)
    return None
class _FakeClient:
    """Minimal ``LLMClient`` double whose ``send`` is scripted per model.

    ``behaviour`` maps a model slug to either an exception instance (raised
    by ``send``) or any other value (returned by ``send``). Slugs missing
    from the mapping return ``"ok"``. Every probed slug is appended to
    ``calls`` in call order.
    """

    def __init__(self, behaviour: dict[str, object]) -> None:
        self.calls: list[str] = []
        self._behaviour = behaviour

    def send(self, request: CompletionRequest):
        slug = request.model
        self.calls.append(slug)
        scripted = self._behaviour.get(slug, "ok")
        if not isinstance(scripted, Exception):
            return scripted  # truthy sentinel — preflight ignores the value
        raise scripted


def test_bad_slug_is_flagged_unreachable() -> None:
    """A 404'ing slug is reported unreachable and named in the message."""
    not_found = LLMError(
        "OpenRouter API error 404: No endpoints found for google/gemini-2.0-flash-001.",
        LLMErrorCategory.BAD_REQUEST,
        status_code=404,
    )
    fake = _FakeClient({BAD_SLUG: not_found, GOOD_SLUG: "ok"})

    report = preflight_models([GOOD_SLUG, BAD_SLUG], client=fake)

    assert not report.ok
    flagged = [probe.model for probe in report.unreachable]
    assert flagged == [BAD_SLUG]
    message = report.failure_message()
    assert BAD_SLUG in message
    assert "unreachable" in message.lower()


def test_all_reachable_is_ok() -> None:
    """Two healthy slugs produce an ok report with nothing unreachable."""
    other = "openrouter/anthropic/claude-haiku-4.5"
    fake = _FakeClient({GOOD_SLUG: "ok", other: "ok"})

    report = preflight_models([GOOD_SLUG, other], client=fake)

    assert report.ok
    assert report.unreachable == []


def test_rate_limit_is_inconclusive_not_unreachable() -> None:
    """A rate-limited probe is inconclusive and leaves the report ok."""
    # A 429 must NOT abort the run — it isn't a property of the slug.
    throttled = LLMError("rate limited", LLMErrorCategory.RATE_LIMIT, status_code=429)
    fake = _FakeClient({GOOD_SLUG: throttled})

    report = preflight_models([GOOD_SLUG], client=fake)

    assert report.ok
    by_model = {probe.model: probe.status for probe in report.probes}
    assert by_model[GOOD_SLUG] == "inconclusive"


def test_missing_credentials_is_inconclusive() -> None:
    """A missing-credentials failure does not mark the slug unreachable."""
    no_key = LLMError("no key", LLMErrorCategory.MISSING_CREDENTIALS)
    report = preflight_models([GOOD_SLUG], client=_FakeClient({GOOD_SLUG: no_key}))
    assert report.ok


def test_distinct_models_probed_once_each() -> None:
    """A slug repeated in the input is probed a single time."""
    fake = _FakeClient({GOOD_SLUG: "ok"})
    preflight_models([GOOD_SLUG] * 3, client=fake)
    assert fake.calls.count(GOOD_SLUG) == 1


def test_empty_models_returns_empty_report() -> None:
    """No models in, empty-but-ok report out."""
    report = preflight_models([], client=_FakeClient({}))
    assert isinstance(report, PreflightReport)
    assert report.ok
    assert report.probes == []
+""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from synth_panel.main import main +from synth_panel.preflight import ModelProbe, PreflightReport + +BAD = "openrouter/google/gemini-2.0-flash-001" +GOOD_A = "openrouter/openai/gpt-4o-mini" +GOOD_B = "openrouter/anthropic/claude-haiku-4.5" + + +def _write_inputs(tmp_path: Path) -> tuple[Path, Path]: + personas = tmp_path / "personas.yaml" + personas.write_text("personas:\n - name: A\n - name: B\n") + survey = tmp_path / "survey.yaml" + survey.write_text("instrument:\n version: 1\n questions:\n - text: Q?\n") + return personas, survey + + +def _report(reachable: list[str], unreachable: list[str]) -> PreflightReport: + probes = [ModelProbe(model=m, status="reachable") for m in reachable] + probes += [ModelProbe(model=m, status="unreachable", detail="404 no endpoints") for m in unreachable] + return PreflightReport(probes=probes) + + +@patch("synth_panel.preflight.preflight_models") +def test_bad_slug_aborts_naming_it( + mock_preflight: MagicMock, + capsys: pytest.CaptureFixture[str], + tmp_path: Path, +) -> None: + mock_preflight.return_value = _report([GOOD_A], [BAD]) + personas, survey = _write_inputs(tmp_path) + + code = main( + [ + "panel", + "run", + "--personas", + str(personas), + "--instrument", + str(survey), + "--models", + f"{GOOD_A},{BAD}", + "--no-synthesis", + ] + ) + + assert code == 1 + err = capsys.readouterr().err + assert "Pre-flight failed" in err + assert BAD in err + # The orchestrator never ran — preflight was the gate. 
+ mock_preflight.assert_called_once() + + +@patch("synth_panel.orchestrator.AgentRuntime") +@patch("synth_panel.preflight.preflight_models") +def test_skip_preflight_bypasses_check( + mock_preflight: MagicMock, + mock_runtime: MagicMock, + tmp_path: Path, +) -> None: + mock_preflight.return_value = _report([], [BAD]) # would abort if consulted + runtime = MagicMock() + from synth_panel.cost import TokenUsage as CostTokenUsage + from synth_panel.persistence import ConversationMessage + from synth_panel.runtime import TurnSummary + + usage = CostTokenUsage(input_tokens=5, output_tokens=2) + runtime.run_turn.return_value = TurnSummary( + assistant_messages=[ + ConversationMessage(role="assistant", content=[{"type": "text", "text": "ok"}], usage=usage) + ], + iterations=1, + usage=usage, + ) + mock_runtime.return_value = runtime + personas, survey = _write_inputs(tmp_path) + + code = main( + [ + "panel", + "run", + "--personas", + str(personas), + "--instrument", + str(survey), + "--models", + f"{GOOD_A},{BAD}", + "--skip-preflight", + "--no-synthesis", + "--max-concurrent", + "1", + ] + ) + + # With --skip-preflight the bad-slug report is never consulted. 
+ mock_preflight.assert_not_called() + assert code == 0 + + +@patch("synth_panel.preflight.preflight_models") +def test_dry_run_performs_preflight( + mock_preflight: MagicMock, + capsys: pytest.CaptureFixture[str], + tmp_path: Path, +) -> None: + mock_preflight.return_value = _report([GOOD_A], [BAD]) + personas, survey = _write_inputs(tmp_path) + + code = main( + [ + "panel", + "run", + "--personas", + str(personas), + "--instrument", + str(survey), + "--models", + f"{GOOD_A},{BAD}", + "--dry-run", + "--no-synthesis", + ] + ) + + assert code == 1 + err = capsys.readouterr().err + assert BAD in err + mock_preflight.assert_called_once() + + +@patch("synth_panel.orchestrator.AgentRuntime") +@patch("synth_panel.preflight.preflight_models") +def test_min_models_allows_degraded_run( + mock_preflight: MagicMock, + mock_runtime: MagicMock, + capsys: pytest.CaptureFixture[str], + tmp_path: Path, +) -> None: + # 2 reachable, 1 bad; --min-models 2 should proceed with a warning. + mock_preflight.return_value = _report([GOOD_A, GOOD_B], [BAD]) + runtime = MagicMock() + from synth_panel.cost import TokenUsage as CostTokenUsage + from synth_panel.persistence import ConversationMessage + from synth_panel.runtime import TurnSummary + + usage = CostTokenUsage(input_tokens=5, output_tokens=2) + runtime.run_turn.return_value = TurnSummary( + assistant_messages=[ + ConversationMessage(role="assistant", content=[{"type": "text", "text": "ok"}], usage=usage) + ], + iterations=1, + usage=usage, + ) + mock_runtime.return_value = runtime + personas, survey = _write_inputs(tmp_path) + + code = main( + [ + "panel", + "run", + "--personas", + str(personas), + "--instrument", + str(survey), + "--models", + f"{GOOD_A},{GOOD_B},{BAD}", + "--min-models", + "2", + "--no-synthesis", + "--max-concurrent", + "1", + ] + ) + + assert code == 0 + err = capsys.readouterr().err + assert "WARNING" in err + assert BAD in err diff --git a/tests/test_response_coercion.py b/tests/test_response_coercion.py new 
file mode 100644 index 0000000..62a662b --- /dev/null +++ b/tests/test_response_coercion.py @@ -0,0 +1,107 @@ +"""sy-547: post-hoc coercion of free-text answers to a typed response_schema. + +Covers the enum/scale mapping rules in +:mod:`synth_panel.response_coercion`: + +* ``"Blue."`` → ``"blue"`` (case + trailing punctuation normalized). +* Unmappable / ambiguous answers report ``mapped=False`` (no fabrication). +* Scale answers coerce to an in-range integer, reject out-of-range. +* Non-typed schemas / non-string raw values return ``None`` (caller skips). +""" + +from __future__ import annotations + +from synth_panel.response_coercion import ( + CoercionResult, + coerce_enum, + coerce_response, + coerce_scale, + is_typed_schema, +) + + +class TestIsTypedSchema: + def test_enum_and_scale_are_typed(self) -> None: + assert is_typed_schema({"type": "enum", "options": ["a"]}) + assert is_typed_schema({"type": "scale", "min": 1, "max": 5}) + + def test_text_and_tagged_and_legacy_are_not(self) -> None: + assert not is_typed_schema({"type": "text"}) + assert not is_typed_schema({"type": "tagged_themes", "taxonomy": ["x"]}) + assert not is_typed_schema({"type": "object", "properties": {}}) + assert not is_typed_schema(None) + assert not is_typed_schema("nope") + + +class TestCoerceEnum: + OPTIONS = ["red", "green", "blue"] + + def test_repro_blue_period_maps_to_blue(self) -> None: + # The exact #547 repro: model answers "Blue." for a lowercase enum. 
+ result = coerce_enum("Blue.", self.OPTIONS) + assert result == CoercionResult(kind="enum", raw="Blue.", value="blue", mapped=True) + + def test_exact_case_insensitive(self) -> None: + assert coerce_enum("GREEN", self.OPTIONS).value == "green" + + def test_substring_in_prose(self) -> None: + result = coerce_enum("I would definitely pick green here", self.OPTIONS) + assert result.value == "green" + assert result.mapped + + def test_word_boundary_avoids_false_substring(self) -> None: + # "red" must not match inside "predisposed". + result = coerce_enum("I am predisposed to neither", self.OPTIONS) + assert not result.mapped + assert result.value is None + + def test_ambiguous_answer_does_not_map(self) -> None: + # Mentions two options — refuse to guess. + result = coerce_enum("between red and blue", self.OPTIONS) + assert not result.mapped + assert result.value is None + + def test_unmappable_answer(self) -> None: + result = coerce_enum("Maybe a nice teal?", self.OPTIONS) + assert not result.mapped + + def test_multiword_option_matches(self) -> None: + opts = ["price band a", "price band b"] + assert coerce_enum("I'd choose Price Band A.", opts).value == "price band a" + + +class TestCoerceScale: + def test_in_range_integer(self) -> None: + assert coerce_scale("7", 1, 10).value == 7 + + def test_integer_in_prose(self) -> None: + result = coerce_scale("I'd say 7 out of 10", 1, 10) + assert result.value == 7 + assert result.mapped + + def test_out_of_range_does_not_map(self) -> None: + result = coerce_scale("42", 1, 10) + assert not result.mapped + assert result.value is None + + def test_no_number_does_not_map(self) -> None: + assert not coerce_scale("eleven", 1, 10).mapped + + +class TestCoerceResponse: + def test_enum_dispatch(self) -> None: + schema = {"type": "enum", "options": ["red", "green", "blue"]} + assert coerce_response(schema, "Blue.").value == "blue" + + def test_scale_dispatch(self) -> None: + schema = {"type": "scale", "min": 1, "max": 5} + assert 
coerce_response(schema, "3").value == 3 + + def test_non_typed_schema_returns_none(self) -> None: + assert coerce_response({"type": "text"}, "anything") is None + + def test_non_string_raw_returns_none(self) -> None: + schema = {"type": "enum", "options": ["a"]} + assert coerce_response(schema, None) is None + assert coerce_response(schema, {"a": 1}) is None + assert coerce_response(schema, " ") is None diff --git a/tests/test_response_schema_enforcement.py b/tests/test_response_schema_enforcement.py new file mode 100644 index 0000000..6fa1f91 --- /dev/null +++ b/tests/test_response_schema_enforcement.py @@ -0,0 +1,271 @@ +"""sy-547: end-to-end enforcement of typed response_schema. + +Two layers: + +* Orchestrator integration — a question with an enum/scale + ``response_schema`` coerces the free-text answer to a typed value, + persists BOTH raw (``response``) and typed (``response_typed``) plus the + schema kind, and flags unmappable answers with ``schema_unmapped``. +* Poll-summary integration — a saved result carrying the persisted + ``response_schema`` + ``response_typed`` buckets the question as ``enum`` + (not ``text``) and counts unmappable answers as unparseable. 
+""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +from synth_panel.cost import TokenUsage as CostTokenUsage +from synth_panel.orchestrator import run_panel_parallel +from synth_panel.persistence import ConversationMessage +from synth_panel.poll_summary import build_poll_summary +from synth_panel.runtime import TurnSummary + + +def _system(p: dict[str, Any]) -> str: + return f"You are {p['name']}" + + +def _question(q: dict[str, Any]) -> str: + return q["text"] + + +def _turn(text: str) -> TurnSummary: + usage = CostTokenUsage(input_tokens=10, output_tokens=5) + msg = ConversationMessage(role="assistant", content=[{"type": "text", "text": text}], usage=usage) + return TurnSummary(assistant_messages=[msg], iterations=1, usage=usage) + + +# --------------------------------------------------------------------------- +# Orchestrator coercion +# --------------------------------------------------------------------------- + + +@patch("synth_panel.orchestrator.AgentRuntime") +def test_enum_answer_is_coerced_and_persisted(mock_runtime_cls: MagicMock) -> None: + """The #547 repro: 'Blue.' for an enum coerces to 'blue', raw kept.""" + questions = [ + { + "text": "Pick exactly one color and output only that.", + "response_schema": {"type": "enum", "options": ["red", "green", "blue"]}, + } + ] + + def runtime_factory(*args, **kwargs): + runtime = MagicMock() + runtime.run_turn.side_effect = lambda prompt: _turn("Blue.") + return runtime + + mock_runtime_cls.side_effect = runtime_factory + + results, _reg, _sess = run_panel_parallel( + client=MagicMock(), + personas=[{"name": "P0"}], + questions=questions, + model="claude-sonnet-4-6", + system_prompt_fn=_system, + question_prompt_fn=_question, + max_workers=1, + ) + + resp = results[0].responses[0] + assert resp["response"] == "Blue." 
# raw text preserved + assert resp["response_typed"] == "blue" # coerced typed value + assert resp["response_schema"]["type"] == "enum" + assert "schema_unmapped" not in resp + + +@patch("synth_panel.orchestrator.AgentRuntime") +def test_unmappable_enum_answer_is_flagged(mock_runtime_cls: MagicMock) -> None: + questions = [ + { + "text": "Pick a color.", + "response_schema": {"type": "enum", "options": ["red", "green", "blue"]}, + } + ] + + def runtime_factory(*args, **kwargs): + runtime = MagicMock() + runtime.run_turn.side_effect = lambda prompt: _turn("I'd go with a nice teal.") + return runtime + + mock_runtime_cls.side_effect = runtime_factory + + results, _reg, _sess = run_panel_parallel( + client=MagicMock(), + personas=[{"name": "P0"}], + questions=questions, + model="claude-sonnet-4-6", + system_prompt_fn=_system, + question_prompt_fn=_question, + max_workers=1, + ) + + resp = results[0].responses[0] + assert resp["response"] == "I'd go with a nice teal." + assert resp.get("schema_unmapped") is True + assert "response_typed" not in resp + + +@patch("synth_panel.orchestrator.AgentRuntime") +def test_scale_answer_coerced_to_int(mock_runtime_cls: MagicMock) -> None: + questions = [{"text": "Rate 1-5.", "response_schema": {"type": "scale", "min": 1, "max": 5}}] + + def runtime_factory(*args, **kwargs): + runtime = MagicMock() + runtime.run_turn.side_effect = lambda prompt: _turn("I'd say 4 out of 5.") + return runtime + + mock_runtime_cls.side_effect = runtime_factory + + results, _reg, _sess = run_panel_parallel( + client=MagicMock(), + personas=[{"name": "P0"}], + questions=questions, + model="claude-sonnet-4-6", + system_prompt_fn=_system, + question_prompt_fn=_question, + max_workers=1, + ) + + resp = results[0].responses[0] + assert resp["response_typed"] == 4 + + +@patch("synth_panel.orchestrator.AgentRuntime") +def test_text_schema_left_untouched(mock_runtime_cls: MagicMock) -> None: + questions = [{"text": "Tell me a story.", "response_schema": 
{"type": "text"}}] + + def runtime_factory(*args, **kwargs): + runtime = MagicMock() + runtime.run_turn.side_effect = lambda prompt: _turn("Once upon a time...") + return runtime + + mock_runtime_cls.side_effect = runtime_factory + + results, _reg, _sess = run_panel_parallel( + client=MagicMock(), + personas=[{"name": "P0"}], + questions=questions, + model="claude-sonnet-4-6", + system_prompt_fn=_system, + question_prompt_fn=_question, + max_workers=1, + ) + + resp = results[0].responses[0] + assert "response_typed" not in resp + assert "response_schema" not in resp + assert "schema_unmapped" not in resp + + +# --------------------------------------------------------------------------- +# Poll-summary buckets the persisted enum schema +# --------------------------------------------------------------------------- + + +def _saved_envelope() -> dict[str, Any]: + """A saved result shaped like ``save_panel_result`` writes after sy-547, + carrying the question's response_schema + per-response response_typed.""" + return { + "persona_count": 3, + "question_count": 1, + "questions": [ + { + "text": "Pick exactly one color and output only that.", + "response_schema": {"type": "enum", "options": ["red", "green", "blue"]}, + } + ], + "results": [ + { + "persona": "P0", + "responses": [ + { + "question": "Pick...", + "response": "Blue.", + "response_typed": "blue", + "response_schema": {"type": "enum", "options": ["red", "green", "blue"]}, + } + ], + }, + { + "persona": "P1", + "responses": [ + { + "question": "Pick...", + "response": "blue", + "response_typed": "blue", + "response_schema": {"type": "enum", "options": ["red", "green", "blue"]}, + } + ], + }, + { + "persona": "P2", + "responses": [ + { + "question": "Pick...", + "response": "teal", + "schema_unmapped": True, + "response_schema": {"type": "enum", "options": ["red", "green", "blue"]}, + } + ], + }, + ], + } + + +def test_poll_summary_buckets_persisted_enum_as_enum() -> None: + summary = 
build_poll_summary(_saved_envelope()) + q0 = summary.questions[0] + assert q0.kind == "enum" + # Two panelists coerced to "blue"; the third (teal) is unparseable. + assert q0.first_choice_counts == {"blue": 2} + assert q0.winner == "blue" + assert q0.n_unparseable == 1 + + +# --------------------------------------------------------------------------- +# sy-546: blend-member drop detection +# --------------------------------------------------------------------------- + + +class _FakePR: + def __init__(self, responses: list[dict[str, Any]]) -> None: + self.responses = responses + + +class _FakeMR: + def __init__(self, model: str, panelist_results: list[_FakePR]) -> None: + self.model = model + self.panelist_results = panelist_results + + +class _FakeEnsemble: + def __init__(self, model_results: list[_FakeMR]) -> None: + self.model_results = model_results + + +def test_blend_dropped_models_detects_all_error_member() -> None: + from synth_panel.cli.commands import _blend_dropped_models + + good = _FakeMR("good-model", [_FakePR([{"question": "Q", "response": "an answer"}])]) + # Every response is an error / inline error string → dropped. + bad = _FakeMR( + "bad-model", + [ + _FakePR([{"question": "Q", "response": "[error: OpenRouter API error 404]", "error": True}]), + _FakePR([{"question": "Q", "response": "[error: OpenRouter API error 404]", "error": True}]), + ], + ) + ensemble = _FakeEnsemble([good, bad]) + + assert _blend_dropped_models(ensemble) == ["bad-model"] + + +def test_blend_dropped_models_empty_when_all_healthy() -> None: + from synth_panel.cli.commands import _blend_dropped_models + + a = _FakeMR("a", [_FakePR([{"question": "Q", "response": "x"}])]) + b = _FakeMR("b", [_FakePR([{"question": "Q", "response": "y"}])]) + assert _blend_dropped_models(_FakeEnsemble([a, b])) == []