From 511370a6934fa4c124893b603179c06fac93ac63 Mon Sep 17 00:00:00 2001 From: Thormatt Date: Thu, 11 Jun 2026 22:36:36 -0400 Subject: [PATCH 1/4] feat(ingest): PDF ingestion via pypdf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #1 product gap from the review: the target users' corpora (credit memos, clinical summaries, contracts) are PDFs, and ingest rejected them outright. orc ingest now accepts .pdf files and application/pdf URLs: per-page text extraction with pypdf, pages joined with blank lines, PDF metadata /Title preferred over the heading-scan fallback. Owner- password-locked PDFs with an empty user password (common for distributed contracts) open via decrypt(""); truly password-protected, unparseable, and scanned/image-only PDFs are refused loudly — silently ingesting an empty corpus would produce confident not_found verdicts downstream. pypdf internals never leak; callers see ValueError. All existing URL guards (SSRF pinning, redirect re-validation, size limit) sit in front of the PDF branch unchanged. The request User-Agent now derives from orc.__version__ instead of a hardcoded stale string. Co-Authored-By: Claude Fable 5 --- pyproject.toml | 1 + src/orc/ingest/loaders.py | 67 +++++++++++++++- tests/unit/test_loaders.py | 151 +++++++++++++++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 91b101b..ba2cf83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ dependencies = [ "rich>=13.0", "python-ulid>=2.0", "python-dotenv>=1.0", + "pypdf>=4.0", ] [project.optional-dependencies] diff --git a/src/orc/ingest/loaders.py b/src/orc/ingest/loaders.py index e4ab577..eedb62c 100644 --- a/src/orc/ingest/loaders.py +++ b/src/orc/ingest/loaders.py @@ -1,8 +1,14 @@ -"""File and URL loaders. Each returns a `LoadedDoc` with raw bytes + decoded text.""" +"""File and URL loaders. 
Each returns a `LoadedDoc` with raw bytes + extracted text. + +Supported formats: the text mimes in SUPPORTED_TEXT_MIMES (markdown, plain text, +HTML, reST, JSON) plus application/pdf, whose text is extracted with pypdf. +Scanned/image-only PDFs are rejected — OCR is not supported. +""" from __future__ import annotations import hashlib +import io import ipaddress import mimetypes import socket @@ -11,10 +17,15 @@ from urllib.parse import urljoin, urlparse import httpx +from pypdf import PdfReader + +from orc import __version__ MAX_URL_BYTES = 25 * 1024 * 1024 MAX_REDIRECTS = 5 +PDF_MIME = "application/pdf" + SUPPORTED_TEXT_MIMES = { "text/markdown", "text/x-markdown", @@ -37,6 +48,15 @@ class LoadedDoc: def load_file(path: Path) -> LoadedDoc: raw_bytes = path.read_bytes() mime = _guess_mime(path) + if mime == PDF_MIME: + text, pdf_title = _extract_pdf(raw_bytes, source=str(path)) + return LoadedDoc( + source_uri=str(path.resolve()), + title=pdf_title or _extract_title(text, fallback=path.stem), + mime_type=mime, + text=text, + raw_bytes=raw_bytes, + ) if mime not in SUPPORTED_TEXT_MIMES and not mime.startswith("text/"): raise ValueError(f"Unsupported file type for ingest: {mime} ({path})") text = raw_bytes.decode("utf-8", errors="replace") @@ -133,7 +153,7 @@ def load_url( with httpx.Client( timeout=timeout, follow_redirects=False, - headers={"User-Agent": "orc/0.1.0"}, + headers={"User-Agent": f"orc/{__version__}"}, transport=transport, ) as http: for _ in range(MAX_REDIRECTS + 1): @@ -151,6 +171,15 @@ def load_url( if len(raw_bytes) > MAX_URL_BYTES: raise ValueError(f"URL response exceeds {MAX_URL_BYTES} byte limit: {url!r}") mime = response.headers.get("content-type", "application/octet-stream").split(";")[0].strip() + if mime == PDF_MIME: + text, pdf_title = _extract_pdf(raw_bytes, source=url) + return LoadedDoc( + source_uri=url, + title=pdf_title or _extract_title(text, fallback=url), + mime_type=mime, + text=text, + raw_bytes=raw_bytes, + ) if mime not in 
SUPPORTED_TEXT_MIMES and not mime.startswith("text/"): raise ValueError(f"Unsupported URL content-type for ingest: {mime} ({url})") text = raw_bytes.decode(response.encoding or "utf-8", errors="replace") @@ -163,6 +192,40 @@ def load_url( ) +def _extract_pdf(raw_bytes: bytes, *, source: str) -> tuple[str, str | None]: + """Extract (text, metadata /Title) from a PDF in a single parse. + + Pages are joined with blank lines, skipping empty ones. The metadata title + is surfaced because PDF corpora (credit memos, contracts) rarely contain + the markdown-style headings _extract_title scans for. + """ + try: + reader = PdfReader(io.BytesIO(raw_bytes)) + # Owner-password-locked PDFs with an empty user password (common for + # distributed contracts/memos) open with decrypt(""); only PDFs that + # truly require a password are refused. + if reader.is_encrypted and not reader.decrypt(""): + raise ValueError(f"Could not extract text from PDF (encrypted): {source}") + page_texts = (page.extract_text() for page in reader.pages) + text = "\n\n".join(page_text for page_text in page_texts if page_text.strip()) + meta = reader.metadata + title = (meta.title or "").strip() if meta is not None else "" + except ValueError: + raise + except Exception as exc: + # pypdf raises its own hierarchy (PdfReadError, PdfStreamError, ...); + # callers should see one stable, actionable error type instead. + raise ValueError(f"Could not extract text from PDF: {source} ({exc})") from exc + if not text: + # Silently ingesting an empty corpus would produce confident + # not_found verdicts downstream, so refuse loudly instead. + raise ValueError( + "Could not extract text from PDF (scanned/image-only? 
" + f"OCR is not supported): {source}" + ) + return text, title or None + + def sha256_bytes(data: bytes) -> str: return hashlib.sha256(data).hexdigest() diff --git a/tests/unit/test_loaders.py b/tests/unit/test_loaders.py index d9aeb3a..8fac756 100644 --- a/tests/unit/test_loaders.py +++ b/tests/unit/test_loaders.py @@ -2,11 +2,13 @@ from __future__ import annotations +import io import socket from pathlib import Path import httpx import pytest +from pypdf import PdfWriter from orc.ingest.loaders import load_file, load_url, sha256_bytes @@ -22,6 +24,134 @@ def _text_response(body: str) -> httpx.Response: return httpx.Response(200, headers={"content-type": "text/plain"}, text=body) +def _pdf_bytes(*page_texts: str | None, title: str | None = None) -> bytes: + """Hand-rolled minimal one-object-per-page PDF so tests need no binary fixture. + + Each entry in `page_texts` becomes one page; None produces a page with an + empty content stream, mimicking a scanned/image-only page that pypdf + extracts as "". Streams are uncompressed so the whole file stays tiny. 
+ """ + objects: list[bytes] = [] + + def add(body: str) -> int: + objects.append(body.encode("latin-1")) + return len(objects) + + catalog_num = add("<< /Type /Catalog /Pages 2 0 R >>") + pages_num = add("PLACEHOLDER") # patched below once page object numbers exist + font_num = add("<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>") + + kid_nums = [] + for text in page_texts: + stream = f"BT /F1 12 Tf 72 720 Td ({text}) Tj ET" if text is not None else "" + content_num = add(f"<< /Length {len(stream)} >>\nstream\n{stream}\nendstream") + kid_nums.append( + add( + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + f"/Resources << /Font << /F1 {font_num} 0 R >> >> " + f"/Contents {content_num} 0 R >>" + ) + ) + + kids = " ".join(f"{n} 0 R" for n in kid_nums) + objects[pages_num - 1] = ( + f"<< /Type /Pages /Kids [{kids}] /Count {len(kid_nums)} >>".encode("latin-1") + ) + info_num = add(f"<< /Title ({title}) >>") if title is not None else None + + out = io.BytesIO() + out.write(b"%PDF-1.4\n") + offsets = [] + for num, body in enumerate(objects, start=1): + offsets.append(out.tell()) + out.write(f"{num} 0 obj\n".encode("latin-1") + body + b"\nendobj\n") + xref_pos = out.tell() + out.write(f"xref\n0 {len(objects) + 1}\n".encode("latin-1")) + out.write(b"0000000000 65535 f \n") + for off in offsets: + out.write(f"{off:010d} 00000 n \n".encode("latin-1")) + trailer = f"<< /Size {len(objects) + 1} /Root {catalog_num} 0 R" + if info_num is not None: + trailer += f" /Info {info_num} 0 R" + trailer += " >>" + out.write(b"trailer\n" + trailer.encode("latin-1")) + out.write(f"\nstartxref\n{xref_pos}\n%%EOF\n".encode("latin-1")) + return out.getvalue() + + +def test_load_pdf_joins_pages_with_blank_line_and_skips_empty_pages( + tmp_path: Path, +) -> None: + p = tmp_path / "multi.pdf" + p.write_bytes(_pdf_bytes("Page one", None, "Page three")) + doc = load_file(p) + assert doc.text == "Page one\n\nPage three" + + +def 
test_load_pdf_with_no_extractable_text_raises_mentioning_ocr( + tmp_path: Path, +) -> None: + # A scanned/image-only PDF extracts as empty text. Ingesting it silently + # would yield an empty corpus and confident not_found verdicts downstream. + p = tmp_path / "scanned.pdf" + p.write_bytes(_pdf_bytes(None, None)) + with pytest.raises(ValueError, match="OCR"): + load_file(p) + + +def test_load_pdf_unparseable_raises_value_error(tmp_path: Path) -> None: + # pypdf internals (PdfStreamError etc.) must not leak to callers. + p = tmp_path / "corrupt.pdf" + p.write_bytes(b"%PDF-1.4\nthis is not really a pdf") + with pytest.raises(ValueError, match="Could not extract text from PDF"): + load_file(p) + + +def test_load_pdf_encrypted_raises_value_error(tmp_path: Path) -> None: + writer = PdfWriter() + writer.add_blank_page(width=612, height=792) + writer.encrypt("secret") + buf = io.BytesIO() + writer.write(buf) + p = tmp_path / "locked.pdf" + p.write_bytes(buf.getvalue()) + with pytest.raises(ValueError, match="Could not extract text from PDF"): + load_file(p) + + +def test_load_pdf_owner_locked_with_empty_user_password_ingests(tmp_path: Path) -> None: + # A large share of real-world contracts/credit memos are owner-password- + # locked but openable with an empty user password — pypdf decrypts them + # with decrypt(""). Rejecting those would fail ingest on exactly the + # document class PDF support targets. 
+ writer = PdfWriter(clone_from=io.BytesIO(_pdf_bytes("Owner locked body"))) + writer.encrypt(user_password="", owner_password="owner-secret") + buf = io.BytesIO() + writer.write(buf) + p = tmp_path / "owner-locked.pdf" + p.write_bytes(buf.getvalue()) + doc = load_file(p) + assert "Owner locked body" in doc.text + + +def test_load_pdf_prefers_metadata_title_over_fallback(tmp_path: Path) -> None: + p = tmp_path / "scan-target.pdf" + p.write_bytes(_pdf_bytes("Some body text", title="Q3 Credit Memo")) + doc = load_file(p) + assert doc.title == "Q3 Credit Memo" + + +def test_load_pdf_extracts_text_and_falls_back_to_stem_title(tmp_path: Path) -> None: + raw = _pdf_bytes("Hello orc PDF") + p = tmp_path / "credit-memo.pdf" + p.write_bytes(raw) + doc = load_file(p) + assert doc.mime_type == "application/pdf" + assert "Hello orc PDF" in doc.text + assert doc.title == "credit-memo" + assert doc.raw_bytes == raw + + def test_load_markdown_extracts_h1_title(tmp_path: Path) -> None: p = tmp_path / "doc.md" p.write_text("# My Title\n\nSome body content.\n") @@ -73,6 +203,27 @@ def test_load_url_refuses_ssrf_targets(url: str) -> None: load_url(url) +def test_load_url_pdf_content_type_extracts_text( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + socket, "getaddrinfo", lambda host, port, *a, **k: _addrinfo(PUBLIC_IP, port) + ) + raw = _pdf_bytes("Hello orc PDF") + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, headers={"content-type": "application/pdf"}, content=raw + ) + + doc = load_url("http://example.com/memo.pdf", transport=httpx.MockTransport(handler)) + + assert doc.mime_type == "application/pdf" + assert "Hello orc PDF" in doc.text + assert doc.raw_bytes == raw + assert doc.source_uri == "http://example.com/memo.pdf" + + def test_load_url_pins_connection_to_validated_ip_with_original_host_header( monkeypatch: pytest.MonkeyPatch, ) -> None: From fab4986eb642b8d07f8c4611048b113c8fa22f21 Mon Sep 17 00:00:00 
2001 From: Thormatt Date: Thu, 11 Jun 2026 22:36:56 -0400 Subject: [PATCH 2/4] feat(routing): real product domains for verify routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit domain= previously only accepted HaluBench dataset names (covidQA, halueval, DROP, ...) — benchmark artifacts promoted to the product API. No legal team has a claim from domain "halueval". DOMAIN_TO_MODE is now the product map: general/legal -> evidence, clinical/biomedical -> binary, financial -> arithmetic, numeric -> binary, each annotated with the benchmark family it generalizes (legal honestly marked as having no benchmark evidence yet). The six HaluBench source names moved unchanged to BENCHMARK_SOURCE_TO_MODE and still route as aliases, so published F1 numbers stay reproducible — the benchmark now imports that map directly and cannot drift when product domains evolve. CLI/MCP help and the verify_claim docstring teach the product domains instead of dataset names. Co-Authored-By: Claude Fable 5 --- benchmarks/faithfulness/run.py | 4 +- src/orc/cli_commands/verify.py | 2 +- src/orc/directives/research/routing.py | 64 ++++++++++++++----- .../research/skills/verify_claim.py | 4 +- src/orc/mcp/server.py | 4 +- tests/unit/test_routing.py | 55 +++++++++++++--- tests/unit/test_verify_claim_skill.py | 2 +- 7 files changed, 104 insertions(+), 31 deletions(-) diff --git a/benchmarks/faithfulness/run.py b/benchmarks/faithfulness/run.py index 59d18d7..e945cc7 100644 --- a/benchmarks/faithfulness/run.py +++ b/benchmarks/faithfulness/run.py @@ -184,7 +184,9 @@ def _run_lynx_style_one(item: dict[str, Any], orc_home: Path) -> ItemResult: # subsample. Prose-heavy sources where corpus citations help → evidence mode. # Single-passage numeric/extraction tasks → binary mode. Mixed natural-language # Q+A → judgment mode. 
-from orc.directives.research.routing import DOMAIN_TO_MODE as SOURCE_TO_MODE # noqa: E402 +from orc.directives.research.routing import ( # noqa: E402 + BENCHMARK_SOURCE_TO_MODE as SOURCE_TO_MODE, +) def _run_with_mode(item: dict[str, Any], orc_home: Path, mode: str) -> ItemResult: diff --git a/src/orc/cli_commands/verify.py b/src/orc/cli_commands/verify.py index 17f08bc..270838f 100644 --- a/src/orc/cli_commands/verify.py +++ b/src/orc/cli_commands/verify.py @@ -43,7 +43,7 @@ @click.option( "--domain", default=None, - help="Route mode by domain hint (e.g. 'pubmedQA', 'DROP', 'FinanceBench')", + help="Route mode by domain hint (e.g. 'financial', 'clinical', 'legal')", ) @click.option("--yes", "-y", is_flag=True, help="Skip the confirmation prompt for batch verify") @click.option("--json", "as_json", is_flag=True, help="Emit raw JSON instead of formatted output") diff --git a/src/orc/directives/research/routing.py b/src/orc/directives/research/routing.py index ecc8769..40f5072 100644 --- a/src/orc/directives/research/routing.py +++ b/src/orc/directives/research/routing.py @@ -1,10 +1,12 @@ """Domain → verify-mode routing. -Callers can pass `domain="pubmedQA"` (or any other registered domain) to -`verify_claim` and the runtime picks the best mode empirically — derived from -the per-source-ds F1 breakdown in the HaluBench benchmark. The benchmark's -`SOURCE_TO_MODE` is now a thin import from this dict so the runtime and the -benchmark routing can never drift. +Callers pass a product domain (`domain="clinical"`, `domain="financial"`, ...) +to `verify_claim` and the runtime picks the verify mode that performed best on +the benchmark family that domain generalizes — derived from the per-source-ds +F1 breakdown in the HaluBench benchmark. 
The HaluBench `source_ds` names stay +accepted as benchmark aliases (`BENCHMARK_SOURCE_TO_MODE`) so the published +benchmark numbers remain reproducible, but the product surface is the domain +map: dataset names are benchmark artifacts, not domains a customer has. In production this lives behind a workspace tag, a manifest hint, or an explicit `--domain` flag on the verify call. Unknown domains raise rather than @@ -18,12 +20,36 @@ class UnknownDomainError(OrcError): - """Raised when a caller passes a domain not present in DOMAIN_TO_MODE.""" + """Raised when a caller passes a domain that is neither a product domain + (DOMAIN_TO_MODE) nor a benchmark source alias (BENCHMARK_SOURCE_TO_MODE).""" -# Empirically derived from per-source-ds F1 on the HaluBench 504-item stratified -# subsample. See docs/benchmarks/results-2026-05-19-source-routed.md. +# Product domains. Each mode is derived from the benchmark family the domain +# generalizes — per-source-ds F1 on the HaluBench 504-item stratified +# subsample (docs/benchmarks/results-2026-05-19-source-routed.md). DOMAIN_TO_MODE: dict[str, str] = { + # RAGTruth / covidQA family: prose-heavy retrieval QA where chunk-level + # citations carry the verdict. + "general": "evidence", + # No benchmark evidence for legal yet. Evidence mode is the deliberate + # default because chunk-level citations matter most in legal review. + "legal": "evidence", + # pubmedQA family: yes/no verdicts over a single passage. + "clinical": "binary", + # Alias of clinical — same pubmedQA family. + "biomedical": "binary", + # FinanceBench family: claims that hinge on derived numbers. + "financial": "arithmetic", + # DROP family: reading comprehension over numeric/tabular passages where + # the answer is a single extracted or computed value. + "numeric": "binary", +} + +# HaluBench source_ds names, pinned exactly as published. 
The benchmark's +# SOURCE_TO_MODE imports this dict, so reproducibility of the published F1 +# numbers cannot drift as product domains evolve. Do not edit without a +# benchmark re-run (docs/benchmarks/results-2026-05-19-source-routed.md). +BENCHMARK_SOURCE_TO_MODE: dict[str, str] = { "covidQA": "evidence", "RAGTruth": "evidence", "halueval": "judgment", @@ -36,16 +62,20 @@ class UnknownDomainError(OrcError): def route_to_mode(domain: str | None) -> str | None: """Return the routed mode for `domain`, or None if `domain` is None. - Raises UnknownDomainError when `domain` is a string not in DOMAIN_TO_MODE. - Callers must validate at their surface; we don't silently fall through to - a default — that would mask config typos and make replay non-deterministic. + Product domains resolve first; HaluBench source_ds names are accepted as + benchmark aliases so existing callers and published numbers keep working. + Raises UnknownDomainError otherwise — we don't silently fall through to a + default; that would mask config typos and make replay non-deterministic. """ if domain is None: return None - try: + if domain in DOMAIN_TO_MODE: return DOMAIN_TO_MODE[domain] - except KeyError as exc: - known = sorted(DOMAIN_TO_MODE.keys()) - raise UnknownDomainError( - f"unknown domain {domain!r}; known: {known}" - ) from exc + if domain in BENCHMARK_SOURCE_TO_MODE: + return BENCHMARK_SOURCE_TO_MODE[domain] + domains = sorted(DOMAIN_TO_MODE) + aliases = sorted(BENCHMARK_SOURCE_TO_MODE) + raise UnknownDomainError( + f"unknown domain {domain!r}; domains: {domains} " + f"(benchmark source aliases also accepted: {aliases})" + ) diff --git a/src/orc/directives/research/skills/verify_claim.py b/src/orc/directives/research/skills/verify_claim.py index e38371e..985f566 100644 --- a/src/orc/directives/research/skills/verify_claim.py +++ b/src/orc/directives/research/skills/verify_claim.py @@ -279,7 +279,9 @@ def run( Mode selection: - explicit `mode=` always wins - - else `domain=` (e.g. 
"pubmedQA", "DROP") routes via DOMAIN_TO_MODE + - else `domain=` (e.g. "financial", "clinical", "legal") routes via + DOMAIN_TO_MODE; HaluBench source names remain as benchmark-only + aliases in BENCHMARK_SOURCE_TO_MODE - else default = "evidence" Modes: diff --git a/src/orc/mcp/server.py b/src/orc/mcp/server.py index c8b4245..c0fdcf2 100644 --- a/src/orc/mcp/server.py +++ b/src/orc/mcp/server.py @@ -121,8 +121,8 @@ def build_server() -> FastMCP: description=( "Verify a claim against the workspace's evidence corpus. " "Omit `workspace` to use ORC_DEFAULT_WORKSPACE (or the literal 'default' workspace). " - "Optionally pass `domain` (e.g. 'pubmedQA', 'DROP') to route to an empirically " - "best verify mode for that domain — see DOMAIN_TO_MODE in the runtime." + "Optionally pass `domain` (e.g. 'financial', 'clinical', 'legal') to route to " + "the empirically best verify mode for that domain — see DOMAIN_TO_MODE in the runtime." ) ) def orc_verify_claim( diff --git a/tests/unit/test_routing.py b/tests/unit/test_routing.py index b1fc698..b1b12d3 100644 --- a/tests/unit/test_routing.py +++ b/tests/unit/test_routing.py @@ -5,37 +5,76 @@ import pytest from orc.directives.research.routing import ( + BENCHMARK_SOURCE_TO_MODE, DOMAIN_TO_MODE, UnknownDomainError, route_to_mode, ) -def test_route_to_mode_returns_expected_modes_for_each_domain() -> None: +def test_benchmark_source_map_returns_expected_modes_for_each_source() -> None: # The mapping is the load-bearing piece: per-source-ds F1 in the # source-routed HaluBench result. If this changes without a benchmark # re-run, the public F1 claim drifts from reality. 
- assert DOMAIN_TO_MODE["covidQA"] == "evidence" - assert DOMAIN_TO_MODE["RAGTruth"] == "evidence" - assert DOMAIN_TO_MODE["halueval"] == "judgment" - assert DOMAIN_TO_MODE["pubmedQA"] == "binary" - assert DOMAIN_TO_MODE["FinanceBench"] == "arithmetic" - assert DOMAIN_TO_MODE["DROP"] == "binary" + assert BENCHMARK_SOURCE_TO_MODE["covidQA"] == "evidence" + assert BENCHMARK_SOURCE_TO_MODE["RAGTruth"] == "evidence" + assert BENCHMARK_SOURCE_TO_MODE["halueval"] == "judgment" + assert BENCHMARK_SOURCE_TO_MODE["pubmedQA"] == "binary" + assert BENCHMARK_SOURCE_TO_MODE["FinanceBench"] == "arithmetic" + assert BENCHMARK_SOURCE_TO_MODE["DROP"] == "binary" # Every value must be one of the modes verify_claim accepts. valid_modes = {"evidence", "judgment", "binary", "decomposed", "arithmetic"} + assert set(BENCHMARK_SOURCE_TO_MODE.values()) <= valid_modes assert set(DOMAIN_TO_MODE.values()) <= valid_modes +def test_benchmark_source_map_contains_exactly_the_six_halubench_sources() -> None: + """Published benchmark numbers were produced with exactly these six + source_ds names — extra or missing keys mean reproducibility drift.""" + assert set(BENCHMARK_SOURCE_TO_MODE) == { + "covidQA", + "RAGTruth", + "halueval", + "pubmedQA", + "FinanceBench", + "DROP", + } + + def test_route_to_mode_none_returns_none() -> None: """None in → None out so verify_claim can fall back to its default mode.""" assert route_to_mode(None) is None -def test_route_to_mode_known_domain_returns_mode() -> None: +def test_route_to_mode_benchmark_source_aliases_still_route() -> None: + """Dataset names predate the product domains; existing callers passing + them must keep routing identically.""" assert route_to_mode("pubmedQA") == "binary" assert route_to_mode("covidQA") == "evidence" +def test_route_to_mode_routes_each_product_domain() -> None: + """Product domains are the real surface — each routes to the mode derived + from the benchmark family it generalizes.""" + assert route_to_mode("general") == 
"evidence" + assert route_to_mode("legal") == "evidence" + assert route_to_mode("clinical") == "binary" + assert route_to_mode("biomedical") == "binary" + assert route_to_mode("financial") == "arithmetic" + assert route_to_mode("numeric") == "binary" + + +def test_route_to_mode_unknown_domain_message_lists_product_domains() -> None: + """The error must teach the product surface first; benchmark dataset + names are aliases and should be mentioned separately, not as peers.""" + with pytest.raises(UnknownDomainError) as excinfo: + route_to_mode("MadeUpDomain") + message = str(excinfo.value) + for product_domain in ("general", "legal", "clinical", "financial"): + assert product_domain in message + assert "alias" in message + + def test_route_to_mode_unknown_domain_raises() -> None: """Silent fall-through would mask config typos and break replay determinism.""" with pytest.raises(UnknownDomainError) as excinfo: diff --git a/tests/unit/test_verify_claim_skill.py b/tests/unit/test_verify_claim_skill.py index 26a969d..8c6f289 100644 --- a/tests/unit/test_verify_claim_skill.py +++ b/tests/unit/test_verify_claim_skill.py @@ -391,7 +391,7 @@ def test_verify_explicit_mode_wins_over_domain( ) run.close(output={}) - # DROP routes to binary in DOMAIN_TO_MODE; explicit mode="evidence" + # DROP routes to binary via the benchmark alias map; explicit mode="evidence" # must override that, so the record_verdict (evidence) tool was used. assert fake.calls[0]["tool_choice"] == {"type": "tool", "name": "record_verdict"} From 89ab5acad18e6c32fe20af1c904fc66e164dfc14 Mon Sep 17 00:00:00 2001 From: Thormatt Date: Thu, 11 Jun 2026 22:36:56 -0400 Subject: [PATCH 3/4] chore(release): package as orc-ai 0.2.0 with CI + publish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PyPI name "orc" is taken by an unrelated project; the distribution becomes orc-ai while the import package and CLI command stay orc. 
Version 0.2.0 covers the review-hardening fixes plus PDF ingestion and product domain routing (see CHANGELOG). Adds minimal CI (pytest + ruff on a 3.11-3.13 matrix) and a tag-triggered release workflow using PyPI Trusted Publishing — no token stored; the one-time publisher setup is documented in the workflow header. The release job refuses to publish when the tag doesn't match the pyproject version and runs the suite before building. README project-status section now attributes features to the right versions. Co-Authored-By: Claude Fable 5 --- .github/workflows/ci.yml | 30 +++++++++++++++++ .github/workflows/release.yml | 55 ++++++++++++++++++++++++++++++ CHANGELOG.md | 63 +++++++++++++++++++++++++++++------ README.md | 23 +++++++------ pyproject.toml | 6 ++-- src/orc/__init__.py | 2 +- uv.lock | 15 +++++++-- 7 files changed, 168 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a5e5fc5 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + # Matches requires-python >=3.11 and the advertised classifiers. + python-version: ["3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --extra dev + + - name: Run tests + run: uv run pytest -q + + - name: Lint + run: uv run ruff check src tests diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..af8bdd8 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,55 @@ +# Publishes to PyPI via Trusted Publishing (OIDC) — no API token is stored +# in this repo. 
One-time setup on PyPI before the first tagged release: +# +# 1. Create (or claim) the "orc-ai" project on https://pypi.org. +# 2. Under the project's Publishing settings, add a Trusted Publisher: +# owner: Thormatt +# repository: orc +# workflow: release.yml +# environment: pypi +# 3. In this GitHub repo, create an environment named "pypi" +# (Settings → Environments) — optionally with required reviewers. +# +# Then `git tag v0.2.0 && git push --tags` publishes automatically. +name: Release + +on: + push: + tags: ["v*"] + +jobs: + publish: + runs-on: ubuntu-latest + environment: pypi + permissions: + # Required for PyPI Trusted Publishing (OIDC token exchange). + id-token: write + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.12" + + - name: Check tag matches pyproject version + # Tagging v0.3.0 on a 0.2.0 pyproject would otherwise silently + # publish the wrong version. + run: | + PYPROJECT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])") + TAG_VERSION="${GITHUB_REF_NAME#v}" + if [ "$PYPROJECT_VERSION" != "$TAG_VERSION" ]; then + echo "Tag $GITHUB_REF_NAME does not match pyproject version $PYPROJECT_VERSION" >&2 + exit 1 + fi + + - name: Run tests + run: | + uv sync --extra dev + uv run pytest -q + + - name: Build sdist and wheel + run: uv build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 18e8df6..8a0fa97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,39 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Planned + +- `gads` directive (Google Ads agentic analysis: lens-based decomposition, + read-only MCP integration, evidence-bound recommendation verification). +- `orc eval consistency|perturb|retrieval|regression` reliability commands. 
+- Voyage-AI or local-`sentence-transformers` embeddings + hybrid retrieval (RRF over BM25 + vector). +- Hosted runtime (scheduled triggers, web dashboard, team workspaces). +- Decomposition + arithmetic combined for DROP-shaped multi-step claims. + +## [0.2.0] — 2026-06-11 + +First PyPI release. The distribution is named **`orc-ai`** — `orc` is taken on +PyPI by an unrelated project — but the import package (`import orc`) and the +CLI command (`orc`) are unchanged. + ### Added +- **PDF ingestion** — `orc ingest report.pdf` now works alongside markdown, + text, json, and URLs. Text is extracted page-by-page via `pypdf`, and the + PDF metadata title, when present, is preferred over the heading scan + (PDF corpora rarely carry markdown-style headings). (`src/orc/ingest/loaders.py`) +- **Product domain routing** — `--domain` / `domain=` on `verify_claim` takes + product domains (`general`, `legal`, `clinical`, `biomedical`, `financial`, + `numeric`), each mapped to the verify mode that scored best on the benchmark + family the domain generalizes (`legal` has no benchmark evidence yet and defaults to evidence mode). The HaluBench `source_ds` names stay accepted + as benchmark-only aliases (`BENCHMARK_SOURCE_TO_MODE`) so the published F1 + numbers remain reproducible, but dataset names are no longer the product + surface. Unknown domains still raise `UnknownDomainError`. + (`src/orc/directives/research/routing.py`) +- **CI + release workflows** — `.github/workflows/ci.yml` runs `pytest` + + `ruff` on pushes to `main` and on pull requests; `.github/workflows/release.yml` + builds sdist + wheel with uv on `v*` tags and publishes to PyPI via Trusted + Publishing (OIDC, no long-lived token in the repo). - **Isolated write paths (Phase 1)** — the effect plane that makes the Approval invariant enforceable rather than aspirational (see `docs/design/0001-isolated-write-paths.md`): @@ -36,6 +67,28 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). 
### Fixed (hardening) +- **SSRF guard hardened against DNS rebinding** — `load_url` now connects to + the exact IP it vetted (re-pinned on every redirect hop) instead of letting + the HTTP client re-resolve the hostname at request time, closing the + validate-then-connect TOCTOU window a low-TTL DNS record could exploit. A + `transport` injection seam keeps the loader testable without real sockets. + (`src/orc/ingest/loaders.py`) +- **Decomposed-mode negative voting** — atoms run in binary mode, which can + only say faithful or unfaithful; the negative vote now keys off `not_found` + and a negative net aggregates back to `not_found` instead of `contradicted` + — a distinction the atoms never actually made. + (`src/orc/directives/research/skills/verify_claim.py`) +- **Citation guard covers judgment mode** — judgment-mode verdicts pass + through the same hallucinated-chunk-ID filter and no-valid-grounding + downgrade as evidence mode, instead of shipping unguarded citations. +- **UTF-8-exact chunking** — chunk windows are computed at the byte level and + snapped forward to UTF-8 character starts, so a cl100k token boundary that + falls inside a multi-byte character (routine for CJK and emoji) can no + longer corrupt chunk text. (`src/orc/ingest/chunker.py`) +- **Offline guard covers the full credential surface** — the autouse test + fixture strips `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`, *and* + `ORC_PROVIDER`, so a developer's shell environment can't leak live LLM + calls into the default suite. (`tests/conftest.py`) - **Replay determinism** — LLM sampling is now pinned to `temperature=0` at the `messages_create` chokepoint, so `orc replay` re-issues the recorded decision rather than a fresh sample. (`src/orc/llm/client.py`) @@ -60,16 +113,6 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). - README invariants reworded to match what the code enforces (approval-queue isolation flagged as roadmap, not yet implemented). 
-### Planned - -- `gads` directive (Google Ads agentic analysis: lens-based decomposition, - read-only MCP integration, evidence-bound recommendation verification). -- `orc eval consistency|perturb|retrieval|regression` reliability commands. -- Voyage-AI or local-`sentence-transformers` embeddings + hybrid retrieval (RRF over BM25 + vector). -- PDF ingestion. -- Hosted runtime (scheduled triggers, web dashboard, team workspaces). -- Decomposition + arithmetic combined for DROP-shaped multi-step claims. - ## [0.1.4] — 2026-05-19 ### Added diff --git a/README.md b/README.md index 7860eac..6ea3db0 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,8 @@ Built for **research analysts, editorial teams, legal & compliance, agentic-work # Install uv pip install git+https://github.com/Thormatt/orc -# Or, once published to PyPI: -# uv pip install orc +# Or, once published to PyPI (the CLI command and import name stay `orc`): +# uv pip install orc-ai # Set up credentials (either of these works; OpenRouter takes priority if both set) export ANTHROPIC_API_KEY=sk-ant-... @@ -63,7 +63,7 @@ claude mcp add orc -- uv run --directory $(pwd) orc mcp serve ``` orc workspace create create a new workspace orc workspace list list workspaces -orc ingest [-w ] add evidence (md, txt, urls) +orc ingest [-w ] add evidence (md, txt, json, pdf, urls) orc search "" [-w ] BM25 retrieval, no LLM orc verify "" [-w ] verify a single claim orc verify --file extract + verify every claim in a draft @@ -111,7 +111,7 @@ A `.env` file in the repo root or at `$ORC_HOME/.env` is auto-loaded. Shell-expo ## Project status -`v0.1.4` — current. Faithfulness benchmark headline (HaluBench, stratified 504-item subsample, source-aware routing): +`v0.2.0` — current. 
Faithfulness benchmark headline (HaluBench, stratified 504-item subsample, source-aware routing; measured on v0.1.4, runtime unchanged since): | Metric | Score | |---|---:| @@ -122,13 +122,14 @@ A `.env` file in the repo root or at `$ORC_HOME/.env` is auto-loaded. Shell-expo > **0.864 is competitive with Patronus AI's Lynx-70B published home-court F1 of 0.85** — not a same-set head-to-head: orc's number comes from a stratified 504-item HaluBench subsample, with source-aware routing tuned on that same subsample, while Lynx reported on the full benchmark. It is achieved with a general-purpose Claude Sonnet 4.6 call (no fine-tuning) plus a safe arithmetic evaluator the model can invoke for numeric claims. Orc additionally produces chunk-level citations, deterministic replay against a frozen corpus snapshot, audit-export bundles that can be self-contained (`--include-evidence`), and a multi-approver gate for high-risk verdicts — artifacts the competitive set of post-hoc faithfulness judges does not produce. -What shipped in this version: +What shipped in v0.2.0: -- `domain=` parameter on `verify_claim` + `--domain` CLI flag → source-aware routing is a real product feature, not a benchmark variant. -- `--include-evidence` flag on `orc audit export` → optional self-contained bundles (workspace DB + evidence files included) for offline regulator handoff. -- `mode="arithmetic"` for numeric claims — multi-turn LLM loop with a safe AST-walking calculator. FinanceBench F1 climbed 0.736 → 0.916. -- Citation guard: an evidence-mode verdict can no longer ship as `supported` with zero valid citations (downgraded to `not_found` and the dropped IDs land in the trace). -- Self-hosting any open-weight 70B judge: the runtime is model-agnostic — pass `model="llama-3.3-70b-instruct"` (or even Lynx itself) at any compatible endpoint and every artifact above is unchanged. 
+- **PDF ingestion** — `orc ingest report.pdf` (and PDF URLs) extracts text via pypdf, with metadata titles, owner-locked-PDF handling, and loud rejection of scanned/image-only files (OCR not yet supported). +- **Product domain routing** — `domain=` now takes real domains (`general`, `legal`, `clinical`, `biomedical`, `financial`, `numeric`); the HaluBench source names stay accepted as benchmark-only aliases so published numbers remain reproducible. +- **Hardening from a full code review** — SSRF guard now pins the validated IP against DNS rebinding, decomposed mode can vote against a claim, the citation guard covers judgment mode, chunking is UTF-8-exact for CJK/emoji corpora. +- **PyPI packaging as `orc-ai`** (the name `orc` was taken; CLI command and import name remain `orc`), plus CI and tag-triggered release workflows. + +Shipped earlier in v0.1.4: `--include-evidence` self-contained audit bundles, `mode="arithmetic"` with a safe AST-walking calculator (FinanceBench F1 0.736 → 0.916), the evidence-mode citation guard, and model-agnostic self-hosting of any open-weight judge. Live walkthrough: **[pagenta.app/p/thorm/orc-how-it-works](https://pagenta.app/p/thorm/orc-how-it-works)** — six-scene visual explainer. Full pitch: **[pagenta.app/p/thorm/orc-pitch](https://pagenta.app/p/thorm/orc-pitch)**. 
@@ -153,7 +154,7 @@ Live LLM tests are gated behind `ORC_TEST_ALLOW_LIVE_LLM=1` and require a real A ## Roadmap - Embedding-based retrieval (hybrid BM25 + vector via `sqlite-vec`) -- PDF ingestion +- OCR for scanned/image-only PDFs - Long-running directives (scheduled triggers, cloud execution) - `marketing` directive (assisted-only at first, autonomous behind approval gates later) - `legal` / `gads` / `code-review` directives — same runtime, new skill packages diff --git a/pyproject.toml b/pyproject.toml index ba2cf83..60f1d39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,9 +2,11 @@ requires = ["hatchling"] build-backend = "hatchling.build" +# Distribution name is "orc-ai" — "orc" is taken on PyPI by an unrelated +# project. The import package stays `orc` and the CLI command stays `orc`. [project] -name = "orc" -version = "0.1.4" +name = "orc-ai" +version = "0.2.0" description = "The verification runtime for AI that has to be defensible. Evidence-bound claim verification, structured citations, trace + replay, MCP-ready CLI." readme = "README.md" requires-python = ">=3.11" diff --git a/src/orc/__init__.py b/src/orc/__init__.py index bbab024..d3ec452 100644 --- a/src/orc/__init__.py +++ b/src/orc/__init__.py @@ -1 +1 @@ -__version__ = "0.1.4" +__version__ = "0.2.0" diff --git a/uv.lock b/uv.lock index c411c2e..9b48075 100644 --- a/uv.lock +++ b/uv.lock @@ -1442,8 +1442,8 @@ wheels = [ ] [[package]] -name = "orc" -version = "0.1.4" +name = "orc-ai" +version = "0.2.0" source = { editable = "." 
} dependencies = [ { name = "anthropic" }, @@ -1452,6 +1452,7 @@ dependencies = [ { name = "markdown-it-py" }, { name = "mcp" }, { name = "pydantic" }, + { name = "pypdf" }, { name = "python-dotenv" }, { name = "python-ulid" }, { name = "pyyaml" }, @@ -1483,6 +1484,7 @@ requires-dist = [ { name = "markdown-it-py", specifier = ">=3.0" }, { name = "mcp", specifier = ">=1.0" }, { name = "pydantic", specifier = ">=2.7" }, + { name = "pypdf", specifier = ">=4.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" }, { name = "python-dotenv", specifier = ">=1.0" }, @@ -1898,6 +1900,15 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pypdf" +version = "6.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/99/0a/48fe05c6bb3aa4bb4d2a4079a383d33c0dfec1edf613a642f07d8b8b5c2e/pypdf-6.13.2.tar.gz", hash = "sha256:5a96a17dbdfbf9c2ab24c0a13fa0aba182be22ba6f283098712c16fc242f509f", size = 6479250 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/17/378943705992f74e451a06de3401ce68e3213763c81e44d0614559c45599/pypdf-6.13.2-py3-none-any.whl", hash = "sha256:6eeb9e57693f29d41bd01255d02660cbbb41fd7fc818a982677389a35e4f2083", size = 346555 }, +] + [[package]] name = "pytest" version = "9.0.3" From 83df3e0dc764edd2765390523ea48e6421f62182 Mon Sep 17 00:00:00 2001 From: Thormatt Date: Fri, 12 Jun 2026 10:41:20 -0400 Subject: [PATCH 4/4] fix(extract): retry truncated extraction, never pass empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Found live: a claim-dense draft hit extract_claims' 1024-token cap, the forced tool call came back cut off (stop_reason=max_tokens), and the partial input parsed as zero claims — so verify --file reported "No claims extracted" and the caller's gate passed vacuously. 
Truncation now escalates the budget (1024 -> 4096 -> 16384), each attempt is recorded in the trace with its stop_reason and a truncated flag, and extraction that still truncates at the ceiling raises rather than returning a partial claim list that would let unextracted claims bypass verification silently. Co-Authored-By: Claude Fable 5 --- .../research/skills/extract_claims.py | 111 ++++++++++-------- tests/unit/test_extract_and_research.py | 61 ++++++++++ 2 files changed, 125 insertions(+), 47 deletions(-) diff --git a/src/orc/directives/research/skills/extract_claims.py b/src/orc/directives/research/skills/extract_claims.py index 4e507b9..3c16c58 100644 --- a/src/orc/directives/research/skills/extract_claims.py +++ b/src/orc/directives/research/skills/extract_claims.py @@ -68,55 +68,72 @@ def run( anthropic_client = client or get_client() provider_model = resolve_model_for_provider(resolved_model) - start = time.monotonic() - response = messages_create( - anthropic_client, - model=provider_model, - max_tokens=max_tokens, - system=_load_system_prompt(), - tools=[EXTRACT_CLAIMS_TOOL_SCHEMA], - tool_choice={"type": "tool", "name": "record_claims"}, - messages=[{"role": "user", "content": f"\n{document}\n"}], - ) - elapsed_ms = int((time.monotonic() - start) * 1000) - - tool_use = next( - ( - b - for b in response.content - if getattr(b, "type", None) == "tool_use" - and getattr(b, "name", None) == "record_claims" - ), - None, - ) - if tool_use is None: - raise RuntimeError( - "LLM did not call record_claims; " - f"stop_reason={getattr(response, 'stop_reason', None)!r}" + # A response cut off by max_tokens parses as a partial (often empty) + # claim list, which downstream callers would treat as "nothing to + # verify" — a vacuous pass of the verification gate. Escalate the + # budget on truncation and fail loudly if the ceiling still truncates. 
+ budgets = [max_tokens, max_tokens * 4, max_tokens * 16] + for attempt, budget in enumerate(budgets): + start = time.monotonic() + response = messages_create( + anthropic_client, + model=provider_model, + max_tokens=budget, + system=_load_system_prompt(), + tools=[EXTRACT_CLAIMS_TOOL_SCHEMA], + tool_choice={"type": "tool", "name": "record_claims"}, + messages=[ + {"role": "user", "content": f"\n{document}\n"} + ], ) - claims = list(tool_use.input.get("claims", [])) - - usage = response.usage - run.record_llm_call( - call_id=new_id(), - model=resolved_model, - request={ - "tool_name": "record_claims", - "max_tokens": max_tokens, - "document_chars": len(document), - }, - response={ - "stop_reason": getattr(response, "stop_reason", None), - "claim_count": len(claims), - }, - input_tokens=getattr(usage, "input_tokens", 0) or 0, - output_tokens=getattr(usage, "output_tokens", 0) or 0, - cache_read_input_tokens=getattr(usage, "cache_read_input_tokens", 0) or 0, - cache_creation_input_tokens=getattr(usage, "cache_creation_input_tokens", 0) or 0, - elapsed_ms=elapsed_ms, - ) + elapsed_ms = int((time.monotonic() - start) * 1000) + stop_reason = getattr(response, "stop_reason", None) + truncated = stop_reason == "max_tokens" + + tool_use = next( + ( + b + for b in response.content + if getattr(b, "type", None) == "tool_use" + and getattr(b, "name", None) == "record_claims" + ), + None, + ) + if tool_use is None and not truncated: + raise RuntimeError( + f"LLM did not call record_claims; stop_reason={stop_reason!r}" + ) + claims = list(tool_use.input.get("claims", [])) if tool_use else [] + + usage = response.usage + run.record_llm_call( + call_id=new_id(), + model=resolved_model, + request={ + "tool_name": "record_claims", + "max_tokens": budget, + "attempt": attempt, + "document_chars": len(document), + }, + response={ + "stop_reason": stop_reason, + "claim_count": len(claims), + "truncated": truncated, + }, + input_tokens=getattr(usage, "input_tokens", 0) or 0, + 
output_tokens=getattr(usage, "output_tokens", 0) or 0, + cache_read_input_tokens=getattr(usage, "cache_read_input_tokens", 0) or 0, + cache_creation_input_tokens=getattr(usage, "cache_creation_input_tokens", 0) + or 0, + elapsed_ms=elapsed_ms, + ) + if not truncated: + return {"claims": claims, "model": resolved_model} - return {"claims": claims, "model": resolved_model} + raise RuntimeError( + f"claim extraction truncated even at max_tokens={budgets[-1]}; " + "refusing to return a partial claim list" + ) extract_claims = _ExtractClaims() diff --git a/tests/unit/test_extract_and_research.py b/tests/unit/test_extract_and_research.py index 6acd832..54497f6 100644 --- a/tests/unit/test_extract_and_research.py +++ b/tests/unit/test_extract_and_research.py @@ -230,3 +230,64 @@ def test_cli_verify_requires_input(orc_home: Path, tmp_path: Path) -> None: result = runner.invoke(main, ["verify", "--workspace", "demo"]) assert result.exit_code != 0 assert "Provide" in result.output or "Provide" in str(result.exception) + + +def _claims_response(n_claims: int, stop_reason: str) -> FakeResponse: + return FakeResponse( + content=[ + FakeContentBlock( + type="tool_use", + name="record_claims", + input={"claims": [{"text": f"claim {i}"} for i in range(n_claims)]}, + ) + ], + stop_reason=stop_reason, + ) + + +def test_extract_claims_retries_with_bigger_budget_when_truncated( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """stop_reason=max_tokens means the tool call was cut off mid-emission — + the observed failure was a truncated call parsing as zero claims and the + gate passing vacuously. 
Truncation must trigger a retry, not an empty OK.""" + name = _seed(orc_home, tmp_path) + fake = FakeAnthropic( + responses=[ + _claims_response(0, stop_reason="max_tokens"), + _claims_response(2, stop_reason="tool_use"), + ] + ) + monkeypatch.setattr(client_module, "_client", fake) + monkeypatch.setattr(client_module, "_factory", None) + + ws = ws_module.resolve(name) + skill = directives.get("research").skills["extract_claims"] + with open_run(ws, directive="research", skill="extract_claims", inputs={}) as run: + result = skill.run(workspace=ws, run=run, document="Some long document.") + run.close(output=result) + + assert len(result["claims"]) == 2 + assert len(fake.calls) == 2 + # Retry must actually raise the budget, not replay the same request. + assert fake.calls[1]["max_tokens"] > fake.calls[0]["max_tokens"] + + +def test_extract_claims_raises_when_truncated_at_final_budget( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """If every escalation still truncates, fail loudly: a partial claim list + would let unextracted claims bypass verification silently.""" + name = _seed(orc_home, tmp_path) + fake = FakeAnthropic( + responder=lambda kwargs: _claims_response(1, stop_reason="max_tokens") + ) + monkeypatch.setattr(client_module, "_client", fake) + monkeypatch.setattr(client_module, "_factory", None) + + ws = ws_module.resolve(name) + skill = directives.get("research").skills["extract_claims"] + with open_run(ws, directive="research", skill="extract_claims", inputs={}) as run: + with pytest.raises(RuntimeError, match="truncated"): + skill.run(workspace=ws, run=run, document="Some long document.") + run.close(output={})