diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..a5e5fc5
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,34 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+# Least privilege: the test and lint steps only need to read the repository.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Matches requires-python >=3.11 and the advertised classifiers.
+        python-version: ["3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: uv sync --extra dev
+
+      - name: Run tests
+        run: uv run pytest -q
+
+      - name: Lint
+        run: uv run ruff check src tests
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..af8bdd8
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,58 @@
+# Publishes to PyPI via Trusted Publishing (OIDC) — no API token is stored
+# in this repo. One-time setup on PyPI before the first tagged release:
+#
+#   1. Create (or claim) the "orc-ai" project on https://pypi.org.
+#   2. Under the project's Publishing settings, add a Trusted Publisher:
+#        owner: Thormatt
+#        repository: orc
+#        workflow: release.yml
+#        environment: pypi
+#   3. In this GitHub repo, create an environment named "pypi"
+#      (Settings → Environments) — optionally with required reviewers.
+#
+# Then `git tag v0.2.0 && git push --tags` publishes automatically.
+name: Release
+
+on:
+  push:
+    tags: ["v*"]
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      # Declaring any permission sets every unlisted scope to "none", so the
+      # checkout step needs contents: read spelled out explicitly.
+      contents: read
+      # Required for PyPI Trusted Publishing (OIDC token exchange).
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.12"
+
+      - name: Check tag matches pyproject version
+        # Tagging v0.3.0 on a 0.2.0 pyproject would otherwise silently
+        # publish the wrong version.
+ run: | + PYPROJECT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])") + TAG_VERSION="${GITHUB_REF_NAME#v}" + if [ "$PYPROJECT_VERSION" != "$TAG_VERSION" ]; then + echo "Tag $GITHUB_REF_NAME does not match pyproject version $PYPROJECT_VERSION" >&2 + exit 1 + fi + + - name: Run tests + run: | + uv sync --extra dev + uv run pytest -q + + - name: Build sdist and wheel + run: uv build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 18e8df6..8a0fa97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,39 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Planned + +- `gads` directive (Google Ads agentic analysis: lens-based decomposition, + read-only MCP integration, evidence-bound recommendation verification). +- `orc eval consistency|perturb|retrieval|regression` reliability commands. +- Voyage-AI or local-`sentence-transformers` embeddings + hybrid retrieval (RRF over BM25 + vector). +- Hosted runtime (scheduled triggers, web dashboard, team workspaces). +- Decomposition + arithmetic combined for DROP-shaped multi-step claims. + +## [0.2.0] — 2026-06-11 + +First PyPI release. The distribution is named **`orc-ai`** — `orc` is taken on +PyPI by an unrelated project — but the import package (`import orc`) and the +CLI command (`orc`) are unchanged. + ### Added +- **PDF ingestion** — `orc ingest report.pdf` now works alongside markdown, + text, json, and URLs. Text is extracted page-by-page via `pypdf`, and the + PDF metadata title is used when the body carries no markdown-style heading + (typical for credit memos and contracts). 
(`src/orc/ingest/loaders.py`) +- **Product domain routing** — `--domain` / `domain=` on `verify_claim` takes + product domains (`general`, `legal`, `clinical`, `biomedical`, `financial`, + `numeric`), each mapped to the verify mode that scored best on the benchmark + family the domain generalizes. The HaluBench `source_ds` names stay accepted + as benchmark-only aliases (`BENCHMARK_SOURCE_TO_MODE`) so the published F1 + numbers remain reproducible, but dataset names are no longer the product + surface. Unknown domains still raise `UnknownDomainError`. + (`src/orc/directives/research/routing.py`) +- **CI + release workflows** — `.github/workflows/ci.yml` runs `pytest` + + `ruff` on pushes to `main` and on pull requests; `.github/workflows/release.yml` + builds sdist + wheel with uv on `v*` tags and publishes to PyPI via Trusted + Publishing (OIDC, no long-lived token in the repo). - **Isolated write paths (Phase 1)** — the effect plane that makes the Approval invariant enforceable rather than aspirational (see `docs/design/0001-isolated-write-paths.md`): @@ -36,6 +67,28 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). ### Fixed (hardening) +- **SSRF guard hardened against DNS rebinding** — `load_url` now connects to + the exact IP it vetted (re-pinned on every redirect hop) instead of letting + the HTTP client re-resolve the hostname at request time, closing the + validate-then-connect TOCTOU window a low-TTL DNS record could exploit. A + `transport` injection seam keeps the loader testable without real sockets. + (`src/orc/ingest/loaders.py`) +- **Decomposed-mode negative voting** — atoms run in binary mode, which can + only say faithful or unfaithful; the negative vote now keys off `not_found` + and a negative net aggregates back to `not_found` instead of `contradicted` + — a distinction the atoms never actually made. 
+ (`src/orc/directives/research/skills/verify_claim.py`) +- **Citation guard covers judgment mode** — judgment-mode verdicts pass + through the same hallucinated-chunk-ID filter and no-valid-grounding + downgrade as evidence mode, instead of shipping unguarded citations. +- **UTF-8-exact chunking** — chunk windows are computed at the byte level and + snapped forward to UTF-8 character starts, so a cl100k token boundary that + falls inside a multi-byte character (routine for CJK and emoji) can no + longer corrupt chunk text. (`src/orc/ingest/chunker.py`) +- **Offline guard covers the full credential surface** — the autouse test + fixture strips `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`, *and* + `ORC_PROVIDER`, so a developer's shell environment can't leak live LLM + calls into the default suite. (`tests/conftest.py`) - **Replay determinism** — LLM sampling is now pinned to `temperature=0` at the `messages_create` chokepoint, so `orc replay` re-issues the recorded decision rather than a fresh sample. (`src/orc/llm/client.py`) @@ -60,16 +113,6 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). - README invariants reworded to match what the code enforces (approval-queue isolation flagged as roadmap, not yet implemented). -### Planned - -- `gads` directive (Google Ads agentic analysis: lens-based decomposition, - read-only MCP integration, evidence-bound recommendation verification). -- `orc eval consistency|perturb|retrieval|regression` reliability commands. -- Voyage-AI or local-`sentence-transformers` embeddings + hybrid retrieval (RRF over BM25 + vector). -- PDF ingestion. -- Hosted runtime (scheduled triggers, web dashboard, team workspaces). -- Decomposition + arithmetic combined for DROP-shaped multi-step claims. 
- ## [0.1.4] — 2026-05-19 ### Added diff --git a/README.md b/README.md index 7860eac..6ea3db0 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,8 @@ Built for **research analysts, editorial teams, legal & compliance, agentic-work # Install uv pip install git+https://github.com/Thormatt/orc -# Or, once published to PyPI: -# uv pip install orc +# Or, once published to PyPI (the CLI command and import name stay `orc`): +# uv pip install orc-ai # Set up credentials (either of these works; OpenRouter takes priority if both set) export ANTHROPIC_API_KEY=sk-ant-... @@ -63,7 +63,7 @@ claude mcp add orc -- uv run --directory $(pwd) orc mcp serve ``` orc workspace create create a new workspace orc workspace list list workspaces -orc ingest [-w ] add evidence (md, txt, urls) +orc ingest [-w ] add evidence (md, txt, json, pdf, urls) orc search "" [-w ] BM25 retrieval, no LLM orc verify "" [-w ] verify a single claim orc verify --file extract + verify every claim in a draft @@ -111,7 +111,7 @@ A `.env` file in the repo root or at `$ORC_HOME/.env` is auto-loaded. Shell-expo ## Project status -`v0.1.4` — current. Faithfulness benchmark headline (HaluBench, stratified 504-item subsample, source-aware routing): +`v0.2.0` — current. Faithfulness benchmark headline (HaluBench, stratified 504-item subsample, source-aware routing; measured on v0.1.4 and not re-run for v0.2.0, which changes decomposed-mode voting and judgment-mode citation guarding): | Metric | Score | |---|---:| @@ -122,13 +122,14 @@ A `.env` file in the repo root or at `$ORC_HOME/.env` is auto-loaded. Shell-expo > **0.864 is competitive with Patronus AI's Lynx-70B published home-court F1 of 0.85** — not a same-set head-to-head: orc's number comes from a stratified 504-item HaluBench subsample, with source-aware routing tuned on that same subsample, while Lynx reported on the full benchmark. It is achieved with a general-purpose Claude Sonnet 4.6 call (no fine-tuning) plus a safe arithmetic evaluator the model can invoke for numeric claims. 
Orc additionally produces chunk-level citations, deterministic replay against a frozen corpus snapshot, audit-export bundles that can be self-contained (`--include-evidence`), and a multi-approver gate for high-risk verdicts — artifacts the competitive set of post-hoc faithfulness judges does not produce. -What shipped in this version: +What shipped in v0.2.0: -- `domain=` parameter on `verify_claim` + `--domain` CLI flag → source-aware routing is a real product feature, not a benchmark variant. -- `--include-evidence` flag on `orc audit export` → optional self-contained bundles (workspace DB + evidence files included) for offline regulator handoff. -- `mode="arithmetic"` for numeric claims — multi-turn LLM loop with a safe AST-walking calculator. FinanceBench F1 climbed 0.736 → 0.916. -- Citation guard: an evidence-mode verdict can no longer ship as `supported` with zero valid citations (downgraded to `not_found` and the dropped IDs land in the trace). -- Self-hosting any open-weight 70B judge: the runtime is model-agnostic — pass `model="llama-3.3-70b-instruct"` (or even Lynx itself) at any compatible endpoint and every artifact above is unchanged. +- **PDF ingestion** — `orc ingest report.pdf` (and PDF URLs) extracts text via pypdf, with metadata titles, owner-locked-PDF handling, and loud rejection of scanned/image-only files (OCR not yet supported). +- **Product domain routing** — `domain=` now takes real domains (`general`, `legal`, `clinical`, `biomedical`, `financial`, `numeric`); the HaluBench source names stay accepted as benchmark-only aliases so published numbers remain reproducible. +- **Hardening from a full code review** — SSRF guard now pins the validated IP against DNS rebinding, decomposed mode can vote against a claim, the citation guard covers judgment mode, chunking is UTF-8-exact for CJK/emoji corpora. 
+- **PyPI packaging as `orc-ai`** (the name `orc` was taken; CLI command and import name remain `orc`), plus CI and tag-triggered release workflows. + +Shipped earlier in v0.1.4: `--include-evidence` self-contained audit bundles, `mode="arithmetic"` with a safe AST-walking calculator (FinanceBench F1 0.736 → 0.916), the evidence-mode citation guard, and model-agnostic self-hosting of any open-weight judge. Live walkthrough: **[pagenta.app/p/thorm/orc-how-it-works](https://pagenta.app/p/thorm/orc-how-it-works)** — six-scene visual explainer. Full pitch: **[pagenta.app/p/thorm/orc-pitch](https://pagenta.app/p/thorm/orc-pitch)**. @@ -153,7 +154,7 @@ Live LLM tests are gated behind `ORC_TEST_ALLOW_LIVE_LLM=1` and require a real A ## Roadmap - Embedding-based retrieval (hybrid BM25 + vector via `sqlite-vec`) -- PDF ingestion +- OCR for scanned/image-only PDFs - Long-running directives (scheduled triggers, cloud execution) - `marketing` directive (assisted-only at first, autonomous behind approval gates later) - `legal` / `gads` / `code-review` directives — same runtime, new skill packages diff --git a/benchmarks/faithfulness/run.py b/benchmarks/faithfulness/run.py index 59d18d7..e945cc7 100644 --- a/benchmarks/faithfulness/run.py +++ b/benchmarks/faithfulness/run.py @@ -184,7 +184,9 @@ def _run_lynx_style_one(item: dict[str, Any], orc_home: Path) -> ItemResult: # subsample. Prose-heavy sources where corpus citations help → evidence mode. # Single-passage numeric/extraction tasks → binary mode. Mixed natural-language # Q+A → judgment mode. 
-from orc.directives.research.routing import DOMAIN_TO_MODE as SOURCE_TO_MODE # noqa: E402 +from orc.directives.research.routing import ( # noqa: E402 + BENCHMARK_SOURCE_TO_MODE as SOURCE_TO_MODE, +) def _run_with_mode(item: dict[str, Any], orc_home: Path, mode: str) -> ItemResult: diff --git a/pyproject.toml b/pyproject.toml index 91b101b..60f1d39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,9 +2,11 @@ requires = ["hatchling"] build-backend = "hatchling.build" +# Distribution name is "orc-ai" — "orc" is taken on PyPI by an unrelated +# project. The import package stays `orc` and the CLI command stays `orc`. [project] -name = "orc" -version = "0.1.4" +name = "orc-ai" +version = "0.2.0" description = "The verification runtime for AI that has to be defensible. Evidence-bound claim verification, structured citations, trace + replay, MCP-ready CLI." readme = "README.md" requires-python = ">=3.11" @@ -45,6 +47,7 @@ dependencies = [ "rich>=13.0", "python-ulid>=2.0", "python-dotenv>=1.0", + "pypdf>=4.0", ] [project.optional-dependencies] diff --git a/src/orc/__init__.py b/src/orc/__init__.py index bbab024..d3ec452 100644 --- a/src/orc/__init__.py +++ b/src/orc/__init__.py @@ -1 +1 @@ -__version__ = "0.1.4" +__version__ = "0.2.0" diff --git a/src/orc/cli_commands/verify.py b/src/orc/cli_commands/verify.py index 17f08bc..270838f 100644 --- a/src/orc/cli_commands/verify.py +++ b/src/orc/cli_commands/verify.py @@ -43,7 +43,7 @@ @click.option( "--domain", default=None, - help="Route mode by domain hint (e.g. 'pubmedQA', 'DROP', 'FinanceBench')", + help="Route mode by domain hint (e.g. 
'financial', 'clinical', 'legal')", ) @click.option("--yes", "-y", is_flag=True, help="Skip the confirmation prompt for batch verify") @click.option("--json", "as_json", is_flag=True, help="Emit raw JSON instead of formatted output") diff --git a/src/orc/directives/research/routing.py b/src/orc/directives/research/routing.py index ecc8769..40f5072 100644 --- a/src/orc/directives/research/routing.py +++ b/src/orc/directives/research/routing.py @@ -1,10 +1,12 @@ """Domain → verify-mode routing. -Callers can pass `domain="pubmedQA"` (or any other registered domain) to -`verify_claim` and the runtime picks the best mode empirically — derived from -the per-source-ds F1 breakdown in the HaluBench benchmark. The benchmark's -`SOURCE_TO_MODE` is now a thin import from this dict so the runtime and the -benchmark routing can never drift. +Callers pass a product domain (`domain="clinical"`, `domain="financial"`, ...) +to `verify_claim` and the runtime picks the verify mode that performed best on +the benchmark family that domain generalizes — derived from the per-source-ds +F1 breakdown in the HaluBench benchmark. The HaluBench `source_ds` names stay +accepted as benchmark aliases (`BENCHMARK_SOURCE_TO_MODE`) so the published +benchmark numbers remain reproducible, but the product surface is the domain +map: dataset names are benchmark artifacts, not domains a customer has. In production this lives behind a workspace tag, a manifest hint, or an explicit `--domain` flag on the verify call. Unknown domains raise rather than @@ -18,12 +20,36 @@ class UnknownDomainError(OrcError): - """Raised when a caller passes a domain not present in DOMAIN_TO_MODE.""" + """Raised when a caller passes a domain that is neither a product domain + (DOMAIN_TO_MODE) nor a benchmark source alias (BENCHMARK_SOURCE_TO_MODE).""" -# Empirically derived from per-source-ds F1 on the HaluBench 504-item stratified -# subsample. See docs/benchmarks/results-2026-05-19-source-routed.md. +# Product domains. 
Each mode is derived from the benchmark family the domain +# generalizes — per-source-ds F1 on the HaluBench 504-item stratified +# subsample (docs/benchmarks/results-2026-05-19-source-routed.md). DOMAIN_TO_MODE: dict[str, str] = { + # RAGTruth / covidQA family: prose-heavy retrieval QA where chunk-level + # citations carry the verdict. + "general": "evidence", + # No benchmark evidence for legal yet. Evidence mode is the deliberate + # default because chunk-level citations matter most in legal review. + "legal": "evidence", + # pubmedQA family: yes/no verdicts over a single passage. + "clinical": "binary", + # Alias of clinical — same pubmedQA family. + "biomedical": "binary", + # FinanceBench family: claims that hinge on derived numbers. + "financial": "arithmetic", + # DROP family: reading comprehension over numeric/tabular passages where + # the answer is a single extracted or computed value. + "numeric": "binary", +} + +# HaluBench source_ds names, pinned exactly as published. The benchmark's +# SOURCE_TO_MODE imports this dict, so reproducibility of the published F1 +# numbers cannot drift as product domains evolve. Do not edit without a +# benchmark re-run (docs/benchmarks/results-2026-05-19-source-routed.md). +BENCHMARK_SOURCE_TO_MODE: dict[str, str] = { "covidQA": "evidence", "RAGTruth": "evidence", "halueval": "judgment", @@ -36,16 +62,20 @@ class UnknownDomainError(OrcError): def route_to_mode(domain: str | None) -> str | None: """Return the routed mode for `domain`, or None if `domain` is None. - Raises UnknownDomainError when `domain` is a string not in DOMAIN_TO_MODE. - Callers must validate at their surface; we don't silently fall through to - a default — that would mask config typos and make replay non-deterministic. + Product domains resolve first; HaluBench source_ds names are accepted as + benchmark aliases so existing callers and published numbers keep working. 
+ Raises UnknownDomainError otherwise — we don't silently fall through to a + default; that would mask config typos and make replay non-deterministic. """ if domain is None: return None - try: + if domain in DOMAIN_TO_MODE: return DOMAIN_TO_MODE[domain] - except KeyError as exc: - known = sorted(DOMAIN_TO_MODE.keys()) - raise UnknownDomainError( - f"unknown domain {domain!r}; known: {known}" - ) from exc + if domain in BENCHMARK_SOURCE_TO_MODE: + return BENCHMARK_SOURCE_TO_MODE[domain] + domains = sorted(DOMAIN_TO_MODE) + aliases = sorted(BENCHMARK_SOURCE_TO_MODE) + raise UnknownDomainError( + f"unknown domain {domain!r}; domains: {domains} " + f"(benchmark source aliases also accepted: {aliases})" + ) diff --git a/src/orc/directives/research/skills/extract_claims.py b/src/orc/directives/research/skills/extract_claims.py index 4e507b9..3c16c58 100644 --- a/src/orc/directives/research/skills/extract_claims.py +++ b/src/orc/directives/research/skills/extract_claims.py @@ -68,55 +68,72 @@ def run( anthropic_client = client or get_client() provider_model = resolve_model_for_provider(resolved_model) - start = time.monotonic() - response = messages_create( - anthropic_client, - model=provider_model, - max_tokens=max_tokens, - system=_load_system_prompt(), - tools=[EXTRACT_CLAIMS_TOOL_SCHEMA], - tool_choice={"type": "tool", "name": "record_claims"}, - messages=[{"role": "user", "content": f"\n{document}\n"}], - ) - elapsed_ms = int((time.monotonic() - start) * 1000) - - tool_use = next( - ( - b - for b in response.content - if getattr(b, "type", None) == "tool_use" - and getattr(b, "name", None) == "record_claims" - ), - None, - ) - if tool_use is None: - raise RuntimeError( - "LLM did not call record_claims; " - f"stop_reason={getattr(response, 'stop_reason', None)!r}" + # A response cut off by max_tokens parses as a partial (often empty) + # claim list, which downstream callers would treat as "nothing to + # verify" — a vacuous pass of the verification gate. 
Escalate the + # budget on truncation and fail loudly if the ceiling still truncates. + budgets = [max_tokens, max_tokens * 4, max_tokens * 16] + for attempt, budget in enumerate(budgets): + start = time.monotonic() + response = messages_create( + anthropic_client, + model=provider_model, + max_tokens=budget, + system=_load_system_prompt(), + tools=[EXTRACT_CLAIMS_TOOL_SCHEMA], + tool_choice={"type": "tool", "name": "record_claims"}, + messages=[ + {"role": "user", "content": f"\n{document}\n"} + ], ) - claims = list(tool_use.input.get("claims", [])) - - usage = response.usage - run.record_llm_call( - call_id=new_id(), - model=resolved_model, - request={ - "tool_name": "record_claims", - "max_tokens": max_tokens, - "document_chars": len(document), - }, - response={ - "stop_reason": getattr(response, "stop_reason", None), - "claim_count": len(claims), - }, - input_tokens=getattr(usage, "input_tokens", 0) or 0, - output_tokens=getattr(usage, "output_tokens", 0) or 0, - cache_read_input_tokens=getattr(usage, "cache_read_input_tokens", 0) or 0, - cache_creation_input_tokens=getattr(usage, "cache_creation_input_tokens", 0) or 0, - elapsed_ms=elapsed_ms, - ) + elapsed_ms = int((time.monotonic() - start) * 1000) + stop_reason = getattr(response, "stop_reason", None) + truncated = stop_reason == "max_tokens" + + tool_use = next( + ( + b + for b in response.content + if getattr(b, "type", None) == "tool_use" + and getattr(b, "name", None) == "record_claims" + ), + None, + ) + if tool_use is None and not truncated: + raise RuntimeError( + f"LLM did not call record_claims; stop_reason={stop_reason!r}" + ) + claims = list(tool_use.input.get("claims", [])) if tool_use else [] + + usage = response.usage + run.record_llm_call( + call_id=new_id(), + model=resolved_model, + request={ + "tool_name": "record_claims", + "max_tokens": budget, + "attempt": attempt, + "document_chars": len(document), + }, + response={ + "stop_reason": stop_reason, + "claim_count": len(claims), + 
"truncated": truncated, + }, + input_tokens=getattr(usage, "input_tokens", 0) or 0, + output_tokens=getattr(usage, "output_tokens", 0) or 0, + cache_read_input_tokens=getattr(usage, "cache_read_input_tokens", 0) or 0, + cache_creation_input_tokens=getattr(usage, "cache_creation_input_tokens", 0) + or 0, + elapsed_ms=elapsed_ms, + ) + if not truncated: + return {"claims": claims, "model": resolved_model} - return {"claims": claims, "model": resolved_model} + raise RuntimeError( + f"claim extraction truncated even at max_tokens={budgets[-1]}; " + "refusing to return a partial claim list" + ) extract_claims = _ExtractClaims() diff --git a/src/orc/directives/research/skills/verify_claim.py b/src/orc/directives/research/skills/verify_claim.py index e38371e..985f566 100644 --- a/src/orc/directives/research/skills/verify_claim.py +++ b/src/orc/directives/research/skills/verify_claim.py @@ -279,7 +279,9 @@ def run( Mode selection: - explicit `mode=` always wins - - else `domain=` (e.g. "pubmedQA", "DROP") routes via DOMAIN_TO_MODE + - else `domain=` (e.g. "financial", "clinical", "legal") routes via + DOMAIN_TO_MODE; HaluBench source names remain as benchmark-only + aliases in BENCHMARK_SOURCE_TO_MODE - else default = "evidence" Modes: diff --git a/src/orc/ingest/loaders.py b/src/orc/ingest/loaders.py index e4ab577..eedb62c 100644 --- a/src/orc/ingest/loaders.py +++ b/src/orc/ingest/loaders.py @@ -1,8 +1,14 @@ -"""File and URL loaders. Each returns a `LoadedDoc` with raw bytes + decoded text.""" +"""File and URL loaders. Each returns a `LoadedDoc` with raw bytes + extracted text. + +Supported formats: the text mimes in SUPPORTED_TEXT_MIMES (markdown, plain text, +HTML, reST, JSON) plus application/pdf, whose text is extracted with pypdf. +Scanned/image-only PDFs are rejected — OCR is not supported. 
+""" from __future__ import annotations import hashlib +import io import ipaddress import mimetypes import socket @@ -11,10 +17,15 @@ from urllib.parse import urljoin, urlparse import httpx +from pypdf import PdfReader + +from orc import __version__ MAX_URL_BYTES = 25 * 1024 * 1024 MAX_REDIRECTS = 5 +PDF_MIME = "application/pdf" + SUPPORTED_TEXT_MIMES = { "text/markdown", "text/x-markdown", @@ -37,6 +48,15 @@ class LoadedDoc: def load_file(path: Path) -> LoadedDoc: raw_bytes = path.read_bytes() mime = _guess_mime(path) + if mime == PDF_MIME: + text, pdf_title = _extract_pdf(raw_bytes, source=str(path)) + return LoadedDoc( + source_uri=str(path.resolve()), + title=pdf_title or _extract_title(text, fallback=path.stem), + mime_type=mime, + text=text, + raw_bytes=raw_bytes, + ) if mime not in SUPPORTED_TEXT_MIMES and not mime.startswith("text/"): raise ValueError(f"Unsupported file type for ingest: {mime} ({path})") text = raw_bytes.decode("utf-8", errors="replace") @@ -133,7 +153,7 @@ def load_url( with httpx.Client( timeout=timeout, follow_redirects=False, - headers={"User-Agent": "orc/0.1.0"}, + headers={"User-Agent": f"orc/{__version__}"}, transport=transport, ) as http: for _ in range(MAX_REDIRECTS + 1): @@ -151,6 +171,15 @@ def load_url( if len(raw_bytes) > MAX_URL_BYTES: raise ValueError(f"URL response exceeds {MAX_URL_BYTES} byte limit: {url!r}") mime = response.headers.get("content-type", "application/octet-stream").split(";")[0].strip() + if mime == PDF_MIME: + text, pdf_title = _extract_pdf(raw_bytes, source=url) + return LoadedDoc( + source_uri=url, + title=pdf_title or _extract_title(text, fallback=url), + mime_type=mime, + text=text, + raw_bytes=raw_bytes, + ) if mime not in SUPPORTED_TEXT_MIMES and not mime.startswith("text/"): raise ValueError(f"Unsupported URL content-type for ingest: {mime} ({url})") text = raw_bytes.decode(response.encoding or "utf-8", errors="replace") @@ -163,6 +192,40 @@ def load_url( ) +def _extract_pdf(raw_bytes: bytes, *, 
source: str) -> tuple[str, str | None]: + """Extract (text, metadata /Title) from a PDF in a single parse. + + Pages are joined with blank lines, skipping empty ones. The metadata title + is surfaced because PDF corpora (credit memos, contracts) rarely contain + the markdown-style headings _extract_title scans for. + """ + try: + reader = PdfReader(io.BytesIO(raw_bytes)) + # Owner-password-locked PDFs with an empty user password (common for + # distributed contracts/memos) open with decrypt(""); only PDFs that + # truly require a password are refused. + if reader.is_encrypted and not reader.decrypt(""): + raise ValueError(f"Could not extract text from PDF (encrypted): {source}") + page_texts = (page.extract_text() for page in reader.pages) + text = "\n\n".join(page_text for page_text in page_texts if page_text.strip()) + meta = reader.metadata + title = (meta.title or "").strip() if meta is not None else "" + except ValueError: + raise + except Exception as exc: + # pypdf raises its own hierarchy (PdfReadError, PdfStreamError, ...); + # callers should see one stable, actionable error type instead. + raise ValueError(f"Could not extract text from PDF: {source} ({exc})") from exc + if not text: + # Silently ingesting an empty corpus would produce confident + # not_found verdicts downstream, so refuse loudly instead. + raise ValueError( + "Could not extract text from PDF (scanned/image-only? " + f"OCR is not supported): {source}" + ) + return text, title or None + + def sha256_bytes(data: bytes) -> str: return hashlib.sha256(data).hexdigest() diff --git a/src/orc/mcp/server.py b/src/orc/mcp/server.py index c8b4245..c0fdcf2 100644 --- a/src/orc/mcp/server.py +++ b/src/orc/mcp/server.py @@ -121,8 +121,8 @@ def build_server() -> FastMCP: description=( "Verify a claim against the workspace's evidence corpus. " "Omit `workspace` to use ORC_DEFAULT_WORKSPACE (or the literal 'default' workspace). " - "Optionally pass `domain` (e.g. 
'pubmedQA', 'DROP') to route to an empirically " - "best verify mode for that domain — see DOMAIN_TO_MODE in the runtime." + "Optionally pass `domain` (e.g. 'financial', 'clinical', 'legal') to route to " + "the empirically best verify mode for that domain — see DOMAIN_TO_MODE in the runtime." ) ) def orc_verify_claim( diff --git a/tests/unit/test_extract_and_research.py b/tests/unit/test_extract_and_research.py index 6acd832..54497f6 100644 --- a/tests/unit/test_extract_and_research.py +++ b/tests/unit/test_extract_and_research.py @@ -230,3 +230,64 @@ def test_cli_verify_requires_input(orc_home: Path, tmp_path: Path) -> None: result = runner.invoke(main, ["verify", "--workspace", "demo"]) assert result.exit_code != 0 assert "Provide" in result.output or "Provide" in str(result.exception) + + +def _claims_response(n_claims: int, stop_reason: str) -> FakeResponse: + return FakeResponse( + content=[ + FakeContentBlock( + type="tool_use", + name="record_claims", + input={"claims": [{"text": f"claim {i}"} for i in range(n_claims)]}, + ) + ], + stop_reason=stop_reason, + ) + + +def test_extract_claims_retries_with_bigger_budget_when_truncated( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """stop_reason=max_tokens means the tool call was cut off mid-emission — + the observed failure was a truncated call parsing as zero claims and the + gate passing vacuously. 
Truncation must trigger a retry, not an empty OK.""" + name = _seed(orc_home, tmp_path) + fake = FakeAnthropic( + responses=[ + _claims_response(0, stop_reason="max_tokens"), + _claims_response(2, stop_reason="tool_use"), + ] + ) + monkeypatch.setattr(client_module, "_client", fake) + monkeypatch.setattr(client_module, "_factory", None) + + ws = ws_module.resolve(name) + skill = directives.get("research").skills["extract_claims"] + with open_run(ws, directive="research", skill="extract_claims", inputs={}) as run: + result = skill.run(workspace=ws, run=run, document="Some long document.") + run.close(output=result) + + assert len(result["claims"]) == 2 + assert len(fake.calls) == 2 + # Retry must actually raise the budget, not replay the same request. + assert fake.calls[1]["max_tokens"] > fake.calls[0]["max_tokens"] + + +def test_extract_claims_raises_when_truncated_at_final_budget( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """If every escalation still truncates, fail loudly: a partial claim list + would let unextracted claims bypass verification silently.""" + name = _seed(orc_home, tmp_path) + fake = FakeAnthropic( + responder=lambda kwargs: _claims_response(1, stop_reason="max_tokens") + ) + monkeypatch.setattr(client_module, "_client", fake) + monkeypatch.setattr(client_module, "_factory", None) + + ws = ws_module.resolve(name) + skill = directives.get("research").skills["extract_claims"] + with open_run(ws, directive="research", skill="extract_claims", inputs={}) as run: + with pytest.raises(RuntimeError, match="truncated"): + skill.run(workspace=ws, run=run, document="Some long document.") + run.close(output={}) diff --git a/tests/unit/test_loaders.py b/tests/unit/test_loaders.py index d9aeb3a..8fac756 100644 --- a/tests/unit/test_loaders.py +++ b/tests/unit/test_loaders.py @@ -2,11 +2,13 @@ from __future__ import annotations +import io import socket from pathlib import Path import httpx import pytest +from pypdf import 
PdfWriter from orc.ingest.loaders import load_file, load_url, sha256_bytes @@ -22,6 +24,134 @@ def _text_response(body: str) -> httpx.Response: return httpx.Response(200, headers={"content-type": "text/plain"}, text=body) +def _pdf_bytes(*page_texts: str | None, title: str | None = None) -> bytes: + """Hand-rolled minimal one-object-per-page PDF so tests need no binary fixture. + + Each entry in `page_texts` becomes one page; None produces a page with an + empty content stream, mimicking a scanned/image-only page that pypdf + extracts as "". Streams are uncompressed so the whole file stays tiny. + """ + objects: list[bytes] = [] + + def add(body: str) -> int: + objects.append(body.encode("latin-1")) + return len(objects) + + catalog_num = add("<< /Type /Catalog /Pages 2 0 R >>") + pages_num = add("PLACEHOLDER") # patched below once page object numbers exist + font_num = add("<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>") + + kid_nums = [] + for text in page_texts: + stream = f"BT /F1 12 Tf 72 720 Td ({text}) Tj ET" if text is not None else "" + content_num = add(f"<< /Length {len(stream)} >>\nstream\n{stream}\nendstream") + kid_nums.append( + add( + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + f"/Resources << /Font << /F1 {font_num} 0 R >> >> " + f"/Contents {content_num} 0 R >>" + ) + ) + + kids = " ".join(f"{n} 0 R" for n in kid_nums) + objects[pages_num - 1] = ( + f"<< /Type /Pages /Kids [{kids}] /Count {len(kid_nums)} >>".encode("latin-1") + ) + info_num = add(f"<< /Title ({title}) >>") if title is not None else None + + out = io.BytesIO() + out.write(b"%PDF-1.4\n") + offsets = [] + for num, body in enumerate(objects, start=1): + offsets.append(out.tell()) + out.write(f"{num} 0 obj\n".encode("latin-1") + body + b"\nendobj\n") + xref_pos = out.tell() + out.write(f"xref\n0 {len(objects) + 1}\n".encode("latin-1")) + out.write(b"0000000000 65535 f \n") + for off in offsets: + out.write(f"{off:010d} 00000 n \n".encode("latin-1")) + trailer = 
f"<< /Size {len(objects) + 1} /Root {catalog_num} 0 R" + if info_num is not None: + trailer += f" /Info {info_num} 0 R" + trailer += " >>" + out.write(b"trailer\n" + trailer.encode("latin-1")) + out.write(f"\nstartxref\n{xref_pos}\n%%EOF\n".encode("latin-1")) + return out.getvalue() + + +def test_load_pdf_joins_pages_with_blank_line_and_skips_empty_pages( + tmp_path: Path, +) -> None: + p = tmp_path / "multi.pdf" + p.write_bytes(_pdf_bytes("Page one", None, "Page three")) + doc = load_file(p) + assert doc.text == "Page one\n\nPage three" + + +def test_load_pdf_with_no_extractable_text_raises_mentioning_ocr( + tmp_path: Path, +) -> None: + # A scanned/image-only PDF extracts as empty text. Ingesting it silently + # would yield an empty corpus and confident not_found verdicts downstream. + p = tmp_path / "scanned.pdf" + p.write_bytes(_pdf_bytes(None, None)) + with pytest.raises(ValueError, match="OCR"): + load_file(p) + + +def test_load_pdf_unparseable_raises_value_error(tmp_path: Path) -> None: + # pypdf internals (PdfStreamError etc.) must not leak to callers. + p = tmp_path / "corrupt.pdf" + p.write_bytes(b"%PDF-1.4\nthis is not really a pdf") + with pytest.raises(ValueError, match="Could not extract text from PDF"): + load_file(p) + + +def test_load_pdf_encrypted_raises_value_error(tmp_path: Path) -> None: + writer = PdfWriter() + writer.add_blank_page(width=612, height=792) + writer.encrypt("secret") + buf = io.BytesIO() + writer.write(buf) + p = tmp_path / "locked.pdf" + p.write_bytes(buf.getvalue()) + with pytest.raises(ValueError, match="Could not extract text from PDF"): + load_file(p) + + +def test_load_pdf_owner_locked_with_empty_user_password_ingests(tmp_path: Path) -> None: + # A large share of real-world contracts/credit memos are owner-password- + # locked but openable with an empty user password — pypdf decrypts them + # with decrypt(""). Rejecting those would fail ingest on exactly the + # document class PDF support targets. 
+ writer = PdfWriter(clone_from=io.BytesIO(_pdf_bytes("Owner locked body"))) + writer.encrypt(user_password="", owner_password="owner-secret") + buf = io.BytesIO() + writer.write(buf) + p = tmp_path / "owner-locked.pdf" + p.write_bytes(buf.getvalue()) + doc = load_file(p) + assert "Owner locked body" in doc.text + + +def test_load_pdf_prefers_metadata_title_over_fallback(tmp_path: Path) -> None: + p = tmp_path / "scan-target.pdf" + p.write_bytes(_pdf_bytes("Some body text", title="Q3 Credit Memo")) + doc = load_file(p) + assert doc.title == "Q3 Credit Memo" + + +def test_load_pdf_extracts_text_and_falls_back_to_stem_title(tmp_path: Path) -> None: + raw = _pdf_bytes("Hello orc PDF") + p = tmp_path / "credit-memo.pdf" + p.write_bytes(raw) + doc = load_file(p) + assert doc.mime_type == "application/pdf" + assert "Hello orc PDF" in doc.text + assert doc.title == "credit-memo" + assert doc.raw_bytes == raw + + def test_load_markdown_extracts_h1_title(tmp_path: Path) -> None: p = tmp_path / "doc.md" p.write_text("# My Title\n\nSome body content.\n") @@ -73,6 +203,27 @@ def test_load_url_refuses_ssrf_targets(url: str) -> None: load_url(url) +def test_load_url_pdf_content_type_extracts_text( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + socket, "getaddrinfo", lambda host, port, *a, **k: _addrinfo(PUBLIC_IP, port) + ) + raw = _pdf_bytes("Hello orc PDF") + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, headers={"content-type": "application/pdf"}, content=raw + ) + + doc = load_url("http://example.com/memo.pdf", transport=httpx.MockTransport(handler)) + + assert doc.mime_type == "application/pdf" + assert "Hello orc PDF" in doc.text + assert doc.raw_bytes == raw + assert doc.source_uri == "http://example.com/memo.pdf" + + def test_load_url_pins_connection_to_validated_ip_with_original_host_header( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/unit/test_routing.py b/tests/unit/test_routing.py 
index b1fc698..b1b12d3 100644 --- a/tests/unit/test_routing.py +++ b/tests/unit/test_routing.py @@ -5,37 +5,76 @@ import pytest from orc.directives.research.routing import ( + BENCHMARK_SOURCE_TO_MODE, DOMAIN_TO_MODE, UnknownDomainError, route_to_mode, ) -def test_route_to_mode_returns_expected_modes_for_each_domain() -> None: +def test_benchmark_source_map_returns_expected_modes_for_each_source() -> None: # The mapping is the load-bearing piece: per-source-ds F1 in the # source-routed HaluBench result. If this changes without a benchmark # re-run, the public F1 claim drifts from reality. - assert DOMAIN_TO_MODE["covidQA"] == "evidence" - assert DOMAIN_TO_MODE["RAGTruth"] == "evidence" - assert DOMAIN_TO_MODE["halueval"] == "judgment" - assert DOMAIN_TO_MODE["pubmedQA"] == "binary" - assert DOMAIN_TO_MODE["FinanceBench"] == "arithmetic" - assert DOMAIN_TO_MODE["DROP"] == "binary" + assert BENCHMARK_SOURCE_TO_MODE["covidQA"] == "evidence" + assert BENCHMARK_SOURCE_TO_MODE["RAGTruth"] == "evidence" + assert BENCHMARK_SOURCE_TO_MODE["halueval"] == "judgment" + assert BENCHMARK_SOURCE_TO_MODE["pubmedQA"] == "binary" + assert BENCHMARK_SOURCE_TO_MODE["FinanceBench"] == "arithmetic" + assert BENCHMARK_SOURCE_TO_MODE["DROP"] == "binary" # Every value must be one of the modes verify_claim accepts. 
valid_modes = {"evidence", "judgment", "binary", "decomposed", "arithmetic"} + assert set(BENCHMARK_SOURCE_TO_MODE.values()) <= valid_modes assert set(DOMAIN_TO_MODE.values()) <= valid_modes +def test_benchmark_source_map_contains_exactly_the_six_halubench_sources() -> None: + """Published benchmark numbers were produced with exactly these six + source_ds names — extra or missing keys mean reproducibility drift.""" + assert set(BENCHMARK_SOURCE_TO_MODE) == { + "covidQA", + "RAGTruth", + "halueval", + "pubmedQA", + "FinanceBench", + "DROP", + } + + def test_route_to_mode_none_returns_none() -> None: """None in → None out so verify_claim can fall back to its default mode.""" assert route_to_mode(None) is None -def test_route_to_mode_known_domain_returns_mode() -> None: +def test_route_to_mode_benchmark_source_aliases_still_route() -> None: + """Dataset names predate the product domains; existing callers passing + them must keep routing identically.""" assert route_to_mode("pubmedQA") == "binary" assert route_to_mode("covidQA") == "evidence" +def test_route_to_mode_routes_each_product_domain() -> None: + """Product domains are the real surface — each routes to the mode derived + from the benchmark family it generalizes.""" + assert route_to_mode("general") == "evidence" + assert route_to_mode("legal") == "evidence" + assert route_to_mode("clinical") == "binary" + assert route_to_mode("biomedical") == "binary" + assert route_to_mode("financial") == "arithmetic" + assert route_to_mode("numeric") == "binary" + + +def test_route_to_mode_unknown_domain_message_lists_product_domains() -> None: + """The error must teach the product surface first; benchmark dataset + names are aliases and should be mentioned separately, not as peers.""" + with pytest.raises(UnknownDomainError) as excinfo: + route_to_mode("MadeUpDomain") + message = str(excinfo.value) + for product_domain in ("general", "legal", "clinical", "financial"): + assert product_domain in message + assert "alias" in 
message + + def test_route_to_mode_unknown_domain_raises() -> None: """Silent fall-through would mask config typos and break replay determinism.""" with pytest.raises(UnknownDomainError) as excinfo: diff --git a/tests/unit/test_verify_claim_skill.py b/tests/unit/test_verify_claim_skill.py index 26a969d..8c6f289 100644 --- a/tests/unit/test_verify_claim_skill.py +++ b/tests/unit/test_verify_claim_skill.py @@ -391,7 +391,7 @@ def test_verify_explicit_mode_wins_over_domain( ) run.close(output={}) - # DROP routes to binary in DOMAIN_TO_MODE; explicit mode="evidence" + # DROP routes to binary via the benchmark alias map; explicit mode="evidence" # must override that, so the record_verdict (evidence) tool was used. assert fake.calls[0]["tool_choice"] == {"type": "tool", "name": "record_verdict"} diff --git a/uv.lock b/uv.lock index c411c2e..9b48075 100644 --- a/uv.lock +++ b/uv.lock @@ -1442,8 +1442,8 @@ wheels = [ ] [[package]] -name = "orc" -version = "0.1.4" +name = "orc-ai" +version = "0.2.0" source = { editable = "." 
} dependencies = [ { name = "anthropic" }, @@ -1452,6 +1452,7 @@ dependencies = [ { name = "markdown-it-py" }, { name = "mcp" }, { name = "pydantic" }, + { name = "pypdf" }, { name = "python-dotenv" }, { name = "python-ulid" }, { name = "pyyaml" }, @@ -1483,6 +1484,7 @@ requires-dist = [ { name = "markdown-it-py", specifier = ">=3.0" }, { name = "mcp", specifier = ">=1.0" }, { name = "pydantic", specifier = ">=2.7" }, + { name = "pypdf", specifier = ">=4.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" }, { name = "python-dotenv", specifier = ">=1.0" }, @@ -1898,6 +1900,15 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pypdf" +version = "6.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/99/0a/48fe05c6bb3aa4bb4d2a4079a383d33c0dfec1edf613a642f07d8b8b5c2e/pypdf-6.13.2.tar.gz", hash = "sha256:5a96a17dbdfbf9c2ab24c0a13fa0aba182be22ba6f283098712c16fc242f509f", size = 6479250 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/17/378943705992f74e451a06de3401ce68e3213763c81e44d0614559c45599/pypdf-6.13.2-py3-none-any.whl", hash = "sha256:6eeb9e57693f29d41bd01255d02660cbbb41fd7fc818a982677389a35e4f2083", size = 346555 }, +] + [[package]] name = "pytest" version = "9.0.3"