diff --git a/CHANGELOG.md b/CHANGELOG.md index 85b3e9f..a6dd4db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,23 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). ## [Unreleased] -### Added (not yet released) +### Planned + +- `gads` directive (Google Ads agentic analysis: lens-based decomposition, + read-only MCP integration, evidence-bound recommendation verification). +- `orc eval consistency|perturb|retrieval|regression` reliability commands. +- Voyage-AI / OpenAI embedding backends behind the existing `Embedder` protocol. +- Hosted runtime (scheduled triggers, web dashboard, team workspaces). +- Decomposition + arithmetic combined for DROP-shaped multi-step claims. + +## [0.2.0] — unreleased + +Packaged for PyPI as **`orc-ai`** (`orc` is taken by an unrelated project); +the import package (`import orc`) and CLI command (`orc`) are unchanged. The +release workflow publishes on a `v0.2.0` tag once the trusted publisher is +configured — not yet tagged or published. + +### Added - **Hybrid retrieval** — opt-in BM25 + dense-vector retrieval fused with Reciprocal Rank Fusion. Local `sentence-transformers` embedder by default @@ -28,25 +44,6 @@ Version numbers follow [SemVer](https://semver.org/spec/v2.0.0.html). - **`orc eval calibrate`** — derive the tiered escalation threshold from the gold set (lowest cutoff meeting `--target`, default 0.95), with an achievability guard that refuses to silently configure always-escalate. - -### Planned - -- `gads` directive (Google Ads agentic analysis: lens-based decomposition, - read-only MCP integration, evidence-bound recommendation verification). -- `orc eval consistency|perturb|retrieval|regression` reliability commands. -- Voyage-AI / OpenAI embedding backends behind the existing `Embedder` protocol. -- Hosted runtime (scheduled triggers, web dashboard, team workspaces). -- Decomposition + arithmetic combined for DROP-shaped multi-step claims. - -## [0.2.0] — unreleased - -Packaged for PyPI as **`orc-ai`** (`orc` is taken by an unrelated project); -the import package (`import orc`) and CLI command (`orc`) are unchanged. The -release workflow publishes on a `v0.2.0` tag once the trusted publisher is -configured — not yet tagged or published. - -### Added - - **PDF ingestion** — `orc ingest report.pdf` now works alongside markdown, text, json, and URLs. Text is extracted page-by-page via `pypdf`, and the PDF metadata title is used when the body carries no markdown-style heading diff --git a/README.md b/README.md index 8371691..177275d 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ claude mcp add orc -- uv run --directory $(pwd) orc mcp serve ``` orc workspace create create a new workspace orc workspace list list workspaces +orc workspace embed [-w ] backfill vector embeddings (embeddings extra) orc ingest [-w ] add evidence (md, txt, json, pdf, urls) orc search "" [-w ] BM25 retrieval, no LLM orc verify "" [-w ] verify a single claim @@ -86,14 +87,19 @@ orc verify "" --mode tiered cheap judge first, escalate only when uns orc eval import [-w ] seed a labelled gold set orc eval label --verdict promote/correct a real verdict into gold orc eval run [-w ] [--json] score the gate (accuracy, calibration, recall) +orc eval show [-w ] reprint a persisted eval report orc eval calibrate [-w ] tune the tiered escalation threshold orc trace show full trace JSON orc trace list [-w ] recent runs orc replay [--live] re-execute a recorded run orc propose --params stage an action for human approval orc approve list [-w ] [--json] list pending approval items +orc approve show full payload for an approval orc approve accept [--note] accept a pending recommendation orc approve reject [--note] reject one +orc execute [-w ] execute one approved action +orc worker [-w ] auto-drain daemon for approved actions +orc audit export [-w ] bundle traces + evidence for an auditor orc mcp serve start the MCP stdio server ``` @@ -117,6 +123,13 @@ orc approve accept -w research orc execute -w research # lands in ~/.orc/workspaces/research/out/ ``` +> **Approver identity is self-reported.** `orc approve accept --by ` +> records whatever name the caller passes (default `$USER`) — orc does not +> authenticate it. Multi-approver gates (`approvers_required > 1`, e.g. for EU +> AI Act Article 14(5)) are an honor system on a shared shell; for a real +> separation-of-duties guarantee, route decisions through an authenticated +> surface that pins `--by` to a verified identity. + ## Architecture ``` diff --git a/src/orc/cli_commands/_shared.py b/src/orc/cli_commands/_shared.py new file mode 100644 index 0000000..0fe24fa --- /dev/null +++ b/src/orc/cli_commands/_shared.py @@ -0,0 +1,17 @@ +"""Helpers shared by CLI command modules.""" + +from __future__ import annotations + +import click + +from orc.errors import WorkspaceNotFoundError +from orc.storage import workspace as ws_module + + +def resolve_workspace(name: str | None) -> ws_module.Workspace: + """Resolve a workspace name (or the env default) to a Workspace, mapping + WorkspaceNotFoundError to a clean CLI error.""" + try: + return ws_module.resolve(name) + except WorkspaceNotFoundError as exc: + raise click.ClickException(str(exc)) from exc diff --git a/src/orc/cli_commands/approve.py b/src/orc/cli_commands/approve.py index c548bd7..8920e9b 100644 --- a/src/orc/cli_commands/approve.py +++ b/src/orc/cli_commands/approve.py @@ -8,14 +8,13 @@ from rich.console import Console from rich.table import Table -from orc.errors import WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace from orc.queue import approval as approval_module from orc.queue.approval import ( ApprovalAlreadyDecidedError, ApprovalNotFoundError, DuplicateApproverError, ) -from orc.storage import workspace as ws_module console = Console() @@ -44,10 +43,7 @@ def approve_group() -> None: @click.option("--json", "as_json", is_flag=True, help="Machine-readable JSON output") def list_command(workspace: str | None, status: str, limit: int, as_json: bool) -> None: """List approvals.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) items = approval_module.list_approvals( ws.name, status=None if status == "all" else status, limit=limit ) @@ -109,10 +105,7 @@ def list_command(workspace: str | None, status: str, limit: int, as_json: bool) @click.option("--workspace", "-w", default=None) def show_command(approval_id: str, workspace: str | None) -> None: """Print full payload for an approval.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) try: a = approval_module.get(ws.name, approval_id) except ApprovalNotFoundError as exc: @@ -158,11 +151,25 @@ def show_command(approval_id: str, workspace: str | None) -> None: @click.argument("approval_id") @click.option("--workspace", "-w", default=None) @click.option("--note", default=None, help="Optional decision note") -@click.option("--by", "decided_by", default=None, help="Who decided (defaults to $USER)") +@click.option( + "--by", + "decided_by", + default=None, + help="Who decided (defaults to $USER). Self-reported and unauthenticated: " + "anyone with shell access can pass any name, so multi-approver gates are " + "honor-system unless an authenticated layer supplies this value.", +) def accept_command( approval_id: str, workspace: str | None, note: str | None, decided_by: str | None ) -> None: - """Accept a pending approval.""" + """Accept a pending approval. + + The recorded approver name comes from --by (or $USER) and is not + authenticated by orc. Deployments using approvers_required > 1 as a + compliance control (e.g. EU AI Act Article 14(5)) must ensure decisions + are submitted through an authenticated surface that pins --by to a + verified identity. + """ _decide(approval_id, workspace, note, decided_by, accept=True) @@ -170,7 +177,13 @@ def accept_command( @click.argument("approval_id") @click.option("--workspace", "-w", default=None) @click.option("--note", default=None, help="Optional decision note") -@click.option("--by", "decided_by", default=None) +@click.option( + "--by", + "decided_by", + default=None, + help="Who decided (defaults to $USER). Self-reported and unauthenticated; " + "see `orc approve accept --help`.", +) def reject_command( approval_id: str, workspace: str | None, note: str | None, decided_by: str | None ) -> None: @@ -190,10 +203,7 @@ def _decide( if decided_by is None: decided_by = os.environ.get("USER") or "user" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) try: if accept: a = approval_module.accept(ws.name, approval_id, decided_by=decided_by, note=note) diff --git a/src/orc/cli_commands/audit.py b/src/orc/cli_commands/audit.py index b1b8190..308dfec 100644 --- a/src/orc/cli_commands/audit.py +++ b/src/orc/cli_commands/audit.py @@ -62,12 +62,9 @@ def export_command( """Bundle a workspace's traces, run rows, evidence manifest, approvals, and runtime metadata into a single tar.gz for handoff to a regulator, auditor, or customer.""" - from orc.storage import workspace as ws_module + from orc.cli_commands._shared import resolve_workspace - try: - ws = ws_module.resolve(workspace) - except Exception as exc: # noqa: BLE001 — surface as ClickException - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) if output_path is None: from orc.core.clock import now_iso diff --git a/src/orc/cli_commands/eval_cmd.py b/src/orc/cli_commands/eval_cmd.py index f140e70..0d573ce 100644 --- a/src/orc/cli_commands/eval_cmd.py +++ b/src/orc/cli_commands/eval_cmd.py @@ -8,9 +8,8 @@ import click import yaml -from orc.errors import WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace from orc.eval import gold -from orc.storage import workspace as ws_module from orc.storage.trace_store import load_trace _LABELS = ["supported", "contradicted", "not_found", "partial"] @@ -26,7 +25,7 @@ def eval_group() -> None: @click.option("--workspace", "-w", default=None, help="Workspace name (env: ORC_DEFAULT_WORKSPACE)") def import_command(path: Path, workspace: str | None) -> None: """Seed gold claims from a YAML file (id/text/expected[/relevant_chunk_ids/note]).""" - ws = _resolve(workspace) + ws = resolve_workspace(workspace) items = yaml.safe_load(path.read_text()) or [] n = 0 for item in items: @@ -69,7 +68,7 @@ def label_command( raise click.ClickException(f"Run {run_id} has no claim to label") # Resolve the workspace (not just read its name from the trace) so a # workspace whose db predates schema v2 gets migrated before we write gold. - _resolve(trace["workspace"]) + resolve_workspace(trace["workspace"]) gold.add( trace["workspace"], claim=claim, @@ -92,7 +91,7 @@ def run_command(workspace: str | None, mode: str, k: int, as_json: bool) -> None """Score the gate against the workspace's gold set.""" from orc.eval.runner import run_eval - ws = _resolve(workspace) + ws = resolve_workspace(workspace) try: report = run_eval(ws.name, mode=mode, k=k) except ValueError as exc: @@ -128,7 +127,7 @@ def show_command(eval_id: str, workspace: str | None, as_json: bool) -> None: """Reprint a persisted eval report.""" from orc.eval.runner import load_eval - ws = _resolve(workspace) + ws = resolve_workspace(workspace) try: report = load_eval(ws.name, eval_id) except KeyError as exc: @@ -165,7 +164,7 @@ def calibrate_command( from orc.eval.calibrate import DEFAULT_TIER1_MODEL, DEFAULT_TIER2_MODEL, calibrate from orc.eval.policy import save_policy - ws = _resolve(workspace) + ws = resolve_workspace(workspace) t1 = tier1_model or DEFAULT_TIER1_MODEL t2 = tier2_model or DEFAULT_TIER2_MODEL result = calibrate(ws.name, target=target, tier1_model=t1) @@ -207,7 +206,7 @@ def calibrate_command( @click.option("--json", "as_json", is_flag=True) def gold_command(action: str, workspace: str | None, as_json: bool) -> None: """Inspect the gold set (currently: list).""" - ws = _resolve(workspace) + ws = resolve_workspace(workspace) items = gold.list_gold(ws.name) stale = { g.gold_id @@ -239,9 +238,3 @@ def gold_command(action: str, workspace: str | None, as_json: bool) -> None: flag = " [stale chunk labels]" if g.gold_id in stale else "" click.echo(f"{g.gold_id} {g.expected_label:<12} {g.claim[:60]}{flag}") - -def _resolve(workspace: str | None) -> ws_module.Workspace: - try: - return ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc diff --git a/src/orc/cli_commands/execute.py b/src/orc/cli_commands/execute.py index 3ac1ec6..f9334f8 100644 --- a/src/orc/cli_commands/execute.py +++ b/src/orc/cli_commands/execute.py @@ -13,9 +13,9 @@ from rich.console import Console from orc import effects +from orc.cli_commands._shared import resolve_workspace from orc.effects.action import Action from orc.effects.base import MissingCredentialError -from orc.errors import WorkspaceNotFoundError from orc.queue import approval as approval_module from orc.queue.approval import ( ActionDeadError, @@ -23,7 +23,6 @@ ApprovalNotFoundError, NotApprovedError, ) -from orc.storage import workspace as ws_module console = Console() @@ -33,10 +32,7 @@ @click.option("--workspace", "-w", default=None, help="Workspace name (env: ORC_DEFAULT_WORKSPACE)") def execute_command(approval_id: str, workspace: str | None) -> None: """Execute an approved action by approval_id.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) existing = approval_module.get_execution(ws.name, approval_id) if existing is not None and existing["exec_status"] == "succeeded": diff --git a/src/orc/cli_commands/ingest.py b/src/orc/cli_commands/ingest.py index d43eeca..f32a0d0 100644 --- a/src/orc/cli_commands/ingest.py +++ b/src/orc/cli_commands/ingest.py @@ -5,9 +5,9 @@ import click from rich.console import Console -from orc.errors import IngestError, WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace +from orc.errors import IngestError from orc.ingest.pipeline import ingest as do_ingest -from orc.storage import workspace as ws_module console = Console() @@ -18,10 +18,7 @@ @click.option("--no-recursive", is_flag=True, help="Skip recursing into subdirectories") def ingest_command(source: str, workspace: str | None, no_recursive: bool) -> None: """Ingest a file, directory, or URL into the workspace's evidence corpus.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) try: ids = do_ingest(ws, source, recursive=not no_recursive) except IngestError as exc: diff --git a/src/orc/cli_commands/propose.py b/src/orc/cli_commands/propose.py index c854048..a4d819e 100644 --- a/src/orc/cli_commands/propose.py +++ b/src/orc/cli_commands/propose.py @@ -15,10 +15,9 @@ import click from orc import effects -from orc.errors import WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace from orc.paths import config_path from orc.runs import open_run -from orc.storage import workspace as ws_module def _load_params(raw: str) -> dict[str, Any]: @@ -61,10 +60,7 @@ def propose_command( wrong (it would silently stage a duplicate effect). """ params = _load_params(params_raw) - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) with open_run( ws, diff --git a/src/orc/cli_commands/research.py b/src/orc/cli_commands/research.py index 41397d7..4949c8c 100644 --- a/src/orc/cli_commands/research.py +++ b/src/orc/cli_commands/research.py @@ -8,9 +8,8 @@ from rich.console import Console from orc import directives -from orc.errors import WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace from orc.runs import open_run -from orc.storage import workspace as ws_module console = Console() @@ -29,10 +28,7 @@ def research_command( as_json: bool, ) -> None: """Research a topic against the workspace's evidence corpus.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) spec = directives.get("research") skill = spec.skills["research_topic"] diff --git a/src/orc/cli_commands/search.py b/src/orc/cli_commands/search.py index e680487..9dfc16c 100644 --- a/src/orc/cli_commands/search.py +++ b/src/orc/cli_commands/search.py @@ -9,9 +9,8 @@ from rich.table import Table from orc import directives -from orc.errors import WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace from orc.runs import open_run -from orc.storage import workspace as ws_module console = Console() @@ -23,10 +22,7 @@ @click.option("--json", "as_json", is_flag=True, help="Emit raw JSON instead of a table") def search_command(query: str, workspace: str | None, k: int, as_json: bool) -> None: """Retrieve top chunks for a query (no LLM).""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) spec = directives.get("research") skill = spec.skills["search_evidence"] diff --git a/src/orc/cli_commands/trace.py b/src/orc/cli_commands/trace.py index d01e2ab..27062d7 100644 --- a/src/orc/cli_commands/trace.py +++ b/src/orc/cli_commands/trace.py @@ -8,8 +8,8 @@ from rich.console import Console from rich.table import Table -from orc.errors import TraceNotFoundError, WorkspaceNotFoundError -from orc.storage import workspace as ws_module +from orc.cli_commands._shared import resolve_workspace +from orc.errors import TraceNotFoundError from orc.storage.trace_store import list_runs, load_trace console = Console() @@ -37,10 +37,7 @@ def show_command(run_id: str) -> None: @click.option("--limit", type=int, default=20, help="Max rows to show") def list_command(workspace: str | None, skill: str | None, limit: int) -> None: """List recent runs in a workspace.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) rows = list_runs(ws.name, skill=skill, limit=limit) if not rows: console.print("[dim]No runs yet.[/dim]") diff --git a/src/orc/cli_commands/verify.py b/src/orc/cli_commands/verify.py index 17be062..27f90ed 100644 --- a/src/orc/cli_commands/verify.py +++ b/src/orc/cli_commands/verify.py @@ -10,11 +10,10 @@ from rich.console import Console from orc import directives +from orc.cli_commands._shared import resolve_workspace from orc.directives.research.routing import UnknownDomainError -from orc.errors import WorkspaceNotFoundError from orc.ingest.loaders import load_file, load_url from orc.runs import open_run -from orc.storage import workspace as ws_module from orc.storage.workspace import Workspace console = Console() @@ -69,10 +68,7 @@ def verify_command( if not claim and not from_file and not from_url: raise click.UsageError("Provide CLAIM, --file , or --url .") - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) if from_file or from_url: _verify_from_document( diff --git a/src/orc/cli_commands/worker.py b/src/orc/cli_commands/worker.py index 7ef8a98..ed4a39a 100644 --- a/src/orc/cli_commands/worker.py +++ b/src/orc/cli_commands/worker.py @@ -9,9 +9,8 @@ import click from rich.console import Console +from orc.cli_commands._shared import resolve_workspace from orc.effects.worker import drain_once, run_worker -from orc.errors import WorkspaceNotFoundError -from orc.storage import workspace as ws_module console = Console() @@ -25,10 +24,7 @@ def worker_command( workspace: str | None, once: bool, poll_interval: float, max_attempts: int ) -> None: """Execute approved actions from the queue.""" - try: - ws = ws_module.resolve(workspace) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(workspace) if once: summary = drain_once(ws.name, max_attempts=max_attempts) diff --git a/src/orc/cli_commands/workspace.py b/src/orc/cli_commands/workspace.py index 9be64c9..4e77911 100644 --- a/src/orc/cli_commands/workspace.py +++ b/src/orc/cli_commands/workspace.py @@ -12,7 +12,8 @@ from rich.markup import escape from rich.table import Table -from orc.errors import EmbeddingsUnavailableError, WorkspaceExistsError, WorkspaceNotFoundError +from orc.cli_commands._shared import resolve_workspace +from orc.errors import EmbeddingsUnavailableError, WorkspaceExistsError from orc.paths import workspace_db_path from orc.retrieval.embedder import DEFAULT_EMBEDDING_MODEL, embedder_available, get_embedder from orc.storage import workspace as ws_module @@ -84,10 +85,7 @@ def create_command(name: str, embeddings: bool, embedding_model: str | None) -> ) def embed_command(name: str, model: str | None) -> None: """Backfill vector embeddings for all unembedded chunks in a workspace.""" - try: - ws = ws_module.resolve(name) - except WorkspaceNotFoundError as exc: - raise click.ClickException(str(exc)) from exc + ws = resolve_workspace(name) if ws.embedding_model is None: effective_model = model or DEFAULT_EMBEDDING_MODEL diff --git a/src/orc/config.py b/src/orc/config.py deleted file mode 100644 index 44b15ec..0000000 --- a/src/orc/config.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Global configuration. Read from ~/.orc/config.toml when present, env vars override.""" - -from __future__ import annotations - -import os -import tomllib -from dataclasses import dataclass - -from orc.paths import config_path - - -@dataclass(frozen=True) -class Config: - default_workspace: str = "default" - default_verify_model: str = "claude-sonnet-4-6" - default_research_model: str = "claude-sonnet-4-6" - default_extract_model: str = "claude-haiku-4-5" - default_retrieval_k: int = 10 - default_retrieval_pool: int = 50 - - @classmethod - def load(cls) -> Config: - path = config_path() - data: dict[str, object] = {} - if path.exists(): - with path.open("rb") as f: - data = tomllib.load(f) - - return cls( - default_workspace=os.environ.get( - "ORC_DEFAULT_WORKSPACE", - str(data.get("default_workspace", "default")), - ), - default_verify_model=os.environ.get( - "ORC_VERIFY_MODEL", - str(data.get("verify_model", "claude-sonnet-4-6")), - ), - default_research_model=os.environ.get( - "ORC_RESEARCH_MODEL", - str(data.get("research_model", "claude-sonnet-4-6")), - ), - default_extract_model=os.environ.get( - "ORC_EXTRACT_MODEL", - str(data.get("extract_model", "claude-haiku-4-5")), - ), - default_retrieval_k=int(data.get("retrieval_k", 10)), - default_retrieval_pool=int(data.get("retrieval_pool", 50)), - ) diff --git a/src/orc/directives/research/routing.py b/src/orc/directives/research/routing.py index 40f5072..39ca26c 100644 --- a/src/orc/directives/research/routing.py +++ b/src/orc/directives/research/routing.py @@ -16,12 +16,10 @@ from __future__ import annotations -from orc.errors import OrcError +# Re-exported for callers that historically imported it from here. +from orc.errors import UnknownDomainError - -class UnknownDomainError(OrcError): - """Raised when a caller passes a domain that is neither a product domain - (DOMAIN_TO_MODE) nor a benchmark source alias (BENCHMARK_SOURCE_TO_MODE).""" +__all__ = ["BENCHMARK_SOURCE_TO_MODE", "DOMAIN_TO_MODE", "UnknownDomainError", "route_to_mode"] # Product domains. Each mode is derived from the benchmark family the domain diff --git a/src/orc/directives/research/skills/verify_claim.py b/src/orc/directives/research/skills/verify_claim.py index 7a52c28..66bd6d0 100644 --- a/src/orc/directives/research/skills/verify_claim.py +++ b/src/orc/directives/research/skills/verify_claim.py @@ -574,6 +574,7 @@ def _run_decomposed( atom_results: list[dict[str, Any]] = [] valid_ids: set[str] = set() + atom_retrievals: list[dict[str, Any]] = [] for i, atom in enumerate(atoms): # Re-enter verify_claim in binary mode for each atom against the SAME # workspace + same retrieved chunks. Each atom's verdict is recorded @@ -591,6 +592,10 @@ def _run_decomposed( mode="binary", evidence_id=evidence_id, ) + # Each sub-run overwrites run.retrieval; snapshot it so the parent + # trace can aggregate over every atom instead of just the last one. + if run.retrieval is not None: + atom_retrievals.append(run.retrieval) valid_ids.update(sub_result.get("retrieval_chunk_ids") or []) atom_results.append( { @@ -602,6 +607,19 @@ def _run_decomposed( ) run.record(f"decomposed_atom_{i}", atom_results[-1]) + if atom_retrievals: + returned_by_id: dict[str, dict[str, Any]] = {} + for rec in atom_retrievals: + for summary in rec["returned"]: + returned_by_id.setdefault(summary["chunk_id"], summary) + run.retrieval = { + "method": atom_retrievals[-1]["method"], + "candidates_considered": sum( + rec["candidates_considered"] for rec in atom_retrievals + ), + "returned": list(returned_by_id.values()), + } + # Majority aggregation, confidence-weighted. Atoms run in binary mode, # which only ever yields "supported" (faithful) or "not_found" # (unfaithful — contradicted OR silent, indistinguishable without a diff --git a/src/orc/errors.py b/src/orc/errors.py index 7cd9fba..0f2e74c 100644 --- a/src/orc/errors.py +++ b/src/orc/errors.py @@ -32,3 +32,8 @@ class IngestError(OrcError): class EmbeddingsUnavailableError(OrcError): """Embeddings were requested but the optional dependencies are missing.""" + + +class UnknownDomainError(OrcError): + """Raised when a caller passes a domain that is neither a product domain + nor a benchmark source alias (see orc.directives.research.routing).""" diff --git a/src/orc/runs/runner.py b/src/orc/runs/runner.py index f45e699..dcdb251 100644 --- a/src/orc/runs/runner.py +++ b/src/orc/runs/runner.py @@ -227,6 +227,15 @@ def close( total_cache_read = sum(c.cache_read_input_tokens for c in self.llm_calls) total_cache_creation = sum(c.cache_creation_input_tokens for c in self.llm_calls) + # Trace JSON first, db row second: if the JSON write fails the row stays + # 'running', never 'ok' with no trace file (which audit export treats as + # corruption). The reverse failure (row stuck 'running' with a trace on + # disk) is recoverable. + payload = self.build_trace_payload( + ended_at=ended_at, status=status, output=out, error_message=error_message + ) + write_trace_json(self.workspace.name, self.run_id, self.started_at, payload) + with transaction(self.conn): finalize_run_row( self.conn, @@ -242,11 +251,6 @@ def close( error_message=error_message, ) - payload = self.build_trace_payload( - ended_at=ended_at, status=status, output=out, error_message=error_message - ) - write_trace_json(self.workspace.name, self.run_id, self.started_at, payload) - def build_trace_payload( self, *, diff --git a/tests/unit/test_cli_shared.py b/tests/unit/test_cli_shared.py new file mode 100644 index 0000000..c639e8c --- /dev/null +++ b/tests/unit/test_cli_shared.py @@ -0,0 +1,22 @@ +"""Shared CLI helper tests.""" + +from __future__ import annotations + +from pathlib import Path + +import click +import pytest + +from orc.cli_commands._shared import resolve_workspace +from orc.storage import workspace as ws_module + + +def test_resolve_workspace_returns_existing_workspace(orc_home: Path) -> None: + ws_module.create("demo") + ws = resolve_workspace("demo") + assert ws.name == "demo" + + +def test_resolve_workspace_maps_missing_workspace_to_click_exception(orc_home: Path) -> None: + with pytest.raises(click.ClickException, match="nope"): + resolve_workspace("nope") diff --git a/tests/unit/test_run_context.py b/tests/unit/test_run_context.py index 88678b9..7cb3ce8 100644 --- a/tests/unit/test_run_context.py +++ b/tests/unit/test_run_context.py @@ -65,6 +65,33 @@ def test_run_records_error_on_exception(orc_home: Path) -> None: assert "boom" in row["error_message"] +def test_failed_trace_write_does_not_finalize_run_row(orc_home: Path, monkeypatch) -> None: + """A run whose trace JSON cannot be written must never show status='ok' in the + db — audit export treats an ok row with no trace file as corruption.""" + import orc.runs.runner as runner_module + + ws = ws_module.create("demo") + + def _disk_full(*args: object, **kwargs: object) -> None: + raise OSError("disk full") + + monkeypatch.setattr(runner_module, "write_trace_json", _disk_full) + + run_id = None + write_failed = False + try: + with open_run(ws, directive="research", skill="search_evidence", inputs={}) as run: + run_id = run.run_id + run.close(output={}) + except OSError: + write_failed = True + + assert write_failed + with open_connection(workspace_db_path("demo")) as conn: + row = conn.execute("SELECT status FROM run WHERE run_id = ?", (run_id,)).fetchone() + assert row["status"] != "ok" + + def test_list_runs_orders_newest_first(orc_home: Path) -> None: ws = ws_module.create("demo") for i in range(3): diff --git a/tests/unit/test_verify_claim_modes.py b/tests/unit/test_verify_claim_modes.py index ed75a35..71d432a 100644 --- a/tests/unit/test_verify_claim_modes.py +++ b/tests/unit/test_verify_claim_modes.py @@ -111,6 +111,33 @@ def test_verify_decomposed_all_unfaithful_atoms_yields_not_found( assert result["label"] == "not_found" +def test_verify_decomposed_trace_records_retrieval_across_all_atoms( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Each atom's binary sub-run overwrote run.retrieval, so the trace audited + only the last atom's retrieval. The aggregate must account for every atom: + candidates_considered sums across atoms, returned is the deduped union.""" + name = _setup_corpus(orc_home, tmp_path) + fake = FakeAnthropic( + responses=[ + _make_decomposition_response(["A", "B"]), + _make_binary_response(faithful=True, confidence=0.9), + _make_binary_response(faithful=True, confidence=0.9), + ] + ) + _install_fake_client(monkeypatch, fake) + + result = _run_skill(name, claim="x", mode="decomposed") + + traces = list(workspace_traces_dir(name).rglob(f"{result['_run_id']}.json")) + assert len(traces) == 1 + retrieval = json.loads(traces[0].read_text())["retrieval"] + # 2 atoms x 1 candidate each: both retrievals must be accounted for. + assert retrieval["candidates_considered"] == 2 + assert {r["chunk_id"] for r in retrieval["returned"]} == {_real_chunk_id(name)} + assert retrieval["method"] == "binary_all" + + def test_verify_decomposed_negative_majority_yields_not_found_not_contradicted( orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: