Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/orc/cli_commands/eval_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def label_command(
claim = (trace.get("inputs") or {}).get("claim") or (trace.get("output") or {}).get("claim")
if not claim:
raise click.ClickException(f"Run {run_id} has no claim to label")
# Resolve the workspace (not just read its name from the trace) so a
# workspace whose db predates schema v2 gets migrated before we write gold.
_resolve(trace["workspace"])
gold.add(
trace["workspace"],
claim=claim,
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_eval_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,35 @@ def _binary(faithful, confidence):
assert policy is not None
assert policy.target == 0.95
assert 0.0 < policy.escalation_threshold <= 0.97


def test_eval_label_migrates_a_pre_v2_workspace(orc_home, tmp_path, monkeypatch) -> None:
    # Regression scenario: a db produced by pre-v2 code lacks the gold_claim
    # table and carries schema_version=1. `eval label` has to bring such a db
    # up to date (resolving the workspace runs ensure_schema) instead of
    # failing with "no such table".
    from orc.paths import workspace_db_path
    from orc.storage.db import open_connection

    # Create the workspace and ingest a one-document corpus into it.
    workspace = ws_module.create("legacy")
    corpus_dir = tmp_path / "corpus"
    corpus_dir.mkdir()
    doc_path = corpus_dir / "doc.md"
    doc_path.write_text("# Doc\n\nThe sky is blue.\n")
    do_ingest(workspace, str(corpus_dir))

    # Swap the module-level client for a fake that returns a canned verdict.
    canned_verdict = make_verdict_response(label="supported", confidence=0.9)
    fake_client = FakeAnthropic(responses=[canned_verdict])
    monkeypatch.setattr(client_module, "_client", fake_client)
    monkeypatch.setattr(client_module, "_factory", None)

    # Record one verify_claim run so there is a run id to label afterwards.
    verify_claim = directives.get("research").skills["verify_claim"]
    resolved = ws_module.resolve("legacy")
    with open_run(resolved, directive="research", skill="verify_claim", inputs={}) as run:
        verdict = verify_claim.run(workspace=resolved, run=run, claim="The sky is blue")
        run.close(output=verdict)
        run_id = run.run_id

    # Make the db look pre-v2: remove the gold_claim table and rewind the
    # schema_version stamp to 1.
    legacy_db = workspace_db_path("legacy")
    with open_connection(legacy_db) as conn:
        for statement in (
            "DROP TABLE gold_claim",
            "UPDATE schema_meta SET value='1' WHERE key='schema_version'",
        ):
            conn.execute(statement)

    # Labelling must succeed and leave exactly one gold entry behind.
    cli_args = ["eval", "label", run_id, "--verdict", "supported", "-w", "legacy"]
    outcome = CliRunner().invoke(main, cli_args)
    assert outcome.exit_code == 0, outcome.output
    assert len(gold.list_gold("legacy")) == 1
Loading