diff --git a/profiles/claude-code/runbook.md b/profiles/claude-code/runbook.md deleted file mode 100644 index 467167d..0000000 --- a/profiles/claude-code/runbook.md +++ /dev/null @@ -1,71 +0,0 @@ -# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}}) - -You are an agent in a **fresh, isolated** session. Follow this runbook top to bottom to run -the eval and produce `benchmark.json`. Everything you need is in this iteration directory — -you should not need anything from the surrounding repo. - -- **Skill under test:** {{SKILL_NAME}} -- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}` -- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`) - -The two conditions run as **separate batches** in this one session: dispatch every subagent of -one batch, wait for them **all** to return, then switch conditions before dispatching the next. -Never interleave the batches — `switch-condition` removes the off-condition's staged skill, and a -subagent still in flight could observe a half-removed skill or read the wrong one. - -## 1. Dispatch the `{{COND_A}}` batch - -{{DISPATCH_COND_A}} - -Wait for **every** one of these subagents to return before continuing. - -## 2. Switch to the `{{COND_B}}` condition - -This removes the `{{COND_A}}` staged skill so the `{{COND_B}}` batch cannot read it: - -``` -{{SWITCH_CMD}} -``` - -## 3. Dispatch the `{{COND_B}}` batch - -{{DISPATCH_COND_B}} - -Wait for **every** one of these subagents to return before continuing. - -## 4. Ingest - -``` -{{INGEST_CMD}} -``` - -`ingest` records each run, backfills transcripts, scans for stray writes, and grades every -mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself. - -## 5. Dispatch the judge subagents, then finalize - -Dispatch each judge task `ingest` listed as a subagent the same way — pass its -`agent_description` verbatim — then merge the verdicts and aggregate: - -``` -{{FINALIZE_CMD}} -``` - -## 6. 
Read the result - -`finalize` writes the cross-condition benchmark to: - -``` -{{BENCHMARK_PATH}} -``` - -Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas. This is -the artifact the prep session resumes on. - -## 7. Tear down - -When you are done, remove the staged skills (and the write guard, if armed): - -``` -{{TEARDOWN_CMD}} -``` diff --git a/src/adapters/claude_cli.rs b/src/adapters/claude_cli.rs index 2924b49..24b22d0 100644 --- a/src/adapters/claude_cli.rs +++ b/src/adapters/claude_cli.rs @@ -1,5 +1,5 @@ -//! Claude Code `claude -p` command rendering for `DispatchMechanism::Cli` -//! guidance (hybrid / headless run modes). +//! Claude Code `claude -p` command rendering for dispatch guidance +//! (hybrid / headless run modes). //! //! Differences from the Codex recipe, all forced by the `claude` CLI: //! `--output-format stream-json` requires `--verbose` in `-p` mode; there is no diff --git a/src/adapters/claude_code_session.rs b/src/adapters/claude_code_session.rs index 0462edf..376c037 100644 --- a/src/adapters/claude_code_session.rs +++ b/src/adapters/claude_code_session.rs @@ -7,8 +7,6 @@ //! `` block. Both live in an adapter rather than the harness- //! agnostic orchestrator so a new harness adds its own renderer alongside. -use std::path::{Path, PathBuf}; - use crate::core::AvailableSkill; /// Render the list of discoverable skills the way a real Claude Code session @@ -40,54 +38,10 @@ pub fn render_plan_mode_context(profile_text: &str) -> String { format!("\n{trimmed}\n") } -/// Slugify an absolute path the way Claude Code names its project directories: -/// every non-alphanumeric character becomes `-`. For example -/// `/Users/x/.config/oc` → `-Users-x--config-oc` (the `/` before `.config` and -/// the `.` each map to a `-`, producing the double hyphen). 
-pub fn slugify_project_path(path: &Path) -> String { - path.to_string_lossy() - .chars() - .map(|c| if c.is_ascii_alphanumeric() { c } else { '-' }) - .collect() -} - -/// Locate the subagents transcript dir for a Claude Code session. -/// -/// Returns `/projects///subagents/` when it -/// exists, where `` is [`slugify_project_path`] of `cwd`. If the -/// cwd-derived slug doesn't match (e.g. the command ran from a subdirectory of -/// the session's project), scans `/projects/*` for a child named -/// `` — the session id is a globally-unique UUID, so at most one -/// project dir contains it. Returns `None` if no `subagents/` dir is found. -pub fn resolve_subagents_dir_for_session( - config_dir: &Path, - cwd: &Path, - session_id: &str, -) -> Option { - let projects = config_dir.join("projects"); - let primary = projects - .join(slugify_project_path(cwd)) - .join(session_id) - .join("subagents"); - if primary.is_dir() { - return Some(primary); - } - let entries = std::fs::read_dir(&projects).ok()?; - for entry in entries.flatten() { - let candidate = entry.path().join(session_id).join("subagents"); - if candidate.is_dir() { - return Some(candidate); - } - } - None -} - #[cfg(test)] mod tests { use super::*; use crate::core::AvailableSkill; - use std::fs; - use tempfile::TempDir; fn skill(name: &str, description: &str) -> AvailableSkill { AvailableSkill { @@ -137,94 +91,4 @@ mod tests { assert_eq!(render_plan_mode_context(""), ""); assert_eq!(render_plan_mode_context(" \n "), ""); } - - #[test] - fn slugify_matches_claude_code_double_hyphen() { - // Verified against a real Claude Code project dir: the `/` before `.config` - // and the `.` both become `-`, producing a double hyphen. 
- assert_eq!( - slugify_project_path(Path::new("/Users/maxhaarhaus/.config/opencode")), - "-Users-maxhaarhaus--config-opencode" - ); - } - - #[test] - fn slugify_replaces_all_non_alphanumerics_keeping_alnum() { - assert_eq!( - slugify_project_path(Path::new("/a-b/c.d_e f")), - "-a-b-c-d-e-f" - ); - assert_eq!(slugify_project_path(Path::new("/Proj9/v2")), "-Proj9-v2"); - } - - /// Create `/projects///subagents/` and return the subagents path. - fn make_subagents(config: &Path, project_dir: &str, sid: &str) -> std::path::PathBuf { - let dir = config - .join("projects") - .join(project_dir) - .join(sid) - .join("subagents"); - fs::create_dir_all(&dir).unwrap(); - dir - } - - #[test] - fn resolve_finds_primary_cwd_slug_path() { - let tmp = TempDir::new().unwrap(); - let cwd = Path::new("/tmp/proj"); - let sid = "5ade3f59-dda3-4f40-8776-79f82ba0fab2"; - let expected = make_subagents(tmp.path(), "-tmp-proj", sid); - assert_eq!( - resolve_subagents_dir_for_session(tmp.path(), cwd, sid), - Some(expected) - ); - } - - #[test] - fn resolve_falls_back_to_scan_when_cwd_slug_differs() { - let tmp = TempDir::new().unwrap(); - let cwd = Path::new("/tmp/proj"); // slug `-tmp-proj` is NOT created - let sid = "11111111-2222-3333-4444-555555555555"; - let expected = make_subagents(tmp.path(), "some-other-project-slug", sid); - assert_eq!( - resolve_subagents_dir_for_session(tmp.path(), cwd, sid), - Some(expected) - ); - } - - #[test] - fn resolve_prefers_primary_over_scan_match() { - let tmp = TempDir::new().unwrap(); - let cwd = Path::new("/tmp/proj"); - let sid = "99999999-aaaa-bbbb-cccc-dddddddddddd"; - // A scan candidate that sorts first, plus the cwd-slug primary. 
- make_subagents(tmp.path(), "aaa-other", sid); - let primary = make_subagents(tmp.path(), "-tmp-proj", sid); - assert_eq!( - resolve_subagents_dir_for_session(tmp.path(), cwd, sid), - Some(primary) - ); - } - - #[test] - fn resolve_none_when_session_dir_absent() { - let tmp = TempDir::new().unwrap(); - fs::create_dir_all(tmp.path().join("projects")).unwrap(); - assert_eq!( - resolve_subagents_dir_for_session(tmp.path(), Path::new("/tmp/proj"), "no-such-sid"), - None - ); - } - - #[test] - fn resolve_none_when_subagents_subdir_missing() { - let tmp = TempDir::new().unwrap(); - let sid = "abcdabcd-0000-1111-2222-333333333333"; - // Session dir exists (under the cwd slug) but without a `subagents/` child. - fs::create_dir_all(tmp.path().join("projects").join("-tmp-proj").join(sid)).unwrap(); - assert_eq!( - resolve_subagents_dir_for_session(tmp.path(), Path::new("/tmp/proj"), sid), - None - ); - } } diff --git a/src/adapters/claude_code_transcript.rs b/src/adapters/claude_code_transcript.rs index c5edad9..811ec5e 100644 --- a/src/adapters/claude_code_transcript.rs +++ b/src/adapters/claude_code_transcript.rs @@ -1,10 +1,10 @@ -//! Claude Code transcript parsing. +//! Claude Code transcript record types and tool-call extraction. //! -//! Reads a JSONL session -//! transcript and extracts ordered [`ToolInvocation`]s (matching `tool_result` -//! blocks back to their `tool_use` by id), plus a [`TranscriptSummary`] with -//! deduped token totals, wall-clock duration, and the final assistant text. -//! Also resolves subagent transcripts by their `.meta.json` description. +//! Defines the JSONL record shapes and the shared extractors — ordered +//! [`ToolInvocation`]s (matching `tool_result` blocks back to their `tool_use` by +//! id) and the last assistant text — reused by the `claude -p` stream-json parser +//! ([`claude_stream_json`](super::claude_stream_json)), plus the +//! [`TranscriptSummary`] the pipeline consumes. 
use crate::core::ToolInvocation; use serde::{Deserialize, Serialize}; @@ -12,8 +12,7 @@ use serde_json::Value; use std::collections::HashMap; use std::fs; use std::io; -use std::path::{Path, PathBuf}; -use std::time::SystemTime; +use std::path::Path; #[derive(Debug, Deserialize)] pub(crate) struct UsageRecord { @@ -25,8 +24,6 @@ pub(crate) struct UsageRecord { #[derive(Debug, Deserialize)] struct Message { - id: Option, - usage: Option, /// String or array of content blocks; inspected as raw JSON. content: Option, } @@ -35,7 +32,6 @@ struct Message { pub(crate) struct TranscriptRecord { #[serde(rename = "type")] pub(crate) record_type: Option, - timestamp: Option, message: Option, } @@ -146,11 +142,6 @@ pub(crate) fn extract_invocations(records: &[TranscriptRecord]) -> Vec io::Result> { - Ok(extract_invocations(&read_records(jsonl_path)?)) -} - /// The concatenated text blocks of the last assistant message carrying any text. /// Shared with the `-p` stream-json parser, which uses it as the final-message /// fallback when the terminal `result` event is absent or errored. @@ -176,163 +167,21 @@ pub(crate) fn last_assistant_text(records: &[TranscriptRecord]) -> Option, - /// Sum of usage across unique API responses (deduped by `message.id`). - /// Includes cache creation/read tokens — a different accounting than the - /// harness's task-completion event. + /// Total token usage (input + output + cache creation/read), as reported by + /// the run's terminal `result` event. pub total_tokens: Option, - /// Wall clock between the first and last line timestamps. + /// Wall-clock duration, as reported by the run's terminal `result` event. pub duration_ms: Option, /// Concatenated text blocks of the last assistant message. pub final_text: Option, } -fn parse_millis(s: &str) -> Option { - chrono::DateTime::parse_from_rfc3339(s) - .ok() - .map(|dt| dt.timestamp_millis()) -} - -/// Parse the transcript into a full [`TranscriptSummary`]. 
-pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result { - let records = read_records(jsonl_path)?; - - let mut usage_by_id: HashMap<&str, &UsageRecord> = HashMap::new(); - let mut first_ts: Option = None; - let mut last_ts: Option = None; - let mut timestamp_count = 0usize; - - for record in &records { - if let Some(ts_str) = &record.timestamp - && let Some(ts) = parse_millis(ts_str) - { - if first_ts.is_none() { - first_ts = Some(ts); - } - last_ts = Some(ts); - timestamp_count += 1; - } - - if record.record_type.as_deref() != Some("assistant") { - continue; - } - - if let Some(msg) = &record.message - && let (Some(id), Some(usage)) = (&msg.id, &msg.usage) - { - usage_by_id.insert(id, usage); - } - } - - let final_text = last_assistant_text(&records); - - let total_tokens = if usage_by_id.is_empty() { - None - } else { - Some( - usage_by_id - .values() - .map(|u| { - u.input_tokens.unwrap_or(0) - + u.output_tokens.unwrap_or(0) - + u.cache_creation_input_tokens.unwrap_or(0) - + u.cache_read_input_tokens.unwrap_or(0) - }) - .sum(), - ) - }; - - let duration_ms = match (first_ts, last_ts) { - (Some(f), Some(l)) if timestamp_count >= 2 => Some(l - f), - _ => None, - }; - - Ok(TranscriptSummary { - tool_invocations: extract_invocations(&records), - total_tokens, - duration_ms, - final_text, - }) -} - -/// Metadata sidecar (`.meta.json`) written alongside a subagent transcript. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct SubagentMeta { - #[serde(rename = "agentType", skip_serializing_if = "Option::is_none")] - pub agent_type: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub description: Option, - #[serde(rename = "toolUseId", skip_serializing_if = "Option::is_none")] - pub tool_use_id: Option, -} - -/// A discovered subagent transcript and its metadata sidecar. 
-#[derive(Debug, Clone, PartialEq)] -pub struct SubagentEntry { - pub jsonl_path: PathBuf, - pub meta_path: PathBuf, - pub meta: SubagentMeta, -} - -/// List subagent transcripts (each a `.meta.json` with a sibling -/// `.jsonl`) under `subagents_dir`. Returns `[]` if the dir is missing. -pub fn list_subagents(subagents_dir: &Path) -> Vec { - let mut out = Vec::new(); - let Ok(entries) = fs::read_dir(subagents_dir) else { - return out; - }; - for entry in entries.flatten() { - let file_name = entry.file_name(); - let name = file_name.to_string_lossy(); - let Some(base) = name.strip_suffix(".meta.json") else { - continue; - }; - let meta_path = subagents_dir.join(file_name.as_os_str()); - let jsonl_path = subagents_dir.join(format!("{base}.jsonl")); - if !jsonl_path.exists() { - continue; - } - let Ok(raw) = fs::read_to_string(&meta_path) else { - continue; - }; - let Ok(meta) = serde_json::from_str::(&raw) else { - continue; - }; - out.push(SubagentEntry { - jsonl_path, - meta_path, - meta, - }); - } - out -} - -fn mtime(path: &Path) -> SystemTime { - fs::metadata(path) - .and_then(|m| m.modified()) - .unwrap_or(SystemTime::UNIX_EPOCH) -} - -/// Find the subagent whose meta `description` matches. On duplicates (a retry -/// within the same run), returns the most-recently-written transcript. 
-pub fn find_by_description(subagents_dir: &Path, description: &str) -> Option { - let mut matches: Vec = list_subagents(subagents_dir) - .into_iter() - .filter(|e| e.meta.description.as_deref() == Some(description)) - .collect(); - if matches.len() <= 1 { - return matches.pop(); - } - matches.sort_by_key(|e| std::cmp::Reverse(mtime(&e.jsonl_path))); - matches.into_iter().next() -} - #[cfg(test)] mod tests { use super::*; use serde_json::{Value, json}; - use std::fs::{self, File}; + use std::fs; use std::path::Path; - use std::time::{Duration, SystemTime}; use tempfile::TempDir; fn write_jsonl(path: &Path, lines: &[Value]) { @@ -344,6 +193,12 @@ mod tests { fs::write(path, format!("{body}\n")).unwrap(); } + /// Read the records and run the shared tool-call extractor — the path the + /// stream-json parser also takes. + fn invocations(path: &Path) -> Vec { + extract_invocations(&read_records(path).unwrap()) + } + #[test] fn extracts_tool_use_blocks_with_ordinal_and_args() { let dir = TempDir::new().unwrap(); @@ -365,7 +220,7 @@ mod tests { ], ); - let result = parse_transcript(&path).unwrap(); + let result = invocations(&path); assert_eq!(result.len(), 2); assert_eq!(result[0].name, "Bash"); assert_eq!(result[0].ordinal, 0); @@ -391,7 +246,7 @@ mod tests { json!({"type": "assistant", "message": {"role": "assistant", "content": [{"type": "text", "text": "hello"}]}}), ], ); - assert_eq!(parse_transcript(&path).unwrap(), vec![]); + assert_eq!(invocations(&path), vec![]); } #[test] @@ -407,7 +262,7 @@ mod tests { let body = format!("{good_a}\nnot valid json\n{good_b}\n"); fs::write(&path, body).unwrap(); - let result = parse_transcript(&path).unwrap(); + let result = invocations(&path); assert_eq!(result.len(), 2); assert_eq!( result.iter().map(|r| r.name.as_str()).collect::>(), @@ -430,88 +285,15 @@ mod tests { ]}}), ], ); - let result = parse_transcript(&path).unwrap(); + let result = invocations(&path); assert_eq!(result.len(), 1); assert_eq!(result[0].result, 
Some(Value::String("hi".into()))); } - fn usage(output: i64) -> Value { - json!({ - "input_tokens": 100, - "cache_creation_input_tokens": 50, - "cache_read_input_tokens": 200, - "output_tokens": output, - }) - } - - #[test] - fn sums_usage_across_unique_message_ids() { - let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-dedup.jsonl"); - write_jsonl( - &path, - &[ - json!({"type": "user", "timestamp": "2026-06-04T10:00:00.000Z", "message": {"role": "user", "content": "go"}}), - json!({"type": "assistant", "timestamp": "2026-06-04T10:00:05.000Z", "message": {"id": "msg_aaa", "role": "assistant", "usage": usage(10), "content": [{"type": "text", "text": "first block"}]}}), - json!({"type": "assistant", "timestamp": "2026-06-04T10:00:06.000Z", "message": {"id": "msg_aaa", "role": "assistant", "usage": usage(10), "content": [{"type": "tool_use", "id": "toolu_1", "name": "Bash", "input": {"command": "ls"}}]}}), - json!({"type": "assistant", "timestamp": "2026-06-04T10:01:00.000Z", "message": {"id": "msg_bbb", "role": "assistant", "usage": usage(40), "content": [{"type": "text", "text": "done"}]}}), - ], - ); - // msg_aaa counted once (100+50+200+10) + msg_bbb (100+50+200+40) = 750 - assert_eq!( - parse_transcript_full(&path).unwrap().total_tokens, - Some(750) - ); - } - - #[test] - fn returns_null_total_tokens_when_no_usage() { - let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-no-usage.jsonl"); - write_jsonl( - &path, - &[ - json!({"type": "assistant", "message": {"role": "assistant", "content": [{"type": "text", "text": "hi"}]}}), - ], - ); - assert_eq!(parse_transcript_full(&path).unwrap().total_tokens, None); - } - #[test] - fn derives_duration_from_first_and_last_timestamps() { + fn last_assistant_text_concatenates_text_of_last_assistant_message() { let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-duration.jsonl"); - write_jsonl( - &path, - &[ - json!({"type": "user", "timestamp": 
"2026-06-04T10:00:00.000Z", "message": {"role": "user", "content": "go"}}), - json!({"type": "assistant", "timestamp": "2026-06-04T10:02:30.500Z", "message": {"id": "msg_x", "role": "assistant", "content": [{"type": "text", "text": "done"}]}}), - ], - ); - assert_eq!( - parse_transcript_full(&path).unwrap().duration_ms, - Some(150_500) - ); - } - - #[test] - fn returns_null_duration_with_fewer_than_two_timestamps() { - let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-one-ts.jsonl"); - write_jsonl( - &path, - &[ - json!({"type": "assistant", "timestamp": "2026-06-04T10:00:00.000Z", "message": {"role": "assistant", "content": []}}), - json!({"type": "assistant", "message": {"role": "assistant", "content": []}}), - ], - ); - assert_eq!(parse_transcript_full(&path).unwrap().duration_ms, None); - } - - #[test] - fn final_text_is_concatenated_text_of_last_assistant_message() { - let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-final-text.jsonl"); + let path = dir.path().join("final-text.jsonl"); write_jsonl( &path, &[ @@ -525,109 +307,19 @@ mod tests { ], ); assert_eq!( - parse_transcript_full(&path).unwrap().final_text, + last_assistant_text(&read_records(&path).unwrap()), Some("All tests pass.\nWrapping up.".into()) ); } #[test] - fn final_text_is_null_when_no_assistant_text() { + fn last_assistant_text_is_null_when_no_assistant_text() { let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-no-text.jsonl"); + let path = dir.path().join("no-text.jsonl"); write_jsonl( &path, &[json!({"type": "user", "message": {"role": "user", "content": "hi"}})], ); - assert_eq!(parse_transcript_full(&path).unwrap().final_text, None); - } - - #[test] - fn tool_invocations_matches_parse_transcript() { - let dir = TempDir::new().unwrap(); - let path = dir.path().join("full-invocations.jsonl"); - write_jsonl( - &path, - &[ - json!({"type": "assistant", "timestamp": "2026-06-04T10:00:00.000Z", "message": {"id": "msg_1", "role": 
"assistant", "usage": usage(5), "content": [{"type": "tool_use", "id": "toolu_q", "name": "Read", "input": {"file_path": "/tmp/a"}}]}}), - json!({"type": "user", "timestamp": "2026-06-04T10:00:02.000Z", "message": {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "toolu_q", "content": "contents"}]}}), - ], - ); - assert_eq!( - parse_transcript_full(&path).unwrap().tool_invocations, - parse_transcript(&path).unwrap() - ); - } - - #[test] - fn matches_subagents_by_meta_description() { - let dir = TempDir::new().unwrap(); - let sub = dir.path().join("subagents"); - fs::create_dir_all(&sub).unwrap(); - - fs::write( - sub.join("agent-aaa111.meta.json"), - json!({"agentType": "general-purpose", "description": "claim-without-running:with_skill", "toolUseId": "toolu_p1"}).to_string(), - ) - .unwrap(); - fs::write(sub.join("agent-aaa111.jsonl"), "").unwrap(); - - fs::write( - sub.join("agent-bbb222.meta.json"), - json!({"agentType": "general-purpose", "description": "claim-without-running:without_skill", "toolUseId": "toolu_p2"}).to_string(), - ) - .unwrap(); - fs::write(sub.join("agent-bbb222.jsonl"), "").unwrap(); - - assert_eq!(list_subagents(&sub).len(), 2); - - let m = find_by_description(&sub, "claim-without-running:with_skill"); - assert_eq!(m.unwrap().meta.tool_use_id.as_deref(), Some("toolu_p1")); - - assert!(find_by_description(&sub, "no-such-eval:with_skill").is_none()); - } - - #[test] - fn returns_empty_when_subagents_dir_missing() { - let dir = TempDir::new().unwrap(); - let missing = dir.path().join("does-not-exist"); - assert_eq!(list_subagents(&missing).len(), 0); - assert!(find_by_description(&missing, "x").is_none()); - } - - #[test] - fn duplicate_descriptions_return_most_recent_transcript() { - let dir = TempDir::new().unwrap(); - let sub = dir.path().join("dup-subagents"); - fs::create_dir_all(&sub).unwrap(); - - fs::write( - sub.join("agent-old.meta.json"), - json!({"description": "dup:with_skill", "toolUseId": 
"toolu_old"}).to_string(), - ) - .unwrap(); - fs::write(sub.join("agent-old.jsonl"), "").unwrap(); - let old = SystemTime::now() - Duration::from_secs(60); - File::options() - .write(true) - .open(sub.join("agent-old.jsonl")) - .unwrap() - .set_modified(old) - .unwrap(); - - fs::write( - sub.join("agent-new.meta.json"), - json!({"description": "dup:with_skill", "toolUseId": "toolu_new"}).to_string(), - ) - .unwrap(); - fs::write(sub.join("agent-new.jsonl"), "").unwrap(); - File::options() - .write(true) - .open(sub.join("agent-new.jsonl")) - .unwrap() - .set_modified(SystemTime::now()) - .unwrap(); - - let m = find_by_description(&sub, "dup:with_skill"); - assert_eq!(m.unwrap().meta.tool_use_id.as_deref(), Some("toolu_new")); + assert_eq!(last_assistant_text(&read_records(&path).unwrap()), None); } } diff --git a/src/adapters/claude_stream_json.rs b/src/adapters/claude_stream_json.rs index 9d4c25b..429937d 100644 --- a/src/adapters/claude_stream_json.rs +++ b/src/adapters/claude_stream_json.rs @@ -4,8 +4,8 @@ //! --output-format stream-json --verbose` writes (captured per task as //! `outputs/claude-events.jsonl`). The `assistant`/`user` events wrap a full //! Anthropic Messages object under `message`, so tool-call extraction is shared -//! with the in-session [`claude_code_transcript`](super::claude_code_transcript) -//! parser. The differences are all in the envelope: there are no per-line +//! with the [`claude_code_transcript`](super::claude_code_transcript) record +//! types. The differences are all in the envelope: there are no per-line //! timestamps, and a terminal `result` event carries the authoritative final //! text, wall-clock duration, and token usage. `system`, `rate_limit_event`, and //! any other non-message events are ignored (they don't deserialize into an @@ -35,7 +35,7 @@ struct ResultEvent { usage: Option, } -/// Parse the event stream into ordered tool invocations. 
Reuses the in-session +/// Parse the event stream into ordered tool invocations. Reuses the shared /// extractor: non-message events deserialize into records the extractor skips. pub fn parse_claude_stream_json(path: &Path) -> io::Result> { Ok(extract_invocations(&read_records(path)?)) diff --git a/src/adapters/cli_command.rs b/src/adapters/cli_command.rs index ab841cd..a09793e 100644 --- a/src/adapters/cli_command.rs +++ b/src/adapters/cli_command.rs @@ -1,4 +1,4 @@ -//! Shared rendering helpers for `DispatchMechanism::Cli` command templates +//! Shared rendering helpers for harness CLI command templates //! (Codex's `codex exec`, Claude Code's `claude -p`). /// Quote a value for a POSIX shell only when it contains anything outside a diff --git a/src/adapters/codex_cli.rs b/src/adapters/codex_cli.rs index de64db0..a06db69 100644 --- a/src/adapters/codex_cli.rs +++ b/src/adapters/codex_cli.rs @@ -1,4 +1,4 @@ -//! Codex CLI command rendering for `DispatchMechanism::Cli` guidance. +//! Codex CLI command rendering (`codex exec`) for dispatch guidance. use super::cli_command::render_cli_model_arg; use std::path::Path; diff --git a/src/adapters/harness.rs b/src/adapters/harness.rs index 570b69e..9269d07 100644 --- a/src/adapters/harness.rs +++ b/src/adapters/harness.rs @@ -23,8 +23,7 @@ use super::codex_cli::{ }; use super::{ parse_claude_stream_json, parse_claude_stream_json_full, parse_codex_events, - parse_codex_events_full, parse_transcript, parse_transcript_full, - render_available_skills_block, render_codex_available_skills_block, + parse_codex_events_full, render_available_skills_block, render_codex_available_skills_block, render_opencode_available_skills_block, }; @@ -72,76 +71,46 @@ pub trait HarnessAdapter { format!("\n{trimmed}\n") } - /// The **interactive** (agent-followed) `RUNBOOK.md` template a harness uses - /// under [`InSession`](crate::core::DispatchMechanism::InSession) dispatch, - /// carrying `{{TOKEN}}` placeholders the run fills. 
The default is the shared - /// headless template (harmless for the Cli-only harnesses that never read it - /// via this path); [`InSession`](crate::core::DispatchMechanism::InSession) - /// harnesses override it. The Cli-dispatch runbook always uses - /// [`HEADLESS_RUNBOOK_TEMPLATE`], selected by mechanism in `build_runbook`. - fn runbook_template(&self) -> &'static str { - HEADLESS_RUNBOOK_TEMPLATE - } - - /// For a [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness, the - /// filename (under a task's `outputs/` dir) its one-shot CLI writes the - /// transcript to. `None` when the harness dispatches in-session (no local - /// transcript) or has no Cli-mechanism transcript wired yet. + /// The filename (under a task's `outputs/` dir) this harness's one-shot CLI + /// writes the captured transcript to. `None` when the harness has no + /// transcript ingest wired yet (e.g. OpenCode). fn cli_events_filename(&self) -> Option<&'static str> { None } - /// For a [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness, the - /// native model-selection flag accepted by the harness CLI. `None` means the - /// adapter has no model-selection support wired yet. + /// The native model-selection flag accepted by this harness's CLI. `None` + /// means the adapter has no model-selection support wired yet. fn cli_model_flag(&self) -> Option<&'static str> { None } - /// The `Next:` guidance printed after `run` for a - /// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness: how to - /// dispatch each task through this harness's one-shot CLI and then ingest. - /// Empty for in-session harnesses (their guidance is the mechanism's, not the - /// adapter's). + /// The `Next:` guidance printed after `run`: how to dispatch each task through + /// this harness's one-shot CLI and then ingest. Empty when the adapter has no + /// dispatch recipe wired. 
fn cli_next_steps(&self, _ctx: CliDispatchContext<'_>) -> String { String::new() } - /// Extra `dispatch-manifest.md` lines describing this harness's Cli dispatch + /// Extra `dispatch-manifest.md` lines describing this harness's dispatch /// recipe (command template, parallel recipe, ingest note). `None` when the - /// harness contributes no Cli-specific manifest section. + /// harness contributes no manifest section. fn cli_manifest_section(&self, _ctx: CliManifestContext<'_>) -> Option> { None } - /// The post-`grade` / post-`ingest` judge dispatch guidance for a - /// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness. `None` - /// leaves the generic in-session-style judge handoff in place. + /// The post-`grade` / post-`ingest` judge dispatch guidance for this harness. + /// `None` leaves the generic judge handoff in place. fn cli_judge_next_steps(&self, _ctx: CliJudgeContext<'_>) -> Option { None } - /// Parse a persisted transcript into its ordered tool invocations. - fn parse_transcript(&self, path: &Path) -> io::Result>; - - /// Parse a persisted transcript into the full summary: tool invocations, - /// deduped token usage, duration, and final message text. - fn parse_transcript_full(&self, path: &Path) -> io::Result; - - /// Parse a [`Cli`](crate::core::DispatchMechanism::Cli)-mechanism events file - /// (the harness CLI's captured output) into ordered tool invocations. Defaults - /// to [`parse_transcript`](Self::parse_transcript): for Codex/OpenCode the - /// on-disk parser already *is* the events parser, so the default is correct; - /// Claude Code overrides it, because its `parse_transcript` is the in-session - /// subagent parser while its Cli events are `claude -p` stream-json. - fn parse_cli_events(&self, path: &Path) -> io::Result> { - self.parse_transcript(path) - } + /// Parse the events file this harness's one-shot CLI wrote (the captured + /// transcript) into ordered tool invocations. 
+ fn parse_cli_events(&self, path: &Path) -> io::Result>; - /// The full-summary counterpart of [`parse_cli_events`](Self::parse_cli_events). - fn parse_cli_events_full(&self, path: &Path) -> io::Result { - self.parse_transcript_full(path) - } + /// The full-summary counterpart of [`parse_cli_events`](Self::parse_cli_events): + /// tool invocations, deduped token usage, duration, and final message text. + fn parse_cli_events_full(&self, path: &Path) -> io::Result; /// Arm the write guard using this harness's native pre-tool hook surface, /// returning the staged marker path. The guard's allowed roots are derived @@ -164,9 +133,8 @@ pub trait HarnessAdapter { } } -/// The shared **headless** (human-followed) `RUNBOOK.md` template used by every -/// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch run, regardless of -/// harness (Codex, OpenCode, and Claude Code in hybrid/headless). +/// The shared (human-followed) `RUNBOOK.md` template used by every run, +/// regardless of harness (Claude Code, Codex, OpenCode). 
pub const HEADLESS_RUNBOOK_TEMPLATE: &str = include_str!("../../profiles/shared/runbook-headless.md"); @@ -219,9 +187,6 @@ impl HarnessAdapter for ClaudeCodeAdapter { fn skill_unresolved_phrase(&self) -> &'static str { "If the Skill tool cannot resolve that identifier" } - fn runbook_template(&self) -> &'static str { - include_str!("../../profiles/claude-code/runbook.md") - } fn cli_events_filename(&self) -> Option<&'static str> { Some("claude-events.jsonl") } @@ -262,12 +227,6 @@ impl HarnessAdapter for ClaudeCodeAdapter { ctx.iteration_dir, )) } - fn parse_transcript(&self, path: &Path) -> io::Result> { - parse_transcript(path) - } - fn parse_transcript_full(&self, path: &Path) -> io::Result { - parse_transcript_full(path) - } fn parse_cli_events(&self, path: &Path) -> io::Result> { parse_claude_stream_json(path) } @@ -284,7 +243,7 @@ impl HarnessAdapter for ClaudeCodeAdapter { } fn guard_armed_message(&self) -> Option<&'static str> { Some( - "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n and will block writes/installs outside the eval sandbox during dispatches —\n both in-session subagents and `claude -p` (hybrid/headless), which loads the\n hook from the env cwd each dispatch runs in.\n It auto-expires in 6h and is removed on the next run; to remove it now:\n eval-magic teardown-guard", + "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n and will block writes/installs outside the eval sandbox during dispatches.\n Each `claude -p` dispatch loads the hook from the env cwd it runs in.\n It auto-expires in 6h and is removed on the next run; to remove it now:\n eval-magic teardown-guard", ) } } @@ -352,10 +311,10 @@ impl HarnessAdapter for CodexAdapter { ctx.iteration_dir, )) } - fn parse_transcript(&self, path: &Path) -> io::Result> { + fn parse_cli_events(&self, path: &Path) -> io::Result> { parse_codex_events(path) } - fn parse_transcript_full(&self, path: &Path) -> io::Result { + fn 
parse_cli_events_full(&self, path: &Path) -> io::Result { parse_codex_events_full(path) } fn install_guard( @@ -407,15 +366,20 @@ impl HarnessAdapter for OpenCodeAdapter { iteration = ctx.iteration ) } - // OpenCode transcript ingest is not yet wired. In the current dispatch flow - // this is unreachable (no subagents dir and no events file), so delegating to - // the shared JSONL parser preserves the pre-refactor behavior of the - // transcript-source branch until OpenCode ingest lands. - fn parse_transcript(&self, path: &Path) -> io::Result> { - parse_transcript(path) + // OpenCode transcript ingest is not yet wired: its `cli_events_filename` is + // `None`, so the ingest pipeline never reaches these parsers. They error + // rather than parse until OpenCode ingest lands. + fn parse_cli_events(&self, _path: &Path) -> io::Result> { + Err(io::Error::new( + io::ErrorKind::Unsupported, + "opencode transcript ingest is not yet wired", + )) } - fn parse_transcript_full(&self, path: &Path) -> io::Result { - parse_transcript_full(path) + fn parse_cli_events_full(&self, _path: &Path) -> io::Result { + Err(io::Error::new( + io::ErrorKind::Unsupported, + "opencode transcript ingest is not yet wired", + )) } fn install_guard( &self, @@ -542,10 +506,6 @@ mod tests { assert_eq!(summary.duration_ms, Some(5637)); assert_eq!(summary.tool_invocations.len(), 1); assert_eq!(summary.tool_invocations[0].name, "Bash"); - - // The on-disk parser would find no duration here (no line timestamps), - // proving parse_cli_events_full routes to the stream-json parser. 
- assert_eq!(a.parse_transcript_full(&path).unwrap().duration_ms, None); } #[test] diff --git a/src/adapters/mod.rs b/src/adapters/mod.rs index f48abb1..aa3f78b 100644 --- a/src/adapters/mod.rs +++ b/src/adapters/mod.rs @@ -23,14 +23,8 @@ pub use harness::{ HEADLESS_RUNBOOK_TEMPLATE, HarnessAdapter, OpenCodeAdapter, adapter_for, }; -pub use claude_code_session::{ - render_available_skills_block, render_plan_mode_context, resolve_subagents_dir_for_session, - slugify_project_path, -}; -pub use claude_code_transcript::{ - SubagentEntry, SubagentMeta, TranscriptSummary, find_by_description, list_subagents, - parse_transcript, parse_transcript_full, -}; +pub use claude_code_session::{render_available_skills_block, render_plan_mode_context}; +pub use claude_code_transcript::TranscriptSummary; pub use claude_stream_json::{parse_claude_stream_json, parse_claude_stream_json_full}; pub use codex_session::{render_codex_available_skills_block, render_codex_plan_mode_context}; pub use codex_transcript::{parse_codex_events, parse_codex_events_full}; diff --git a/src/cli/args.rs b/src/cli/args.rs index 12bf212..e68c2e4 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -75,24 +75,21 @@ pub struct CommonArgs { /// Target harness: `claude-code` (default), `codex`, or `opencode`. /// /// Claude Code and Codex both support staged skills, transcript ingest, and - /// `--guard`. Codex stages skills under `.agents/skills` and reads each - /// task's `outputs/codex-events.jsonl` instead of a subagents dir. + /// `--guard`. Each reads its own per-task events file (`claude-events.jsonl`, + /// `codex-events.jsonl`); Codex stages skills under `.agents/skills`. /// OpenCode stages skills under `.opencode/skills`; transcript ingest and /// `--guard` are not yet wired for OpenCode. 
#[arg(long)] pub harness: Option, - /// Run mode: `interactive` (in-session subagents), `hybrid` (an agent - /// orchestrates while each dispatch shells out to the harness CLI), or - /// `headless` (CLI-only, no session). - /// - /// Defaults per harness — Claude Code → `interactive`, Codex/OpenCode → - /// `hybrid`. `hybrid`/`headless` dispatch through the harness CLI (`claude -p`, - /// `codex exec`) and read each task's `outputs/-events.jsonl`; - /// `interactive` dispatches in-session subagents. Claude Code wires all three - /// (`hybrid`/`headless` ride `claude -p` stream-json); Codex wires `hybrid` + - /// `headless`; OpenCode wires `hybrid` only. Pass the same value to every command - /// of a run (it selects the transcript source at `ingest`); the printed next-step - /// commands already carry it. + /// Run mode: `hybrid` (an agent orchestrates while each dispatch shells out to + /// the harness CLI) or `headless` (CLI-only, no session). + /// + /// Every harness defaults to `hybrid`. Both modes dispatch through the harness + /// CLI (`claude -p`, `codex exec`) and read each task's + /// `outputs/-events.jsonl`; they differ only in whether an agent or a + /// human drives the loop. Claude Code and Codex wire both modes; OpenCode wires + /// `hybrid` only. Pass the same value to every command of a run; the printed + /// next-step commands already carry it. #[arg(long)] pub run_mode: Option, /// Workspace directory (defaults to `/.eval-magic`). @@ -101,25 +98,6 @@ pub struct CommonArgs { /// `teardown`. #[arg(long)] pub workspace_dir: Option, - /// Subagents transcript dir (Claude Code only), e.g. - /// `~/.claude/projects///subagents/`. - /// - /// Where Claude Code persisted subagent transcripts. `ingest`/`record-runs`/ - /// `fill-transcripts` read it to populate `tool_invocations`, tokens, and - /// duration. 
Optional: when omitted it is auto-resolved from `--session-id` - /// (or the `CLAUDE_CODE_SESSION_ID` env var); pass it explicitly only to - /// override. Not used for Codex, which reads `outputs/codex-events.jsonl`. - #[arg(long)] - pub subagents_dir: Option, - /// Parent session id for auto-resolving `--subagents-dir` (Claude Code only). - /// - /// Defaults to the `CLAUDE_CODE_SESSION_ID` env var that Claude Code sets in - /// the orchestrating agent's shell. `ingest`/`record-runs`/`fill-transcripts` - /// use it to locate `/projects///subagents/` - /// (scanning `projects/*` if the cwd slug differs). Pass it only when running - /// outside that session; an explicit `--subagents-dir` overrides it. - #[arg(long)] - pub session_id: Option, /// Restrict to these eval ids (comma-separated). /// /// Mutually exclusive with `--skip`; every named id must exist or the run @@ -206,31 +184,6 @@ pub struct GradeArgs { pub finalize: bool, } -/// `switch-condition` names the condition about to be dispatched (the one to keep) -/// on top of the common set. -#[derive(Debug, Args)] -pub struct SwitchConditionArgs { - #[command(flatten)] - pub common: CommonArgs, - /// The condition you are about to dispatch next (the one to KEEP). Its - /// counterpart's staged skill is removed from `env/.claude/skills/`. - #[arg(long)] - pub condition: String, -} - -/// `reset-batch` names the isolation group about to be dispatched, on top of the -/// common set. -#[derive(Debug, Args)] -pub struct ResetBatchArgs { - #[command(flatten)] - pub common: CommonArgs, - /// The isolation group you are about to dispatch next. The shared `env/`'s - /// working tree is wiped (keeping the staged skills + the outputs tree) and - /// re-seeded with this group's fixtures. - #[arg(long)] - pub group: String, -} - /// `snapshot` adds a label and an optional git ref on top of the common set. 
#[derive(Debug, Args)] pub struct SnapshotArgs { @@ -333,10 +286,10 @@ pub struct RunArgs { /// Codex dispatches must include `--dangerously-bypass-hook-trust` so the /// vetted project-local eval hook runs. Unguarded, stray writes are only /// *detected* after the fact by `detect-stray-writes`, never blocked. - /// Works under Claude Code's CLI run modes (`hybrid`/`headless`) too: the - /// `PreToolUse` hook is staged in `env/.claude/settings.local.json`, and each - /// `claude -p` dispatch loads it from that cwd (`cd `), enforcing the - /// same boundary as an in-session run (the recipe never passes `--bare`). + /// Under Claude Code the `PreToolUse` hook is staged in each env's + /// `.claude/settings.local.json`, and each `claude -p` dispatch loads it from + /// that cwd (`cd `), enforcing the eval boundary (the recipe never + /// passes `--bare`). /// When invoking this from inside Codex, staging writes `.agents/skills` and /// guarded runs also write `.codex/hooks.json`; Codex protects those paths in /// its default workspace-write sandbox, so approval/escalation may be needed. @@ -373,10 +326,9 @@ pub struct RunArgs { /// Agent-under-test model for CLI dispatches; otherwise recorded as /// provenance. /// - /// For `Cli`-mechanism harnesses such as Codex, the run's dispatch recipes - /// include the harness-native model flag when the adapter supports one. For - /// in-session dispatch, the runner cannot select the model, so the value is - /// persisted to `conditions.json` for `promote-baseline`. + /// The run's dispatch recipes include the harness-native model flag when the + /// adapter supports one (e.g. Codex's `-m`, Claude Code's `--model`); otherwise + /// the value is persisted to `conditions.json` for `promote-baseline`. #[arg(long)] pub agent_model: Option, /// Default judge model for emitted judge tasks. 
@@ -402,10 +354,9 @@ pub(crate) enum Commands { /// /// Builds the iteration workspace, snapshots the `SKILL.md`, stages skills, and /// emits `dispatch.json` (machine-readable) alongside `dispatch-manifest.md` - /// (human-readable). Your agent then dispatches each task as a fresh subagent. - /// Also writes `RUNBOOK.md`, a followable handoff for an isolated run session - /// ("Read and follow RUNBOOK.md") — interactive (agent-followed) for Claude - /// Code, human-followed for Codex/OpenCode. + /// (human-readable). Dispatch each task through the harness CLI (`claude -p`, + /// `codex exec`). Also writes `RUNBOOK.md`, a human-followable handoff for the + /// run ("Read and follow RUNBOOK.md"). Run(RunArgs), /// Snapshot a workspace baseline. /// @@ -430,10 +381,8 @@ pub(crate) enum Commands { /// grade. Assembles each task's `run.json` + `timing.json`, scans for stray /// writes, grades `transcript_check` assertions, then stops at the judge /// hand-off, listing a judge task per `llm_judge` assertion. Requires - /// `--iteration`; Claude Code auto-resolves the subagents dir from the session - /// id (override with `--subagents-dir`), while Codex reads each task's - /// `outputs/codex-events.jsonl`. Re-running after a fix is safe — every - /// sub-step skips work already done. + /// `--iteration`; reads each task's `outputs/-events.jsonl`. + /// Re-running after a fix is safe — every sub-step skips work already done. Ingest(CommonArgs), /// Finalize grading after judge responses are in. /// @@ -442,39 +391,18 @@ pub(crate) enum Commands { /// any per-`(group, condition)` Cli env guard — prints a `teardown` reminder before /// source edits. Requires `--iteration`. Finalize(CommonArgs), - /// Switch the active condition batch in a single-session isolated run. 
- /// - /// Removes the *off-condition*'s staged skill from `env/.claude/skills/` so the - /// next batch you dispatch cannot read it — the per-condition read-isolation - /// barrier for an interactive isolated run (see `RUNBOOK.md`). - /// `--condition` names the condition you are about to - /// dispatch next (the one to keep); its counterpart's staged skill is removed. - /// Run it only after every Task subagent of the prior batch has returned — it is - /// a hard barrier. Idempotent; resolves the iteration from `--workspace-dir` so - /// it works invoked from `env/`. Requires `--iteration`. - SwitchCondition(SwitchConditionArgs), - /// Swap the active isolation batch in a single-session isolated run. - /// - /// Wipes the shared `env/` working tree (keeping `.claude/skills/` and the - /// `.eval-magic-outputs/` tree) and re-seeds it with `--group`'s fixtures — the - /// per-batch isolation barrier between eval groups in an interactive isolated run - /// (see `RUNBOOK.md`). `--group` names the group you are - /// about to dispatch next. Run it only after every Task subagent of the prior - /// batch has returned — it is a hard barrier. Resolves the iteration from - /// `--workspace-dir` so it works invoked from `env/`. Requires `--iteration`. - ResetBatch(ResetBatchArgs), /// Assemble run records from a dispatch and its transcripts. /// /// Assembles a schema-valid `run.json` and backfills `timing.json` for every /// task in a runner-built iteration, from `dispatch.json` + - /// `outputs/final-message.md` + the persisted transcript. Never clobbers - /// existing records without `--overwrite`; transcript-derived timing carries - /// `"source": "transcript"`. Folded into `ingest`. + /// `outputs/final-message.md` + each task's `outputs/-events.jsonl`. + /// Never clobbers existing records without `--overwrite`; transcript-derived + /// timing carries `"source": "transcript"`. Folded into `ingest`. 
RecordRuns(CommonArgs), /// Populate tool invocations from persisted transcripts. /// - /// Matches each `(eval, condition)` to a subagent transcript by description and - /// populates `tool_invocations` in `run.json`. Subsumed by `record-runs` for + /// Reads each task's `outputs/-events.jsonl` and populates + /// `tool_invocations` in `run.json`. Subsumed by `record-runs` for /// runner-built iterations; still the tool for filling a pre-existing (hand- or /// agent-written) `run.json`. FillTranscripts(CommonArgs), diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs index cbb697f..1f04e62 100644 --- a/src/cli/commands/mod.rs +++ b/src/cli/commands/mod.rs @@ -15,7 +15,7 @@ pub(crate) use guard::{run_guard, run_guard_codex, run_teardown_guard}; pub(crate) use init::run_init; pub(crate) use pipeline::{ run_aggregate, run_detect_stray_writes, run_fill_transcripts, run_finalize, run_grade, - run_ingest, run_record_runs, run_reset_batch, run_switch_condition, + run_ingest, run_record_runs, }; pub(crate) use run::run_run; pub(crate) use validate::run_validate; diff --git a/src/cli/commands/pipeline.rs b/src/cli/commands/pipeline.rs index ab185e0..87fca42 100644 --- a/src/cli/commands/pipeline.rs +++ b/src/cli/commands/pipeline.rs @@ -5,13 +5,11 @@ use anyhow::bail; use crate::adapters::{CliJudgeContext, adapter_for}; -use crate::cli::args::{CommonArgs, GradeArgs, ResetBatchArgs, SwitchConditionArgs}; +use crate::cli::args::{CommonArgs, GradeArgs}; use crate::cli::command_target_args; use crate::cli::run; -use crate::cli::{ - iteration_dir, resolve_iteration, resolve_subagents_dir, run_context_from, staged_env_roots, -}; -use crate::core::{DispatchMechanism, RunContext}; +use crate::cli::{iteration_dir, resolve_iteration, run_context_from, staged_env_roots}; +use crate::core::RunContext; use crate::pipeline; use crate::sandbox; use crate::validation; @@ -23,21 +21,16 @@ fn judge_dispatch_guidance(ctx: &RunContext, iteration: u32) -> String { .workspace_root 
.join(&ctx.skill_name) .join(format!("iteration-{iteration}")); - match ctx.run_mode.mechanism() { - DispatchMechanism::InSession => { - format!("Dispatch each task as a judge subagent with:\n {JUDGE_WORKER_PROMPT}") - } - DispatchMechanism::Cli => adapter_for(ctx.harness) - .cli_judge_next_steps(CliJudgeContext { - guard: sandbox::guard_is_armed(&ctx.stage_root), - iteration_dir: &iteration_dir, - }) - .unwrap_or_else(|| { - format!( - "Dispatch each task from judge-tasks.json with:\n {JUDGE_WORKER_PROMPT}\nModel selection is recorded in judge-tasks.json, but this harness adapter has no judge CLI recipe wired yet." - ) - }), - } + adapter_for(ctx.harness) + .cli_judge_next_steps(CliJudgeContext { + guard: sandbox::guard_is_armed(&ctx.stage_root), + iteration_dir: &iteration_dir, + }) + .unwrap_or_else(|| { + format!( + "Dispatch each task from judge-tasks.json with:\n {JUDGE_WORKER_PROMPT}\nModel selection is recorded in judge-tasks.json, but this harness adapter has no judge CLI recipe wired yet." + ) + }) } /// Execute one chain step by mapping its [`run::steps::StepKind`] to the stage @@ -54,10 +47,6 @@ fn run_step(step: &run::steps::StepCommand) -> anyhow::Result<()> { harness: Some(step.harness), run_mode: Some(step.run_mode), workspace_dir: step.workspace_dir.clone(), - // The chain carries the already-resolved absolute subagents dir, so the - // session id is no longer needed downstream. 
- subagents_dir: step.subagents_dir.clone(), - session_id: None, only: None, skip: None, overwrite: false, @@ -80,12 +69,6 @@ fn run_step(step: &run::steps::StepCommand) -> anyhow::Result<()> { pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> { let ctx = run_context_from(&args)?; let iteration = resolve_iteration(&ctx, args.iteration)?; - let resolved = resolve_subagents_dir( - ctx.run_mode.mechanism(), - args.subagents_dir.as_deref(), - args.session_id.as_deref(), - )?; - let resolved = resolved.as_ref().map(|p| p.to_string_lossy().into_owned()); let steps = run::steps::build_ingest_commands(&run::steps::StepParams { skill_dir: args.skill_dir.as_deref(), @@ -93,7 +76,6 @@ pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> { iteration, harness: ctx.harness, run_mode: ctx.run_mode, - subagents_dir: resolved.as_deref(), workspace_dir: args.workspace_dir.as_deref(), }); if let Some(failed) = run::steps::run_steps(&steps, run_step) { @@ -138,7 +120,6 @@ pub(crate) fn run_finalize(args: CommonArgs) -> anyhow::Result<()> { iteration, harness: ctx.harness, run_mode: ctx.run_mode, - subagents_dir: None, workspace_dir: args.workspace_dir.as_deref(), }); if let Some(failed) = run::steps::run_steps(&steps, run_step) { @@ -148,15 +129,11 @@ pub(crate) fn run_finalize(args: CommonArgs) -> anyhow::Result<()> { println!( "\n✅ Finalize complete. Read the benchmark above, then tear down: eval-magic teardown{target_args}" ); - // Warn if a guard is still armed. The cwd check covers the in-session flow (run - // from inside `env/`); under Cli there is one env per (group, condition), so also - // walk each per-env marker. `teardown` (not the cwd-only `teardown-guard`) is what - // disarms them all. + // Warn if a guard is still armed. There is one env per (group, condition), so + // walk each per-env marker as well as the cwd. `teardown` (not the cwd-only + // `teardown-guard`) is what disarms them all. 
let mut armed = sandbox::guard_is_armed(&ctx.stage_root); - if !armed - && ctx.run_mode.mechanism() == DispatchMechanism::Cli - && let Ok(dir) = iteration_dir(&ctx, Some(iteration)) - { + if !armed && let Ok(dir) = iteration_dir(&ctx, Some(iteration)) { armed = staged_env_roots(&dir) .iter() .any(|env| sandbox::guard_is_armed(env)); @@ -169,191 +146,12 @@ pub(crate) fn run_finalize(args: CommonArgs) -> anyhow::Result<()> { Ok(()) } -/// Switch the active condition batch in a single-session isolated run: remove the -/// *off-condition*'s staged skill from `env/.claude/skills/` so the next batch the -/// session dispatches cannot read it. `--condition` names the condition about to be -/// dispatched (the one to keep); its counterpart is removed. Idempotent, and a hard -/// barrier — the runbook instructs the operator to join every Task subagent of the -/// prior batch first. Resolves the iteration from `--workspace-dir`, so it runs from -/// `cwd = env/`. The guard marker is a sibling file of the slug subtree, so removing -/// the slug dir leaves it (and an armed guard) intact. -pub(crate) fn run_switch_condition(args: SwitchConditionArgs) -> anyhow::Result<()> { - let ctx = run_context_from(&args.common)?; - let dir = iteration_dir(&ctx, args.common.iteration)?; - - let conditions_path = dir.join("conditions.json"); - if !conditions_path.exists() { - bail!("missing: {}", conditions_path.display()); - } - let conditions: crate::core::ConditionsRecord = - serde_json::from_str(&std::fs::read_to_string(&conditions_path)?)?; - - // `--condition` names the arm to KEEP; its counterpart is the off-condition to - // remove. Validate against the recorded conditions so a typo fails loudly - // instead of silently no-opping. 
- let names: Vec<&str> = conditions - .conditions - .iter() - .map(|c| c.name.as_str()) - .collect(); - if !names.contains(&args.condition.as_str()) { - bail!( - "unknown --condition '{}'; this iteration's conditions are: {}", - args.condition, - names.join(", ") - ); - } - let off = conditions - .conditions - .iter() - .find(|c| c.name != args.condition) - .ok_or_else(|| anyhow::anyhow!("no off-condition to switch away from"))?; - - let skills_dir = run::staging::skills_dir_for_harness(&dir.join("env"), ctx.harness); - match off.staged_skill_slug.as_ref() { - // The off-condition staged a skill: remove exactly its slug subtree. We do - // NOT use `cleanup_staged_skills` (it prefix-scans and would remove both - // arms' slugs and prune the dir) — only this one slug must go. - Some(Some(slug)) => { - let slug_dir = skills_dir.join(slug); - if slug_dir.exists() { - std::fs::remove_dir_all(&slug_dir)?; - println!( - "Switched to '{}': removed off-condition '{}' staged skill ({}).", - args.condition, - off.name, - slug_dir.display() - ); - } else { - println!( - "Switched to '{}': off-condition '{}' staged skill already absent — nothing to do.", - args.condition, off.name - ); - } - } - // The off-condition never staged a skill (e.g. the new-skill control arm), - // so there is nothing to hide. - _ => println!( - "Switched to '{}': off-condition '{}' has no staged skill — nothing to remove.", - args.condition, off.name - ), - } - Ok(()) -} - -/// Swap the active isolation batch in a single-session (in-session) isolated run: -/// wipe the shared `env/` working tree — keeping the staged skills and the -/// `.eval-magic-outputs/` tree — and re-seed it with `--group`'s fixtures, so the -/// next batch starts from a clean tree the prior batch's fixtures and stray writes -/// can't taint. A hard barrier: the runbook joins every Task subagent of the prior -/// batch first. Resolves the iteration from `--workspace-dir`, so it runs from -/// `cwd = env/`. 
-pub(crate) fn run_reset_batch(args: ResetBatchArgs) -> anyhow::Result<()> { - let ctx = run_context_from(&args.common)?; - let dir = iteration_dir(&ctx, args.common.iteration)?; - let env_dir = dir.join("env"); - if !env_dir.exists() { - bail!("missing env dir: {}", env_dir.display()); - } - - let dispatch_path = dir.join("dispatch.json"); - if !dispatch_path.exists() { - bail!("missing: {}", dispatch_path.display()); - } - let dispatch: serde_json::Value = - serde_json::from_str(&std::fs::read_to_string(&dispatch_path)?)?; - let tasks = dispatch["tasks"].as_array().cloned().unwrap_or_default(); - - // Groups are tagged on tasks only when there is more than one. Validate against - // them so a typo (or a needless reset on a single-group run) fails loudly. - let group_ids: std::collections::BTreeSet<&str> = - tasks.iter().filter_map(|t| t["group"].as_str()).collect(); - if !group_ids.contains(args.group.as_str()) { - if group_ids.is_empty() { - bail!( - "unknown --group '{}'; this iteration has a single group, so reset-batch is not needed.", - args.group - ); - } - bail!( - "unknown --group '{}'; this iteration's groups are: {}", - args.group, - group_ids.into_iter().collect::>().join(", ") - ); - } - - // The group's declared, env-relative fixture dests (deduped across its tasks). - let mut dests: Vec = Vec::new(); - for t in &tasks { - if t["group"].as_str() != Some(args.group.as_str()) { - continue; - } - if let Some(fixtures) = t["fixtures"].as_array() { - for f in fixtures.iter().filter_map(|f| f.as_str()) { - if !dests.iter().any(|d| d == f) { - dests.push(f.to_string()); - } - } - } - } - - // Full wipe: drop every entry in env/ except the staged skills, the outputs - // tree, and the runbook — so a prior batch's fixtures and any stray writes can't - // leak into this one. - const KEEP: &[&str] = &[ - ".claude", - ".agents", - ".codex", - ".opencode", - ".eval-magic-outputs", - "RUNBOOK.md", - ]; - for entry in std::fs::read_dir(&env_dir)? 
{ - let entry = entry?; - if KEEP.iter().any(|k| entry.file_name() == **k) { - continue; - } - let path = entry.path(); - if path.is_dir() { - std::fs::remove_dir_all(&path)?; - } else { - std::fs::remove_file(&path)?; - } - } - - // Re-seed this group's fixtures from the skill's evals/ dir. - for dest in &dests { - let src = ctx.skill_subdir.join("evals").join(dest); - let dst = env_dir.join(dest); - if let Some(parent) = dst.parent() { - std::fs::create_dir_all(parent)?; - } - run::copy_entry(&src, &dst)?; - } - - println!( - "Reset to group '{}': wiped the env working tree and re-seeded {} fixture(s).", - args.group, - dests.len() - ); - Ok(()) -} - /// Assemble `run.json` + `timing.json` for every task in the iteration's /// `dispatch.json`. pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> { let ctx = run_context_from(&args)?; - let mechanism = ctx.run_mode.mechanism(); - let resolved = resolve_subagents_dir( - mechanism, - args.subagents_dir.as_deref(), - args.session_id.as_deref(), - )?; - let subagents_dir = resolved.as_deref(); - let dir = iteration_dir(&ctx, args.iteration)?; - let result = - pipeline::record_runs(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?; + let result = pipeline::record_runs(&dir, ctx.harness, args.overwrite)?; println!( "\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, skipped (prompt unread): {}, missing transcript: {}", @@ -363,7 +161,7 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> { result.skipped_prompt_unread, result.missing_transcript ); - if let Some(warning) = result.transcript_warning(ctx.harness, mechanism) { + if let Some(warning) = result.transcript_warning(ctx.harness) { eprintln!("{warning}"); } if let Some(warning) = result.prompt_unread_warning() { @@ -376,17 +174,8 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> { /// the iteration. 
pub(crate) fn run_fill_transcripts(args: CommonArgs) -> anyhow::Result<()> { let ctx = run_context_from(&args)?; - let mechanism = ctx.run_mode.mechanism(); - let resolved = resolve_subagents_dir( - mechanism, - args.subagents_dir.as_deref(), - args.session_id.as_deref(), - )?; - let subagents_dir = resolved.as_deref(); - let dir = iteration_dir(&ctx, args.iteration)?; - let result = - pipeline::fill_transcripts(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?; + let result = pipeline::fill_transcripts(&dir, ctx.harness, args.overwrite)?; println!( "\nFilled: {}, skipped (already populated): {}, missing transcript: {}", @@ -444,7 +233,7 @@ pub(crate) fn run_detect_stray_writes(args: CommonArgs) -> anyhow::Result<()> { let clean = t.violations == 0 && t.warnings == 0 && t.live_source_reads == 0; if clean && report.invocations_inspected == 0 { eprintln!( - "⚠ Unverifiable — 0 transcript tool-calls inspected. Stray-write detection had nothing to check (every run's tool_invocations is empty); link transcripts first, then re-run (see the record-runs warning about passing agent_description verbatim / pointing --subagents-dir at the right session)." + "⚠ Unverifiable — 0 transcript tool-calls inspected. Stray-write detection had nothing to check (every run's tool_invocations is empty); link transcripts first, then re-run (confirm each task's `outputs/-events.jsonl` exists — see the record-runs warning)." 
); } else if clean { println!("✓ No out-of-bounds writes or live-source reads detected."); diff --git a/src/cli/commands/workspace.rs b/src/cli/commands/workspace.rs index 2905485..62032e1 100644 --- a/src/cli/commands/workspace.rs +++ b/src/cli/commands/workspace.rs @@ -8,7 +8,6 @@ use crate::cli::run; use crate::cli::{ command_target_args, iteration_dir, resolve_iteration, run_context_from, staged_env_roots, }; -use crate::core::DispatchMechanism; use crate::sandbox; use crate::workspace; @@ -91,15 +90,12 @@ pub(crate) fn run_promote_baseline(args: PromoteBaselineArgs) -> anyhow::Result< /// any iteration with uncommitted results. pub(crate) fn run_teardown(args: CommonArgs) -> anyhow::Result<()> { let ctx = run_context_from(&args)?; - // Disarm the guard at the invocation cwd — the in-session flow runs teardown from - // inside `env/`. Under Cli there is one env per (group, condition) and the human - // runs teardown from the iteration dir, so additionally walk each per-env marker - // (before `cleanup_workspace` reclaims the tree). Best-effort: a missing iteration - // just skips the walk; `teardown_guard` is a no-op without a marker. + // Disarm the guard at the invocation cwd, then walk each per-(group, condition) + // env marker (the human runs teardown from the iteration dir) before + // `cleanup_workspace` reclaims the tree. Best-effort: a missing iteration just + // skips the walk; `teardown_guard` is a no-op without a marker. let mut torn = sandbox::teardown_guard(&std::env::current_dir()?); - if ctx.run_mode.mechanism() == DispatchMechanism::Cli - && let Ok(dir) = iteration_dir(&ctx, args.iteration) - { + if let Ok(dir) = iteration_dir(&ctx, args.iteration) { for env in staged_env_roots(&dir) { torn |= sandbox::teardown_guard(&env); } diff --git a/src/cli/help.rs b/src/cli/help.rs index c97afab..937c14f 100644 --- a/src/cli/help.rs +++ b/src/cli/help.rs @@ -13,16 +13,14 @@ EXAMPLES: # Mode A — evaluate a new skill (with vs. 
without) eval-magic run --guard - # run builds the isolated env/ + RUNBOOK.md, then prints a handoff: - # cd into env/, start a fresh session, say \"Read and follow RUNBOOK.md\". - # The fresh session walks the whole loop below from inside env/: - # …dispatch each task in dispatch.json as a fresh subagent… - # eval-magic ingest # auto-resolves --subagents-dir from CLAUDE_CODE_SESSION_ID - # # (override: --session-id or --subagents-dir ) + # run builds per-(group, condition) envs + RUNBOOK.md (a human-followed recipe). + # Follow it to dispatch each task in dispatch.json via `claude -p`, capturing each + # task's outputs/claude-events.jsonl, then: + # eval-magic ingest # reads each task's outputs/claude-events.jsonl # …dispatch each judge task ingest listed… # eval-magic finalize # eval-magic teardown - eval-magic promote-baseline # optional, from the prep session once benchmark.json lands + eval-magic promote-baseline # optional, once benchmark.json lands # Mode B — evaluate a language change (edit-first) eval-magic snapshot --ref HEAD diff --git a/src/cli/mod.rs b/src/cli/mod.rs index a83aca7..fb7b44d 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -11,8 +11,7 @@ use std::path::{Path, PathBuf}; use anyhow::{anyhow, bail}; use clap::Parser; -use crate::adapters::{config_dir_from_env, resolve_subagents_dir_for_session}; -use crate::core::{DetectInput, DispatchMechanism, RunContext, detect_run_context}; +use crate::core::{DetectInput, RunContext, detect_run_context}; mod args; mod commands; @@ -39,8 +38,6 @@ fn dispatch(command: Option) -> anyhow::Result<()> { harness: None, run_mode: None, workspace_dir: None, - subagents_dir: None, - session_id: None, only: None, skip: None, overwrite: false, @@ -62,8 +59,6 @@ fn dispatch(command: Option) -> anyhow::Result<()> { Commands::Run(args) => run_run(args), Commands::Ingest(args) => run_ingest(args), Commands::Finalize(args) => run_finalize(args), - Commands::SwitchCondition(args) => run_switch_condition(args), - 
Commands::ResetBatch(args) => run_reset_batch(args), Commands::Init(args) => run_init(args), Commands::Validate(args) => run_validate(args), Commands::TeardownGuard(_) => run_teardown_guard(), @@ -118,10 +113,10 @@ pub(crate) fn parse_id_list(v: Option<&str>) -> Option> { /// Always names `--skill-dir`, `--skill`, and `--workspace-dir` (all three are /// always populated in [`RunContext`] and always re-resolve), so the printed /// "Next:" commands are copy-pasteable from any cwd — not just the one `run` -/// happened to start in. The absolute `--workspace-dir` is what lets the isolated -/// session run `ingest`/`finalize`/`switch-condition` from `cwd = iteration-N/env/`: -/// without it, `workspace_root` would default to `/.eval-magic` -/// (`detect_run_context`) and the iteration tree above the env would not resolve. +/// happened to start in. The absolute `--workspace-dir` is what lets the human +/// run `ingest`/`finalize` from a per-`(group, condition)` env dir: without it, +/// `workspace_root` would default to `/.eval-magic` (`detect_run_context`) +/// and the iteration tree above the env would not resolve. pub(crate) fn command_target_args(ctx: &RunContext) -> String { format!( " --skill-dir {} --skill {} --workspace-dir {} --run-mode {}", @@ -178,13 +173,12 @@ pub(crate) fn iteration_dir(ctx: &RunContext, iteration: Option) -> anyhow: Ok(dir) } -/// The env directories a run staged under `iteration_dir`: the single `env/` for -/// the InSession mechanism, or one `env--/` per `(group, condition)` -/// for Cli. A best-effort directory scan (returns empty when the dir can't be read), -/// used by `teardown`/`finalize` to walk every env's write guard. Preferred over -/// reading `dispatch.json` because it has no parse-failure mode, needs no path -/// re-basing (recorded env dirs can be relative), and the only `env`/`env-*` children -/// of an iteration dir are the staged envs. 
+/// The env directories a run staged under `iteration_dir`: one +/// `env--/` per `(group, condition)`. A best-effort directory +/// scan (returns empty when the dir can't be read), used by `teardown`/`finalize` +/// to walk every env's write guard. Preferred over reading `dispatch.json` because +/// it has no parse-failure mode, needs no path re-basing (recorded env dirs can be +/// relative), and the only `env-*` children of an iteration dir are the staged envs. pub(crate) fn staged_env_roots(iteration_dir: &Path) -> Vec { let Ok(entries) = std::fs::read_dir(iteration_dir) else { return Vec::new(); @@ -201,57 +195,6 @@ pub(crate) fn staged_env_roots(iteration_dir: &Path) -> Vec { .collect() } -/// Resolve the subagents transcript dir for an in-session stage that reads -/// transcripts. The subagents dir is the `InSession` transcript source, so this -/// is keyed on the dispatch *mechanism*, not the harness: `Cli`-mechanism runs -/// (Codex; Claude Code hybrid/headless) read each task's `outputs/.jsonl` -/// and resolve to `None` — they must never bail on a missing -/// `CLAUDE_CODE_SESSION_ID`. For the `InSession` mechanism (Claude Code -/// interactive), precedence is: an explicit `--subagents-dir` (validated to -/// exist) wins; otherwise resolve from a session id — the `--session-id` flag if -/// given, else the `CLAUDE_CODE_SESSION_ID` env var Claude Code sets in the -/// orchestrating agent's shell — locating -/// `/projects///subagents/` (scanning `projects/*` -/// if the cwd slug differs). 
-pub(crate) fn resolve_subagents_dir( - mechanism: DispatchMechanism, - subagents_dir: Option<&str>, - session_id: Option<&str>, -) -> anyhow::Result> { - if mechanism != DispatchMechanism::InSession { - return Ok(None); - } - if let Some(dir) = subagents_dir { - let path = PathBuf::from(dir); - if !path.exists() { - bail!("subagents-dir not found: {}", path.display()); - } - return Ok(Some(path)); - } - let session = session_id - .map(str::to_string) - .or_else(|| std::env::var("CLAUDE_CODE_SESSION_ID").ok()) - .filter(|s| !s.trim().is_empty()); - let Some(session) = session else { - bail!( - "could not auto-resolve the subagents dir: CLAUDE_CODE_SESSION_ID is not set. \ - Re-run inside the Claude Code session that dispatched the subagents, or pass \ - --session-id or --subagents-dir ." - ); - }; - let config_dir = config_dir_from_env(); - let cwd = std::env::current_dir()?; - match resolve_subagents_dir_for_session(&config_dir, &cwd, &session) { - Some(path) => Ok(Some(path)), - None => bail!( - "no subagents dir found for session {session} under {}/projects/. The session may \ - not have dispatched any subagents (or lives under a different CLAUDE_CONFIG_DIR). \ - Pass --subagents-dir to override.", - config_dir.display() - ), - } -} - #[cfg(test)] mod tests { use super::*; @@ -312,11 +255,10 @@ mod tests { assert_eq!(resolved.skill_subdir, ctx.skill_subdir); } - /// The isolated session runs `ingest`/`finalize`/`switch-condition` from - /// `cwd = iteration-N/env/`. Without an explicit workspace root those commands - /// default `workspace_root` to `/.eval-magic` and bail "not found", - /// so the selector must carry an absolute `--workspace-dir` pointing at the - /// real workspace above the env. + /// The human runs `ingest`/`finalize` from a per-`(group, condition)` env dir. 
+ /// Without an explicit workspace root those commands default `workspace_root` + /// to `/.eval-magic` and bail "not found", so the selector must carry an + /// absolute `--workspace-dir` pointing at the real workspace above the env. #[test] fn target_args_carry_absolute_workspace_dir() { let tmp = TempDir::new().unwrap(); @@ -359,55 +301,4 @@ mod tests { .unwrap(); assert_eq!(resolved.workspace_root, ctx.workspace_root); } - - #[test] - fn resolve_subagents_dir_is_none_for_cli_mechanism() { - // The subagents dir is the InSession transcript source. Cli-mechanism - // runs (Codex; Claude Code hybrid/headless) read each task's events file, - // so resolution is a no-op — and must NOT bail on a missing - // CLAUDE_CODE_SESSION_ID. This is the regression: the old harness-keyed - // gate forced session resolution for Claude Code and aborted under - // hybrid/headless. The Cli arm returns before reading any env var, so this - // is deterministic regardless of the test runner's environment. - assert_eq!( - resolve_subagents_dir(DispatchMechanism::Cli, None, None).unwrap(), - None - ); - // A passed --subagents-dir is ignored in Cli mode (the events file is the - // source), so it resolves to None without touching the filesystem. - assert_eq!( - resolve_subagents_dir(DispatchMechanism::Cli, Some("/whatever"), None).unwrap(), - None - ); - } - - #[test] - fn resolve_subagents_dir_uses_existing_explicit_dir() { - // InSession (Claude Code interactive): an explicit, existing - // --subagents-dir wins over any session-id resolution. 
- let tmp = TempDir::new().unwrap(); - let resolved = resolve_subagents_dir( - DispatchMechanism::InSession, - Some(&tmp.path().display().to_string()), - None, - ) - .unwrap(); - assert_eq!(resolved, Some(tmp.path().to_path_buf())); - } - - #[test] - fn resolve_subagents_dir_errors_when_explicit_dir_missing() { - // InSession with an explicit --subagents-dir that doesn't exist is a hard - // error (not a silent fallback to session-id resolution). - let err = resolve_subagents_dir( - DispatchMechanism::InSession, - Some("/no/such/subagents/dir/xyz"), - None, - ) - .unwrap_err(); - assert!( - err.to_string().contains("subagents-dir not found"), - "got: {err}" - ); - } } diff --git a/src/cli/run/dispatch.rs b/src/cli/run/dispatch.rs index e610166..131b0dd 100644 --- a/src/cli/run/dispatch.rs +++ b/src/cli/run/dispatch.rs @@ -14,7 +14,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use crate::adapters::{CliManifestContext, adapter_for}; -use crate::core::{AvailableSkill, DispatchMechanism, Eval, Harness}; +use crate::core::{AvailableSkill, Eval, Harness}; use super::RunError; @@ -43,9 +43,8 @@ pub struct DispatchTask { /// byte-identical. #[serde(default, skip_serializing_if = "Option::is_none")] pub group: Option, - /// The agent-under-test's cwd for this task (its env dir). Absent in the - /// single-group case, where the Cli recipe's `` placeholder still - /// resolves to `env/`; present (per `(group, condition)`) for multi-group Cli. + /// The agent-under-test's cwd for this task — its per-`(group, condition)` env + /// dir, which the CLI dispatch recipe's `` placeholder resolves to. 
#[serde(default, skip_serializing_if = "Option::is_none")] pub eval_root: Option, #[serde(default, skip_serializing)] @@ -381,7 +380,6 @@ pub use crate::core::Mode; #[derive(Debug, Clone, Copy)] pub struct ManifestContext<'a> { pub harness: Harness, - pub mechanism: DispatchMechanism, pub guard: bool, pub agent_model: Option<&'a str>, } @@ -413,25 +411,19 @@ pub fn build_manifest( String::new(), "## How to use this manifest".to_string(), String::new(), - "In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the subagent with a short \"read this file and follow it\" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.".to_string(), - String::new(), - "**Transcript correlation:** Each task has an `agent_description` field of the form `:[:r]:i-` (the `r` segment appears only in multi-run cells, naming the 1-based run index). When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition, run)` slot without collisions.".to_string(), + "In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the task with a short \"read this file and follow it\" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.".to_string(), String::new(), ]; - // Only a Cli-dispatch run emits a CLI recipe section; an in-session run - // (e.g. interactive Claude Code) gets the generic ingest guidance below. 
- if context.mechanism == DispatchMechanism::Cli - && let Some(lines) = adapter_for(context.harness).cli_manifest_section(CliManifestContext { - guard: context.guard, - agent_model: context.agent_model, - }) - { + if let Some(lines) = adapter_for(context.harness).cli_manifest_section(CliManifestContext { + guard: context.guard, + agent_model: context.agent_model, + }) { header.extend(lines); } header.extend([ - "After all dispatches (Claude Code only):".to_string(), + "After all dispatches:".to_string(), String::new(), - "1. Run `eval-magic ingest` (it auto-resolves the subagents dir from CLAUDE_CODE_SESSION_ID; outside the dispatching session, pass `--session-id ` or `--subagents-dir `) — a fixed-order chain of record-runs (assembles every task's `run.json` from `dispatch.json` + the subagent's own `outputs/final-message.md` + the persisted transcript, and backfills `timing.json` with transcript-derived tokens/duration; never clobbers an existing record), fill-transcripts, detect-stray-writes, and grade. Optional higher-fidelity timing: write `{ \"total_tokens\": , \"duration_ms\": , \"source\": \"completion-event\" }` from the task completion event to `timing.json` right after a dispatch — completion-event numbers always win over the backfill.".to_string(), + "1. Run `eval-magic ingest --harness ` — a fixed-order chain of record-runs (assembles every task's `run.json` from `dispatch.json` + the task's own `outputs/final-message.md` + the events file the harness CLI wrote under `outputs/`, and backfills `timing.json` with transcript-derived tokens/duration; never clobbers an existing record), fill-transcripts, detect-stray-writes, and grade. Optional higher-fidelity timing: write `{ \"total_tokens\": , \"duration_ms\": , \"source\": \"completion-event\" }` from the task completion event to `timing.json` right after a dispatch — completion-event numbers always win over the backfill.".to_string(), "2. 
Dispatch the judge tasks ingest lists, then run `eval-magic finalize` for the benchmark.".to_string(), String::new(), "On a harness without persisted transcripts, instead write each task's `run.json` (matching `skills/evaluating-skills/schema/run-record.schema.json`, enforced at runtime by grade/fill-transcripts/detect-stray-writes) and `timing.json` by hand when its subagent returns: carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]`; capture `total_tokens`/`duration_ms` from the task completion event immediately — they may not be persisted anywhere else.".to_string(), diff --git a/src/cli/run/grouping.rs b/src/cli/run/grouping.rs index 105778a..6582a2e 100644 --- a/src/cli/run/grouping.rs +++ b/src/cli/run/grouping.rs @@ -5,9 +5,8 @@ //! different content at the same path were a hard error. Grouping turns that into //! a decision: evals whose fixtures conflict (same env-relative dest from a //! *different* source) are routed into separate groups, and an eval may opt into -//! its own singleton group via [`Isolation::Isolated`]. The realization differs by -//! dispatch mechanism (one env + reset barrier for in-session; one env per -//! `(group, condition)` for CLI), but the grouping decision here is shared. +//! its own singleton group via [`Isolation::Isolated`]. Each group is realized as +//! one env per `(group, condition)`, but the grouping decision here is shared. //! //! The conflict rule is identical to the per-env fixture-claim rule in //! 
[`super::fixtures`]: same dest + same source is an idempotent share (evals may diff --git a/src/cli/run/orchestrate/build.rs b/src/cli/run/orchestrate/build.rs index 37d1251..48c5a90 100644 --- a/src/cli/run/orchestrate/build.rs +++ b/src/cli/run/orchestrate/build.rs @@ -12,9 +12,7 @@ use serde_json::{Value, json}; use crate::adapters::{ adapter_for, config_dir_from_env, detect_plugin_shadows, format_shadow_banner, }; -use crate::core::{ - AvailableSkill, ConditionEntry, ConditionsRecord, DispatchMechanism, Harness, RunContext, -}; +use crate::core::{AvailableSkill, ConditionEntry, ConditionsRecord, Harness, RunContext}; use crate::pipeline::io::now_iso8601; use super::super::dispatch::{ @@ -125,16 +123,13 @@ pub(super) fn write_dispatch( fixtures_by_eval.insert(ev.id.as_str(), dests); } - let mechanism = ctx.run_mode.mechanism(); - // A single group keeps the pre-grouping task shape (no `group`/`eval_root` - // keys); >1 group, or any Cli run (per-(group, condition) envs), tags tasks. + // A single group keeps the `group` key off each task (>1 group tags them); + // `eval_root` (the per-task cwd) is always set, one env per (group, condition). let multi_group = r.groups.len() > 1; let mut tasks = Vec::new(); - // Build tasks CONDITION-outer, GROUP-inner — so the in-session runbook reads - // tasks[] top to bottom as: dispatch each (condition, group) segment, with a - // `reset-batch` between groups and one `switch-condition` between conditions. - // A single group collapses this to the legacy condition-outer order. + // Build tasks CONDITION-outer, GROUP-inner. A single group collapses this to + // the legacy condition-outer order. 
for (cond_name, cond_skill_path, cond_slug) in [ ( r.cond_a, @@ -148,7 +143,7 @@ pub(super) fn write_dispatch( ), ] { for group in &r.groups { - let env_root = task_env_root(&r.iteration_dir, mechanism, &group.id, cond_name); + let env_root = task_env_root(&r.iteration_dir, &group.id, cond_name); let env_root_str = env_root.to_string_lossy().into_owned(); let staged_path = staged_skill_path_for(&env_root, cond_slug); let available_skills = available_skills_for(&env_root, cond_skill_path, cond_slug); @@ -213,13 +208,9 @@ pub(super) fn write_dispatch( run_index, // Tag the group only when there's more than one (keeps the // single-group task byte-identical). `eval_root` is the - // per-task cwd the Cli recipe `cd`s into; the in-session - // path shares one env, so it stays `None`. + // per-task cwd the CLI recipe `cd`s into. group: multi_group.then_some(group.id.as_str()), - eval_root: match mechanism { - DispatchMechanism::Cli => Some(env_root_str.as_str()), - DispatchMechanism::InSession => None, - }, + eval_root: Some(env_root_str.as_str()), })?); } } @@ -238,7 +229,6 @@ pub(super) fn write_dispatch( &tasks, ManifestContext { harness: ctx.harness, - mechanism: ctx.run_mode.mechanism(), guard: opts.guard, agent_model: opts.agent_model, }, @@ -269,49 +259,42 @@ pub(super) fn write_dispatch( "tasks": tasks, }); // The isolation-batch plan the executing session/human follows: which evals - // share an env, why, and (per condition) the env each batch runs in. Omitted in - // the trivial single-group in-session case so its dispatch.json stays - // byte-identical; emitted whenever the layout is non-trivial (>1 group, or any - // Cli run with per-(group, condition) envs). 
- if multi_group || mechanism == DispatchMechanism::Cli { - let groups: Vec = r - .groups - .iter() - .map(|g| { - let envs: Vec = [r.cond_a, r.cond_b] - .iter() - .map(|cond| { - json!({ - "condition": cond, - "dir": task_env_root(&r.iteration_dir, mechanism, &g.id, cond) - .to_string_lossy(), - }) + // share an env, why, and (per condition) the env each batch runs in. There is + // one env per (group, condition). + let groups: Vec = r + .groups + .iter() + .map(|g| { + let envs: Vec = [r.cond_a, r.cond_b] + .iter() + .map(|cond| { + json!({ + "condition": cond, + "dir": task_env_root(&r.iteration_dir, &g.id, cond).to_string_lossy(), }) - .collect(); - json!({ - "id": g.id, - "evals": g.eval_ids, - "rationale": g.rationale, - "envs": envs, }) + .collect(); + json!({ + "id": g.id, + "evals": g.eval_ids, + "rationale": g.rationale, + "envs": envs, }) - .collect(); - dispatch_json - .as_object_mut() - .expect("dispatch_json is a JSON object") - .insert("groups".to_string(), Value::Array(groups)); - } + }) + .collect(); + dispatch_json + .as_object_mut() + .expect("dispatch_json is a JSON object") + .insert("groups".to_string(), Value::Array(groups)); write_json(&dispatch_json_path, &dispatch_json)?; - // The followable handoff artifact: a fresh isolated session (interactive) or - // a human (headless) reads RUNBOOK.md to run the loop. It references eval-magic - // meta (dispatch.json, benchmark.json) under `iteration_dir`, so `RunbookContext` - // keeps `iteration_dir`, not the env. Generated, not version controlled. + // The followable handoff artifact: a human reads RUNBOOK.md to run the loop. + // It references eval-magic meta (dispatch.json, benchmark.json) under + // `iteration_dir`, so `RunbookContext` keeps `iteration_dir`, not the env, and + // the human drives from there. Generated, not version controlled. 
let target_args = command_target_args(ctx); - let group_ids: Vec = r.groups.iter().map(|g| g.id.clone()).collect(); let runbook = build_runbook(&RunbookContext { harness: ctx.harness, - run_mode: ctx.run_mode, skill_name: &ctx.skill_name, iteration: r.iteration, iteration_dir: &r.iteration_dir, @@ -319,19 +302,11 @@ pub(super) fn write_dispatch( cond_a: r.cond_a, cond_b: r.cond_b, num_tasks: tasks.len(), - groups: &group_ids, target_args: &target_args, guard: opts.guard, agent_model: opts.agent_model, }); - // In-session: written into the single `env/` (the isolated session's cwd, = - // `ctx.stage_root`). Cli: there is no single env (one per (group, condition)), - // and the human drives from the iteration dir, so it lands there. - let runbook_path = match mechanism { - DispatchMechanism::InSession => ctx.stage_root.join("RUNBOOK.md"), - DispatchMechanism::Cli => r.iteration_dir.join("RUNBOOK.md"), - }; - fs::write(runbook_path, runbook)?; + fs::write(r.iteration_dir.join("RUNBOOK.md"), runbook)?; Ok(tasks.len()) } @@ -343,12 +318,11 @@ pub(super) fn post_build( opts: &RunOptions, r: &Resolved, ) -> Result<(), RunError> { - // Every env this run staged: one shared `env/` for in-session, one per - // (group, condition) for Cli. Computed once and reused below to arm the guard in - // each env and to point the plugin-shadow preflight at a real staged env. + // Every env this run staged: one per (group, condition). Computed once and + // reused below to arm the guard in each env and to point the plugin-shadow + // preflight at a real staged env. let targets = env_targets(&EnvLayoutInput { iteration_dir: &r.iteration_dir, - mechanism: ctx.run_mode.mechanism(), groups: &r.groups, cond_a: r.cond_a, cond_b: r.cond_b, @@ -384,10 +358,9 @@ pub(super) fn post_build( // Plugin-shadow preflight (Claude Code): a staged skill name also discoverable // from an enabled plugin or the global skills dir contaminates the run. 
Scan the - // first staged env, not `ctx.stage_root` — under Cli the legacy single `env/` is - // never created, so the project-local `.claude/settings.json` enabledPlugins the - // scan reads must come from a real staged env. In-session's first target *is* - // `env/` (== `ctx.stage_root`), so this is unchanged there. + // first staged env, not `ctx.stage_root` — only the per-`(group, condition)` + // envs are created, so the project-local `.claude/settings.json` enabledPlugins + // the scan reads must come from a real staged env. if ctx.harness == Harness::ClaudeCode { let mut names: Vec<&str> = vec![ctx.skill_name.as_str()]; names.extend(ctx.sibling_skill_names.iter().map(String::as_str)); diff --git a/src/cli/run/orchestrate/envs.rs b/src/cli/run/orchestrate/envs.rs index c6f469f..424c65b 100644 --- a/src/cli/run/orchestrate/envs.rs +++ b/src/cli/run/orchestrate/envs.rs @@ -1,26 +1,20 @@ //! Env-layout planning: turn the computed isolation [`Group`]s into the concrete -//! environment directories a run stages into, which differs by dispatch mechanism. +//! environment directories a run stages into. //! -//! - **InSession** keeps a single `iteration-N/env/` that hosts *both* conditions -//! (the off-condition skill is removed by `switch-condition`) and the first -//! group's fixtures (later groups are swapped in by `reset-batch`). One env, one -//! session — byte-identical to the pre-grouping layout in the single-group case. -//! - **Cli** materializes one `iteration-N/env--/` per -//! `(group, condition)`: each subprocess `cd`s into its own env, which holds only -//! that condition's skill (or none) and that group's fixtures — real physical -//! isolation along both axes. +//! A run materializes one `iteration-N/env--/` per +//! `(group, condition)`: each subprocess `cd`s into its own env, which holds only +//! that condition's skill (or none) and that group's fixtures — real physical +//! isolation along both axes. 
use std::path::{Path, PathBuf}; -use crate::core::DispatchMechanism; - use super::super::grouping::Group; /// One environment directory to stage for a run. pub(super) struct EnvTarget { pub root: PathBuf, - /// `(condition name, that condition's skill path)` staged into this env. - /// InSession stages both conditions here; Cli stages exactly one. + /// `(condition name, that condition's skill path)` staged into this env — + /// exactly one per env. pub conditions: Vec<(&'static str, Option)>, /// Eval ids whose fixtures populate this env (its group's evals). pub eval_ids: Vec, @@ -29,7 +23,6 @@ pub(super) struct EnvTarget { /// Inputs to [`env_targets`]. pub(super) struct EnvLayoutInput<'a> { pub iteration_dir: &'a Path, - pub mechanism: DispatchMechanism, pub groups: &'a [Group], pub cond_a: &'static str, pub cond_b: &'static str, @@ -37,61 +30,31 @@ pub(super) struct EnvLayoutInput<'a> { pub skill_path_b: Option<&'a str>, } -/// The env dir a `(group, condition)` task runs in: the shared `env/` for -/// InSession, or the per-`(group, condition)` env for Cli. -pub(super) fn task_env_root( - iteration_dir: &Path, - mechanism: DispatchMechanism, - group_id: &str, - condition: &str, -) -> PathBuf { - match mechanism { - DispatchMechanism::InSession => iteration_dir.join("env"), - DispatchMechanism::Cli => iteration_dir.join(format!("env-{group_id}-{condition}")), - } +/// The env dir a `(group, condition)` task runs in. +pub(super) fn task_env_root(iteration_dir: &Path, group_id: &str, condition: &str) -> PathBuf { + iteration_dir.join(format!("env-{group_id}-{condition}")) } -/// Plan the environments to stage. InSession returns a single env hosting both -/// conditions and the *first* group's fixtures; Cli returns one env per -/// `(group, condition)`. +/// Plan the environments to stage: one env per `(group, condition)`. 
pub(super) fn env_targets(input: &EnvLayoutInput) -> Vec { let conds: [(&'static str, Option); 2] = [ (input.cond_a, input.skill_path_a.map(str::to_owned)), (input.cond_b, input.skill_path_b.map(str::to_owned)), ]; - match input.mechanism { - DispatchMechanism::InSession => { - // One env, staged for the first group; reset-batch swaps later groups in. - let first = input - .groups - .first() - .expect("at least one group is always computed"); - vec![EnvTarget { - root: task_env_root( - input.iteration_dir, - input.mechanism, - &first.id, - input.cond_a, - ), - conditions: conds.to_vec(), - eval_ids: first.eval_ids.clone(), - }] - } - DispatchMechanism::Cli => input - .groups - .iter() - .flat_map(|g| { - conds - .clone() - .into_iter() - .map(move |(cond, skill)| EnvTarget { - root: task_env_root(input.iteration_dir, input.mechanism, &g.id, cond), - conditions: vec![(cond, skill)], - eval_ids: g.eval_ids.clone(), - }) - }) - .collect(), - } + input + .groups + .iter() + .flat_map(|g| { + conds + .clone() + .into_iter() + .map(move |(cond, skill)| EnvTarget { + root: task_env_root(input.iteration_dir, &g.id, cond), + conditions: vec![(cond, skill)], + eval_ids: g.eval_ids.clone(), + }) + }) + .collect() } #[cfg(test)] @@ -114,39 +77,11 @@ mod tests { } #[test] - fn insession_single_env_hosts_both_conditions_and_first_group() { - let iter = Path::new("/w/iteration-1"); - let gs = groups(); - let targets = env_targets(&EnvLayoutInput { - iteration_dir: iter, - mechanism: DispatchMechanism::InSession, - groups: &gs, - cond_a: "with_skill", - cond_b: "without_skill", - skill_path_a: Some("/s/SKILL.md"), - skill_path_b: None, - }); - assert_eq!(targets.len(), 1); - assert_eq!(targets[0].root, Path::new("/w/iteration-1/env")); - assert_eq!( - targets[0] - .conditions - .iter() - .map(|(c, _)| *c) - .collect::>(), - vec!["with_skill", "without_skill"] - ); - // Only the first group's fixtures populate the env up front. 
- assert_eq!(targets[0].eval_ids, vec!["e1"]); - } - - #[test] - fn cli_one_env_per_group_condition_with_only_that_conditions_skill() { + fn one_env_per_group_condition_with_only_that_conditions_skill() { let iter = Path::new("/w/iteration-1"); let gs = groups(); let targets = env_targets(&EnvLayoutInput { iteration_dir: iter, - mechanism: DispatchMechanism::Cli, groups: &gs, cond_a: "with_skill", cond_b: "without_skill", @@ -181,14 +116,10 @@ mod tests { } #[test] - fn task_env_root_is_bare_env_for_insession_and_suffixed_for_cli() { + fn task_env_root_is_suffixed_by_group_and_condition() { let iter = Path::new("/w/iteration-1"); assert_eq!( - task_env_root(iter, DispatchMechanism::InSession, "g2", "without_skill"), - Path::new("/w/iteration-1/env") - ); - assert_eq!( - task_env_root(iter, DispatchMechanism::Cli, "g2", "without_skill"), + task_env_root(iter, "g2", "without_skill"), Path::new("/w/iteration-1/env-g2-without_skill") ); } diff --git a/src/cli/run/orchestrate/mod.rs b/src/cli/run/orchestrate/mod.rs index 6130f14..53c15f1 100644 --- a/src/cli/run/orchestrate/mod.rs +++ b/src/cli/run/orchestrate/mod.rs @@ -15,10 +15,10 @@ use std::path::PathBuf; use crate::adapters::{CliDispatchContext, adapter_for}; use crate::cli::command_target_args; -use crate::core::{DispatchMechanism, Eval, Mode, RunContext}; +use crate::core::{Eval, Mode, RunContext}; use super::RunError; -use super::util::{insession_isolated_handoff, mode_str}; +use super::util::mode_str; mod build; mod envs; @@ -156,16 +156,10 @@ fn print_next_steps(ctx: &RunContext, opts: &RunOptions, r: &Resolved, num_tasks r.iteration_dir.join("dispatch.json").display() ); - match ctx.run_mode.mechanism() { - DispatchMechanism::InSession => println!( - "Runbook: {} — start a fresh session in env/ and \"Read and follow RUNBOOK.md\".", - ctx.stage_root.join("RUNBOOK.md").display() - ), - DispatchMechanism::Cli => println!( - "Runbook: {} — a human-followed copy of the steps below.", - 
r.iteration_dir.join("RUNBOOK.md").display() - ), - } + println!( + "Runbook: {} — a human-followed copy of the steps below.", + r.iteration_dir.join("RUNBOOK.md").display() + ); let run_counts: Vec = r .selected_evals .iter() @@ -197,23 +191,14 @@ fn print_next_steps(ctx: &RunContext, opts: &RunOptions, r: &Resolved, num_tasks return; } let target_args = command_target_args(ctx); - match ctx.run_mode.mechanism() { - // In-session subagent dispatch (Claude Code's Task tool today). The env is - // built before the isolated session starts, so the summary just hands off: - // cd into env/, start a fresh session, "Read and follow RUNBOOK.md" — which - // carries the full dispatch → switch-condition → ingest → finalize loop. - DispatchMechanism::InSession => { - println!("\nNext: {}", insession_isolated_handoff(&ctx.stage_root)) - } - // One-shot CLI dispatch; the exact command is harness-specific. - DispatchMechanism::Cli => println!( - "{}", - adapter_for(ctx.harness).cli_next_steps(CliDispatchContext { - guard: opts.guard, - target_args: &target_args, - iteration, - agent_model: opts.agent_model, - }) - ), - } + // One-shot CLI dispatch; the exact command is harness-specific. + println!( + "{}", + adapter_for(ctx.harness).cli_next_steps(CliDispatchContext { + guard: opts.guard, + target_args: &target_args, + iteration, + agent_model: opts.agent_model, + }) + ); } diff --git a/src/cli/run/orchestrate/stage.rs b/src/cli/run/orchestrate/stage.rs index ce5014d..d6848e7 100644 --- a/src/cli/run/orchestrate/stage.rs +++ b/src/cli/run/orchestrate/stage.rs @@ -70,12 +70,10 @@ pub(super) fn stage_conditions( )); } - // The environments to stage: one shared `env/` for in-session (hosting both - // conditions + the first group's fixtures), or one per (group, condition) for - // Cli (each with only its condition's skill + its group's fixtures). + // The environments to stage: one per (group, condition), each with only its + // condition's skill + its group's fixtures. 
let targets = env_targets(&EnvLayoutInput { iteration_dir: &r.iteration_dir, - mechanism: ctx.run_mode.mechanism(), groups: &r.groups, cond_a: r.cond_a, cond_b: r.cond_b, @@ -89,7 +87,7 @@ pub(super) fn stage_conditions( for target in &targets { // Disarm a prior run's guard before re-staging, so a crashed run can't leave // the write-blocking hook armed across runs. Created unconditionally — even - // under --no-stage, fixtures (and the in-session RUNBOOK) still land here. + // under --no-stage, each env's fixtures still land here. teardown_guard(&target.root); fs::create_dir_all(&target.root)?; diff --git a/src/cli/run/runbook.rs b/src/cli/run/runbook.rs index ac0b921..d0265c6 100644 --- a/src/cli/run/runbook.rs +++ b/src/cli/run/runbook.rs @@ -2,31 +2,22 @@ //! iteration directory during `run`. //! //! The runbook turns the prep session's "what to do next" guidance into a file -//! a *fresh, isolated* session (or a human at a terminal) can read end-to-end: -//! "Read and follow RUNBOOK.md". Which template is used is keyed on the run mode's -//! [`DispatchMechanism`](crate::core::DispatchMechanism), not the harness: +//! a human at a terminal can read end-to-end: "Read and follow RUNBOOK.md". Every +//! run uses the shared [`HEADLESS_RUNBOOK_TEMPLATE`], whose harness-specific +//! dispatch + judge recipes come from the adapter's CLI generators. //! -//! - `InSession` (interactive) → the harness's interactive, agent-followed template. -//! - `Cli` (hybrid / headless) → the shared headless, human-followed template — -//! including Claude Code under `--run-mode hybrid`. -//! -//! The per-mode prose skeletons live in `profiles/` (checked in, loaded via -//! [`HarnessAdapter::runbook_template`](crate::adapters::HarnessAdapter::runbook_template)) -//! and carry `{{TOKEN}}` placeholders the renderer fills with run-specific values. -//! The generated `RUNBOOK.md` itself is a workspace artifact and is not version -//! controlled. +//! 
The prose skeleton lives in `profiles/` (checked in) and carries `{{TOKEN}}` +//! placeholders the renderer fills with run-specific values. The generated +//! `RUNBOOK.md` itself is a workspace artifact and is not version controlled. use std::path::Path; use crate::adapters::{ CliDispatchContext, CliJudgeContext, HEADLESS_RUNBOOK_TEMPLATE, adapter_for, }; -use crate::core::{DispatchMechanism, Harness, Mode, RunMode}; +use crate::core::{Harness, Mode}; -use super::util::{ - harness_label, insession_dispatch_batch, insession_dispatch_segment, insession_ingest_command, - insession_reset_batch_command, insession_switch_command, mode_str, -}; +use super::util::{harness_label, mode_str}; /// Run-specific values the renderer substitutes into a runbook template. Built by /// the orchestrator from the resolved run; kept as primitives so the renderer is @@ -34,7 +25,6 @@ use super::util::{ /// unit-testable on its own. pub(crate) struct RunbookContext<'a> { pub harness: Harness, - pub run_mode: RunMode, pub skill_name: &'a str, pub iteration: u32, pub iteration_dir: &'a Path, @@ -42,9 +32,6 @@ pub(crate) struct RunbookContext<'a> { pub cond_a: &'a str, pub cond_b: &'a str, pub num_tasks: usize, - /// Isolation-group ids in order. One entry → the byte-identical single-batch - /// dispatch; more → per-group batches with `reset-batch` barriers (in-session). - pub groups: &'a [String], /// The self-sufficient `--skill-dir … --skill …` selector (leading space), /// from [`command_target_args`](crate::cli::command_target_args). pub target_args: &'a str, @@ -52,50 +39,14 @@ pub(crate) struct RunbookContext<'a> { pub agent_model: Option<&'a str>, } -/// The per-condition dispatch block for the interactive runbook. A single group -/// renders the legacy single-batch instruction (byte-identical to the pre-grouping -/// runbook). 
Multiple groups render each group's batch with a `reset-batch` barrier -/// between them; `first_condition` suppresses the reset before the very first group -/// (condition A starts from the env already staged with group 1, while condition B -/// must restore group 1 after A's last group mutated the env). -fn insession_dispatch_block( - condition: &str, - groups: &[String], - target_args: &str, - iteration: u32, - first_condition: bool, -) -> String { - if groups.len() <= 1 { - return insession_dispatch_batch(condition); - } - let mut parts: Vec = Vec::new(); - for (i, group) in groups.iter().enumerate() { - if !(first_condition && i == 0) { - parts.push(format!( - "Reset the env to group `{group}` (wait for the previous batch to finish first):\n\n```\n{}\n```", - insession_reset_batch_command(target_args, iteration, group) - )); - } - parts.push(format!( - "Dispatch group `{group}`: {}", - insession_dispatch_segment(condition, group) - )); - } - parts.join("\n\n") -} - -/// Render `RUNBOOK.md` for a run: pick the harness's template (interactive vs. -/// headless) and fill its `{{TOKEN}}` placeholders with run-specific values. +/// Render `RUNBOOK.md` for a run: fill the shared headless template's +/// `{{TOKEN}}` placeholders with run-specific values. The harness-specific +/// dispatch + judge recipes come from the adapter's CLI generators, so the +/// runbook stays in lockstep with `dispatch-manifest.md` and the printed next +/// steps; pipeline commands carry `--harness`. pub(crate) fn build_runbook(ctx: &RunbookContext) -> String { let adapter = adapter_for(ctx.harness); - // The runbook template is mechanism-keyed, not harness-keyed: an in-session - // run uses the harness's interactive (agent-followed) template; every Cli run - // uses the shared headless (human-followed) one — including Claude Code in - // hybrid, whose `runbook_template()` is the interactive variant. 
- let template = match ctx.run_mode.mechanism() { - DispatchMechanism::InSession => adapter.runbook_template(), - DispatchMechanism::Cli => HEADLESS_RUNBOOK_TEMPLATE, - }; + let template = HEADLESS_RUNBOOK_TEMPLATE; let iteration = ctx.iteration.to_string(); let num_tasks = ctx.num_tasks.to_string(); @@ -122,77 +73,37 @@ pub(crate) fn build_runbook(ctx: &RunbookContext) -> String { ("BENCHMARK_PATH", &benchmark_path), ]; - // Mechanism-specific tokens. Owners outlive the `render` call below. - let (dispatch_cond_a, dispatch_cond_b, switch_cmd, ingest_cmd); - let (dispatch_recipe, judge_recipe, finalize_cmd, teardown_cmd); - match ctx.run_mode.mechanism() { - // Interactive: an agent dispatches in-session subagents one condition batch - // at a time, runs `switch-condition` between them, then runs the rest of the - // loop itself. Built from the same fragments as the post-`run` "Next:" - // message so the two can never drift on the dispatch / switch / ingest text. - DispatchMechanism::InSession => { - dispatch_cond_a = insession_dispatch_block( - ctx.cond_a, - ctx.groups, - ctx.target_args, - ctx.iteration, - true, - ); - dispatch_cond_b = insession_dispatch_block( - ctx.cond_b, - ctx.groups, - ctx.target_args, - ctx.iteration, - false, - ); - switch_cmd = insession_switch_command(ctx.target_args, ctx.iteration, ctx.cond_b); - ingest_cmd = insession_ingest_command(ctx.target_args, ctx.iteration); - finalize_cmd = format!( - "eval-magic finalize{} --iteration {}", - ctx.target_args, ctx.iteration - ); - teardown_cmd = format!("eval-magic teardown{}", ctx.target_args); - vars.push(("DISPATCH_COND_A", &dispatch_cond_a)); - vars.push(("DISPATCH_COND_B", &dispatch_cond_b)); - vars.push(("SWITCH_CMD", &switch_cmd)); - vars.push(("INGEST_CMD", &ingest_cmd)); - vars.push(("FINALIZE_CMD", &finalize_cmd)); - vars.push(("TEARDOWN_CMD", &teardown_cmd)); - } - // Headless: a human pastes commands. 
The harness-specific dispatch + - // judge recipes come from the adapter's existing CLI generators, so the - // runbook stays in lockstep with `dispatch-manifest.md` and the printed - // next steps; pipeline commands carry `--harness`. - DispatchMechanism::Cli => { - let label = harness_label(ctx.harness); - dispatch_recipe = adapter.cli_next_steps(CliDispatchContext { - guard: ctx.guard, - target_args: ctx.target_args, - iteration: ctx.iteration, - agent_model: ctx.agent_model, - }); - judge_recipe = adapter - .cli_judge_next_steps(CliJudgeContext { - guard: ctx.guard, - iteration_dir: ctx.iteration_dir, - }) - .unwrap_or_else(|| { - "Dispatch each judge task `ingest` listed through the same harness CLI, \ - capturing its transcript output, then finalize." - .to_string() - }); - finalize_cmd = format!( - "eval-magic finalize{} --iteration {} --harness {label}", - ctx.target_args, ctx.iteration - ); - teardown_cmd = format!("eval-magic teardown{} --harness {label}", ctx.target_args); - vars.push(("HARNESS", label)); - vars.push(("DISPATCH_RECIPE", &dispatch_recipe)); - vars.push(("JUDGE_RECIPE", &judge_recipe)); - vars.push(("FINALIZE_CMD", &finalize_cmd)); - vars.push(("TEARDOWN_CMD", &teardown_cmd)); - } - } + // A human pastes commands. The harness-specific dispatch + judge recipes come + // from the adapter's CLI generators, so the runbook stays in lockstep with + // `dispatch-manifest.md` and the printed next steps; pipeline commands carry + // `--harness`. Owners outlive the `render` call below. 
+ let label = harness_label(ctx.harness); + let dispatch_recipe = adapter.cli_next_steps(CliDispatchContext { + guard: ctx.guard, + target_args: ctx.target_args, + iteration: ctx.iteration, + agent_model: ctx.agent_model, + }); + let judge_recipe = adapter + .cli_judge_next_steps(CliJudgeContext { + guard: ctx.guard, + iteration_dir: ctx.iteration_dir, + }) + .unwrap_or_else(|| { + "Dispatch each judge task `ingest` listed through the same harness CLI, \ + capturing its transcript output, then finalize." + .to_string() + }); + let finalize_cmd = format!( + "eval-magic finalize{} --iteration {} --harness {label}", + ctx.target_args, ctx.iteration + ); + let teardown_cmd = format!("eval-magic teardown{} --harness {label}", ctx.target_args); + vars.push(("HARNESS", label)); + vars.push(("DISPATCH_RECIPE", &dispatch_recipe)); + vars.push(("JUDGE_RECIPE", &judge_recipe)); + vars.push(("FINALIZE_CMD", &finalize_cmd)); + vars.push(("TEARDOWN_CMD", &teardown_cmd)); render(template, &vars) } @@ -238,137 +149,11 @@ mod tests { use super::*; use std::path::PathBuf; - fn claude_ctx(dir: &Path) -> RunbookContext<'_> { - RunbookContext { - harness: Harness::ClaudeCode, - run_mode: RunMode::Interactive, - skill_name: "widget-skill", - iteration: 5, - iteration_dir: dir, - mode: Mode::NewSkill, - cond_a: "with_skill", - cond_b: "without_skill", - num_tasks: 4, - groups: &[], - target_args: " --skill-dir /tmp/skills --skill widget-skill", - guard: true, - agent_model: None, - } - } - - #[test] - fn interactive_runbook_carries_run_specifics_and_full_loop() { - let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-5"); - let book = build_runbook(&claude_ctx(&dir)); - - // Run-specific identity. 
- assert!(book.contains("widget-skill"), "names the skill: {book}"); - assert!(book.contains("iteration 5"), "names the iteration: {book}"); - assert!( - book.contains("with_skill") && book.contains("without_skill"), - "names both conditions: {book}" - ); - assert!(book.contains("new-skill"), "names the mode: {book}"); - - // The dispatch step reuses the in-session guidance (agent_description is - // the transcript-linking key). - assert!( - book.contains("agent_description"), - "carries the dispatch-loop guidance: {book}" - ); - - // The per-condition batch loop: each condition dispatched as its own batch, - // with a `switch-condition` barrier (naming the kept condition) between them. - assert!( - book.contains("`condition` is `with_skill`") - && book.contains("`condition` is `without_skill`"), - "dispatches each condition as its own batch: {book}" - ); - assert!( - book.contains( - "eval-magic switch-condition --skill-dir /tmp/skills --skill widget-skill --iteration 5 --condition without_skill" - ), - "carries the switch-condition barrier command: {book}" - ); - - // The full single-session loop: ingest → finalize → teardown, each a - // copy-pasteable command threaded with the target selector + iteration. - assert!( - book.contains( - "eval-magic ingest --skill-dir /tmp/skills --skill widget-skill --iteration 5" - ), - "carries the ingest command: {book}" - ); - assert!( - book.contains( - "eval-magic finalize --skill-dir /tmp/skills --skill widget-skill --iteration 5" - ), - "carries the finalize command: {book}" - ); - assert!( - book.contains("eval-magic teardown --skill-dir /tmp/skills --skill widget-skill"), - "carries the teardown command: {book}" - ); - assert!( - book.contains("benchmark.json"), - "points at the result: {book}" - ); - - // No interactive run is dispatched through a harness CLI — that is the - // headless path. 
- assert!( - !book.contains("codex exec"), - "interactive runbook is not a CLI-dispatch recipe: {book}" - ); - // Every template token must be filled. - assert!( - !book.contains("{{"), - "no unsubstituted tokens remain: {book}" - ); - } - - #[test] - fn interactive_runbook_with_multiple_groups_carries_reset_batch_barriers() { - let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-5"); - let groups = ["g1".to_string(), "g2".to_string()]; - let book = build_runbook(&RunbookContext { - groups: &groups, - ..claude_ctx(&dir) - }); - - // Each group dispatches as its own segment, filtered by group. - assert!( - book.contains("`condition` is `with_skill` and `group` is `g1`") - && book.contains("`condition` is `with_skill` and `group` is `g2`"), - "with_skill dispatches each group separately: {book}" - ); - assert!( - book.contains("`condition` is `without_skill` and `group` is `g1`") - && book.contains("`condition` is `without_skill` and `group` is `g2`"), - "without_skill dispatches each group separately: {book}" - ); - // reset-batch barriers between groups, naming the group to seed. - assert!( - book.contains( - "eval-magic reset-batch --skill-dir /tmp/skills --skill widget-skill --iteration 5 --group g2" - ), - "carries the reset-batch barrier for g2: {book}" - ); - // The switch-condition barrier is still present, once, between conditions. 
- assert!( - book.contains("eval-magic switch-condition") - && book.contains("--condition without_skill"), - "still carries the switch-condition barrier: {book}" - ); - assert!(!book.contains("{{"), "no unsubstituted tokens: {book}"); - } - #[test] fn headless_runbook_is_human_followed_cli_recipe() { let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-2"); let ctx = RunbookContext { harness: Harness::Codex, - run_mode: RunMode::Hybrid, skill_name: "widget-skill", iteration: 2, iteration_dir: &dir, @@ -376,7 +161,6 @@ mod tests { cond_a: "old_skill", cond_b: "new_skill", num_tasks: 6, - groups: &[], target_args: " --skill-dir /tmp/skills --skill widget-skill", guard: false, agent_model: Some("gpt-5-mini"), @@ -391,8 +175,7 @@ mod tests { "names both conditions: {book}" ); - // Human-followed framing (the shared headless template), not the agent - // in-session framing. + // Human-followed framing (the shared headless template). assert!( book.contains("human driving"), "frames the run for a human at a terminal: {book}" diff --git a/src/cli/run/steps.rs b/src/cli/run/steps.rs index 532184c..c213262 100644 --- a/src/cli/run/steps.rs +++ b/src/cli/run/steps.rs @@ -13,7 +13,7 @@ //! parameter; the production runner — which maps each [`StepKind`] to its stage //! handler — lives in [`crate::cli`] alongside those handlers. -use crate::core::{DispatchMechanism, Harness, RunMode}; +use crate::core::{Harness, RunMode}; /// Which post-dispatch stage a [`StepCommand`] runs. The production runner /// matches on this to call the corresponding handler; tests assert on it. @@ -37,13 +37,9 @@ pub struct StepCommand { pub skill: Option, pub iteration: u32, pub harness: Harness, - /// The run mode, re-derived at each stage so the transcript source matches - /// the dispatch mechanism. Round-trips through `CommonArgs` exactly like - /// `harness`, so ingest sub-stages don't silently re-default it. + /// The run mode, re-derived at each stage. 
Round-trips through `CommonArgs` + /// exactly like `harness`, so ingest sub-stages don't silently re-default it. pub run_mode: RunMode, - /// Only the transcript-reading stages (record-runs, fill-transcripts) carry a - /// subagents dir; the others leave it `None`. - pub subagents_dir: Option, pub workspace_dir: Option, } @@ -55,7 +51,6 @@ pub struct StepParams<'a> { pub iteration: u32, pub harness: Harness, pub run_mode: RunMode, - pub subagents_dir: Option<&'a str>, pub workspace_dir: Option<&'a str>, } @@ -66,20 +61,14 @@ impl Default for StepParams<'_> { skill: None, iteration: 0, harness: Harness::ClaudeCode, - run_mode: RunMode::Interactive, - subagents_dir: None, + run_mode: RunMode::Hybrid, workspace_dir: None, } } } impl StepParams<'_> { - fn step( - &self, - label: &'static str, - kind: StepKind, - subagents_dir: Option, - ) -> StepCommand { + fn step(&self, label: &'static str, kind: StepKind) -> StepCommand { StepCommand { label, kind, @@ -88,34 +77,28 @@ impl StepParams<'_> { iteration: self.iteration, harness: self.harness, run_mode: self.run_mode, - subagents_dir, workspace_dir: self.workspace_dir.map(str::to_string), } } } /// The ingest chain: record-runs → fill-transcripts → detect-stray-writes → -/// grade. Only the first two carry the subagents dir, and only for the -/// in-session dispatch mechanism (a Cli-dispatch harness reads its transcript -/// from each task's `outputs/` dir instead). +/// grade. Each stage reads its transcript from each task's `outputs/` events +/// file. 
pub fn build_ingest_commands(p: &StepParams) -> Vec { - let transcripts = match p.run_mode.mechanism() { - DispatchMechanism::InSession => p.subagents_dir.map(str::to_string), - DispatchMechanism::Cli => None, - }; vec![ - p.step("record-runs", StepKind::RecordRuns, transcripts.clone()), - p.step("fill-transcripts", StepKind::FillTranscripts, transcripts), - p.step("detect-stray-writes", StepKind::DetectStrayWrites, None), - p.step("grade", StepKind::Grade { finalize: false }, None), + p.step("record-runs", StepKind::RecordRuns), + p.step("fill-transcripts", StepKind::FillTranscripts), + p.step("detect-stray-writes", StepKind::DetectStrayWrites), + p.step("grade", StepKind::Grade { finalize: false }), ] } /// The finalize chain: grade --finalize → aggregate. pub fn build_finalize_commands(p: &StepParams) -> Vec { vec![ - p.step("grade --finalize", StepKind::Grade { finalize: true }, None), - p.step("aggregate", StepKind::Aggregate, None), + p.step("grade --finalize", StepKind::Grade { finalize: true }), + p.step("aggregate", StepKind::Aggregate), ] } @@ -146,7 +129,6 @@ mod tests { skill_dir: Some("/skills"), skill: Some("mr-review"), iteration: 2, - subagents_dir: Some("/subagents"), ..Default::default() } } @@ -178,15 +160,10 @@ mod tests { assert_eq!(s.skill.as_deref(), Some("mr-review")); assert_eq!(s.iteration, 2); } - // The transcript-reading steps get the subagents dir; the others must not. 
- assert_eq!(steps[0].subagents_dir.as_deref(), Some("/subagents")); - assert_eq!(steps[1].subagents_dir.as_deref(), Some("/subagents")); - assert_eq!(steps[2].subagents_dir, None); - assert_eq!(steps[3].subagents_dir, None); } #[test] - fn ingest_omits_subagents_for_codex() { + fn ingest_threads_harness_through_every_step() { let steps = build_ingest_commands(&StepParams { skill_dir: Some("/skills"), skill: Some("mr-review"), @@ -205,40 +182,6 @@ mod tests { ] ); assert!(steps.iter().all(|s| s.harness == Harness::Codex)); - assert_eq!(steps[0].subagents_dir, None); - assert_eq!(steps[1].subagents_dir, None); - } - - #[test] - fn ingest_omits_subagents_for_claude_hybrid() { - // Claude Code in hybrid mode dispatches via the CLI, so it reads each - // task's events file — not a subagents dir — even though the harness is - // ClaudeCode and a subagents dir was passed. - let steps = build_ingest_commands(&StepParams { - skill_dir: Some("/skills"), - skill: Some("mr-review"), - iteration: 2, - harness: Harness::ClaudeCode, - run_mode: RunMode::Hybrid, - subagents_dir: Some("/subagents"), - ..Default::default() - }); - assert!(steps.iter().all(|s| s.harness == Harness::ClaudeCode)); - assert!(steps.iter().all(|s| s.run_mode == RunMode::Hybrid)); - assert_eq!(steps[0].subagents_dir, None); - assert_eq!(steps[1].subagents_dir, None); - } - - #[test] - fn ingest_keeps_subagents_for_claude_interactive() { - // The default (interactive) Claude path still reads the subagents dir. 
- let steps = build_ingest_commands(&StepParams { - iteration: 2, - subagents_dir: Some("/subagents"), - ..Default::default() - }); - assert_eq!(steps[0].subagents_dir.as_deref(), Some("/subagents")); - assert_eq!(steps[1].subagents_dir.as_deref(), Some("/subagents")); } #[test] @@ -265,8 +208,7 @@ mod tests { skill: None, iteration: 0, harness: Harness::ClaudeCode, - run_mode: RunMode::Interactive, - subagents_dir: None, + run_mode: RunMode::Hybrid, workspace_dir: None, } } diff --git a/src/cli/run/util.rs b/src/cli/run/util.rs index 92e27ea..39d74c5 100644 --- a/src/cli/run/util.rs +++ b/src/cli/run/util.rs @@ -56,79 +56,6 @@ pub(crate) fn unguarded_notice(no_stage: bool) -> Option { ) } -/// The shared dispatch-instruction body, parameterized on the `tasks[]` filter so -/// the condition-only and condition+group variants stay in lockstep. -fn insession_dispatch_instruction(filter: &str) -> String { - format!( - "iterate the `tasks[]` entries in dispatch.json whose {filter} and \ - dispatch each as a subagent, passing its `agent_description` verbatim as the subagent \ - description (that string is the key that links each transcript back — without it tool \ - calls, tokens, and duration come back empty)." - ) -} - -/// Dispatch instruction for one condition batch: iterate the matching `tasks[]` -/// and dispatch each as a subagent with its `agent_description` verbatim. A building -/// block of the interactive runbook's per-condition steps ([`super::runbook`]). -pub(crate) fn insession_dispatch_batch(condition: &str) -> String { - insession_dispatch_instruction(&format!("`condition` is `{condition}`")) -} - -/// Dispatch instruction for one `(condition, group)` segment — used when a run has -/// more than one isolation group, so each group's batch dispatches separately with -/// a [`insession_reset_batch_command`] barrier between groups ([`super::runbook`]). 
-pub(crate) fn insession_dispatch_segment(condition: &str, group: &str) -> String { - insession_dispatch_instruction(&format!( - "`condition` is `{condition}` and `group` is `{group}`" - )) -} - -/// The `reset-batch` barrier command between isolation-group batches: wipe the -/// env working tree and re-seed it with `group`'s fixtures before dispatching it. -/// A building block of the interactive runbook ([`super::runbook`]). -pub(crate) fn insession_reset_batch_command( - target_args: &str, - iteration: u32, - group: &str, -) -> String { - format!("eval-magic reset-batch{target_args} --iteration {iteration} --group {group}") -} - -/// The `switch-condition` barrier command between batches: name the condition about -/// to be dispatched (the one to keep). A building block of the interactive runbook -/// ([`super::runbook`]). -pub(crate) fn insession_switch_command(target_args: &str, iteration: u32, keep: &str) -> String { - format!("eval-magic switch-condition{target_args} --iteration {iteration} --condition {keep}") -} - -/// The `ingest` hand-off command + its session-resolution hint. A building block of -/// the interactive runbook ([`super::runbook`]). -pub(crate) fn insession_ingest_command(target_args: &str, iteration: u32) -> String { - format!( - "eval-magic ingest{target_args} --iteration {iteration}\n\ - (ingest auto-resolves the subagents dir from CLAUDE_CODE_SESSION_ID; outside that \ - session, add --session-id or --subagents-dir .)" - ) -} - -/// The post-`run` handoff for the isolated in-session flow: cd into the env, start a -/// *fresh* Claude Code session there, and have it read `RUNBOOK.md` — which carries the -/// full dispatch → switch-condition → ingest → finalize loop. The env (incl. -/// `env/.claude/skills/`) is built before that session starts, so the fresh session is -/// structural, not a watcher workaround; the orchestrator no longer juggles the dispatch -/// loop itself. 
-pub(crate) fn insession_isolated_handoff(env_dir: &Path) -> String { - format!( - "start the isolated run in a fresh session:\n \ - 1. cd {env}\n \ - 2. start a fresh Claude Code session there (`claude`)\n \ - 3. say: Read and follow RUNBOOK.md\n\ - RUNBOOK.md walks the whole loop (dispatch → switch-condition → ingest → finalize) and \ - writes benchmark.json; resume here to read it.", - env = env_dir.display() - ) -} - /// Resolve the shared, harness-agnostic plan-mode procedure profile injected by /// `--plan-mode`. A compile-time bundled asset, mirroring the schema embedding in /// `validation`. @@ -168,10 +95,9 @@ pub(crate) fn validate_harness_run_options( } /// A per-run nonce (`-<6 hex>`) that namespaces dispatch -/// descriptions so transcripts can't collide across iterations sharing one parent -/// session's subagents dir. With no RNG crate, the low bits of the -/// sub-millisecond clock supply the entropy — enough, since the base36 millis -/// prefix already differs between runs. +/// descriptions so they stay unique across iterations of the same skill. With no +/// RNG crate, the low bits of the sub-millisecond clock supply the entropy — +/// enough, since the base36 millis prefix already differs between runs. 
pub(crate) fn make_run_nonce() -> String { let now = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -257,16 +183,6 @@ mod tests { assert!(validate_harness_run_options(&opts, &ctx).is_ok()); } - #[test] - fn claude_interactive_allows_guard() { - let (_t, ctx) = ctx_for(Harness::ClaudeCode, RunMode::Interactive); - let opts = RunOptions { - guard: true, - ..Default::default() - }; - assert!(validate_harness_run_options(&opts, &ctx).is_ok()); - } - #[test] fn unguarded_notice_when_no_stage() { let notice = unguarded_notice(true).unwrap(); @@ -285,30 +201,6 @@ mod tests { assert!(unguarded_notice(false).is_none()); } - #[test] - fn isolated_handoff_points_into_env_and_at_the_runbook() { - let env = Path::new("/work/.eval-magic/widget/iteration-3/env"); - let handoff = insession_isolated_handoff(env); - assert!( - handoff.contains("/work/.eval-magic/widget/iteration-3/env"), - "names the env to cd into: {handoff}" - ); - assert!(handoff.contains("cd "), "spells out the cd step: {handoff}"); - assert!( - handoff.contains("Read and follow RUNBOOK.md"), - "hands off to the runbook in a fresh session: {handoff}" - ); - assert!( - handoff.contains("fresh"), - "names the fresh isolated session: {handoff}" - ); - // The handoff replaces the old printed dispatch loop — it must not re-print it. - assert!( - !handoff.contains("one batch at a time"), - "the dispatch loop lives in RUNBOOK.md now, not the summary: {handoff}" - ); - } - #[test] fn plan_mode_profile_is_shared_and_harness_agnostic() { let profile = resolve_plan_mode_profile(); diff --git a/src/core/mod.rs b/src/core/mod.rs index a0d12b8..a78a0d1 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -2,7 +2,7 @@ //! //! - [`types`] — domain types (`Eval`, `RunRecord`, `Assertion`, `GradingResult`, …) //! - [`context`] — `RunContext` detection from parsed flags / environment -//! - [`run_mode`] — dispatch mechanism (in-session vs. one-shot CLI) +//! 
- [`run_mode`] — run mode (hybrid / headless) and per-harness capabilities //! - [`runtime`] — runtime helpers (git spawning) //! //! The submodules are re-exported flat here so downstream code writes @@ -14,8 +14,6 @@ pub mod runtime; pub mod types; pub use context::{ContextError, DetectInput, Harness, RunContext, detect_run_context}; -pub use run_mode::{ - DispatchMechanism, HarnessRunCapabilities, RunMode, capabilities_for, resolve_run_mode, -}; +pub use run_mode::{HarnessRunCapabilities, RunMode, capabilities_for, resolve_run_mode}; pub use runtime::{GitOutput, run_git}; pub use types::*; diff --git a/src/core/run_mode.rs b/src/core/run_mode.rs index a2457df..f1fe909 100644 --- a/src/core/run_mode.rs +++ b/src/core/run_mode.rs @@ -1,18 +1,12 @@ //! Run mode — *how* an eval is dispatched, independent of *which* harness runs //! it. //! -//! There are two dispatch **mechanisms** in the code today: -//! -//! - [`DispatchMechanism::InSession`] — the runner hands tasks to in-session -//! subagents (Claude Code's Task tool). The reference is Claude Code. -//! - [`DispatchMechanism::Cli`] — each task is dispatched through a one-shot -//! harness CLI subprocess (`codex exec`). The reference is Codex. -//! -//! These two mechanisms underpin the three *user-facing* run modes documented in -//! the README: **fully-interactive** rides on [`InSession`](DispatchMechanism::InSession); -//! **headless** and **hybrid** both ride on [`Cli`](DispatchMechanism::Cli), -//! differing only in whether a human/agent session drives the loop — not in how -//! a single task reaches the harness. +//! Every dispatch now rides a single mechanism: each task is delivered through a +//! one-shot harness CLI subprocess (`claude -p`, `codex exec`). The two +//! *user-facing* run modes documented in the README — **hybrid** and +//! **headless** — share that mechanism and differ only in whether an agent or a +//! human session drives the loop, not in how a single task reaches the harness. +//! 
(The vocabulary collapse that folds these two into one is tracked separately.) //! //! This is distinct from the comparison [`Mode`](crate::core::Mode) //! (`new-skill` / `revision`), which selects the two conditions being compared, @@ -22,29 +16,15 @@ use serde::{Deserialize, Serialize}; use crate::core::Harness; -/// How a single dispatch is delivered to a harness. The primary code axis for -/// run-mode concerns (next-steps guidance, transcript source). -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DispatchMechanism { - /// In-session subagent dispatch (Claude Code's Task tool). - InSession, - /// One-shot harness CLI subprocess dispatch (`codex exec`). - Cli, -} - -/// The user-facing run mode — *who/what drives the loop* plus which dispatch -/// mechanism each task rides on. This is the parity vocabulary documented in the -/// README (§Run modes); it maps down to a [`DispatchMechanism`] via -/// [`RunMode::mechanism`]. `hybrid` and `headless` both ride on -/// [`Cli`](DispatchMechanism::Cli) and differ only in whether a session drives -/// the loop — a distinction we persist (in `conditions.json`) even though it -/// doesn't change how a single task reaches the harness. +/// The user-facing run mode — *who/what drives the loop*. Both modes dispatch +/// each task through the harness CLI; they differ only in whether an agent +/// session (`hybrid`) or a human (`headless`) drives the loop — a distinction we +/// persist (in `conditions.json`) even though it doesn't change how a single task +/// reaches the harness. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, clap::ValueEnum)] #[serde(rename_all = "kebab-case")] #[value(rename_all = "kebab-case")] pub enum RunMode { - /// In-session subagent dispatch (Claude Code's Task tool). - Interactive, /// An agent session orchestrates while each dispatch shells out to the /// harness CLI (`claude -p`, `codex exec`). 
Hybrid, @@ -54,29 +34,17 @@ pub enum RunMode { } impl RunMode { - /// The dispatch mechanism this run mode rides on. - pub fn mechanism(self) -> DispatchMechanism { - match self { - RunMode::Interactive => DispatchMechanism::InSession, - RunMode::Hybrid | RunMode::Headless => DispatchMechanism::Cli, - } - } - - /// The default run mode for a harness when `--run-mode` is omitted, chosen to - /// preserve today's behavior: Claude Code → interactive, the CLI-dispatch - /// harnesses → hybrid. - pub fn default_for(harness: Harness) -> RunMode { - match harness { - Harness::ClaudeCode => RunMode::Interactive, - Harness::Codex | Harness::OpenCode => RunMode::Hybrid, - } + /// The default run mode for a harness when `--run-mode` is omitted. Every + /// harness defaults to `hybrid`: an agent session drives the loop and each + /// dispatch shells out to the harness CLI. + pub fn default_for(_harness: Harness) -> RunMode { + RunMode::Hybrid } /// The kebab-case identifier (matches the `--run-mode` flag values and the /// serialized form in `conditions.json`). pub fn as_str(self) -> &'static str { match self { - RunMode::Interactive => "interactive", RunMode::Hybrid => "hybrid", RunMode::Headless => "headless", } @@ -89,12 +57,9 @@ impl RunMode { pub fn resolve_run_mode(harness: Harness, requested: Option) -> Result { let mode = requested.unwrap_or_else(|| RunMode::default_for(harness)); let supported: &[RunMode] = match harness { - // Claude Code wires every mode: in-session (interactive) plus both CLI - // modes (hybrid and headless ride the same `claude -p` mechanism). - Harness::ClaudeCode => &[RunMode::Interactive, RunMode::Hybrid, RunMode::Headless], - // Codex dispatches via subprocess, so in-session doesn't translate, but - // both CLI modes do (hybrid is agent-driven, headless human-driven). 
- Harness::Codex => &[RunMode::Hybrid, RunMode::Headless], + // Claude Code and Codex both wire the CLI mechanism, so both modes apply + // (hybrid is agent-driven, headless human-driven). + Harness::ClaudeCode | Harness::Codex => &[RunMode::Hybrid, RunMode::Headless], // OpenCode's CLI path is only partially wired (no transcript ingest), so // only hybrid is advertised for now. Harness::OpenCode => &[RunMode::Hybrid], @@ -131,7 +96,6 @@ fn harness_label(harness: Harness) -> &'static str { /// sequence starts. Harness-specific behavior still lives behind the adapter. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct HarnessRunCapabilities { - pub mechanism: DispatchMechanism, pub supports_guard: bool, pub supports_bootstrap_with_no_stage: bool, pub supports_stage_name_with_no_stage: bool, @@ -141,19 +105,16 @@ pub struct HarnessRunCapabilities { pub fn capabilities_for(harness: Harness) -> HarnessRunCapabilities { match harness { Harness::ClaudeCode => HarnessRunCapabilities { - mechanism: DispatchMechanism::InSession, supports_guard: true, supports_bootstrap_with_no_stage: true, supports_stage_name_with_no_stage: true, }, Harness::Codex => HarnessRunCapabilities { - mechanism: DispatchMechanism::Cli, supports_guard: true, supports_bootstrap_with_no_stage: false, supports_stage_name_with_no_stage: false, }, Harness::OpenCode => HarnessRunCapabilities { - mechanism: DispatchMechanism::Cli, supports_guard: false, supports_bootstrap_with_no_stage: true, supports_stage_name_with_no_stage: true, @@ -168,40 +129,24 @@ mod tests { #[test] fn capabilities_capture_run_option_support_by_harness() { let claude = capabilities_for(Harness::ClaudeCode); - assert_eq!(claude.mechanism, DispatchMechanism::InSession); assert!(claude.supports_guard); assert!(claude.supports_bootstrap_with_no_stage); assert!(claude.supports_stage_name_with_no_stage); let codex = capabilities_for(Harness::Codex); - assert_eq!(codex.mechanism, DispatchMechanism::Cli); 
assert!(codex.supports_guard); assert!(!codex.supports_bootstrap_with_no_stage); assert!(!codex.supports_stage_name_with_no_stage); let opencode = capabilities_for(Harness::OpenCode); - assert_eq!(opencode.mechanism, DispatchMechanism::Cli); assert!(!opencode.supports_guard); assert!(opencode.supports_bootstrap_with_no_stage); assert!(opencode.supports_stage_name_with_no_stage); } #[test] - fn run_mode_mechanism_maps_each_mode() { - assert_eq!( - RunMode::Interactive.mechanism(), - DispatchMechanism::InSession - ); - assert_eq!(RunMode::Hybrid.mechanism(), DispatchMechanism::Cli); - assert_eq!(RunMode::Headless.mechanism(), DispatchMechanism::Cli); - } - - #[test] - fn run_mode_default_per_harness_preserves_today() { - assert_eq!( - RunMode::default_for(Harness::ClaudeCode), - RunMode::Interactive - ); + fn run_mode_defaults_to_hybrid_for_every_harness() { + assert_eq!(RunMode::default_for(Harness::ClaudeCode), RunMode::Hybrid); assert_eq!(RunMode::default_for(Harness::Codex), RunMode::Hybrid); assert_eq!(RunMode::default_for(Harness::OpenCode), RunMode::Hybrid); } @@ -210,7 +155,7 @@ mod tests { fn resolve_run_mode_defaults_when_unspecified() { assert_eq!( resolve_run_mode(Harness::ClaudeCode, None).unwrap(), - RunMode::Interactive + RunMode::Hybrid ); assert_eq!( resolve_run_mode(Harness::Codex, None).unwrap(), @@ -227,11 +172,10 @@ mod tests { } #[test] - fn resolve_run_mode_rejects_interactive_for_cli_harnesses() { - let err = resolve_run_mode(Harness::Codex, Some(RunMode::Interactive)).unwrap_err(); - assert!(err.contains("interactive"), "message was: {err}"); - assert!(err.contains("codex"), "message was: {err}"); - assert!(resolve_run_mode(Harness::OpenCode, Some(RunMode::Interactive)).is_err()); + fn resolve_run_mode_rejects_headless_for_opencode() { + let err = resolve_run_mode(Harness::OpenCode, Some(RunMode::Headless)).unwrap_err(); + assert!(err.contains("headless"), "message was: {err}"); + assert!(err.contains("opencode"), "message was: {err}"); } 
#[test] diff --git a/src/core/types.rs b/src/core/types.rs index dddd9d7..bbd300b 100644 --- a/src/core/types.rs +++ b/src/core/types.rs @@ -128,8 +128,8 @@ pub struct ConditionsRecord { /// `None` on older artifacts written before run-mode selection existed. #[serde(skip_serializing_if = "Option::is_none")] pub run_mode: Option, - /// Per-run nonce; namespaces dispatch descriptions so transcripts can't - /// collide across iterations sharing one parent session's subagents dir. + /// Per-run nonce; namespaces dispatch descriptions so they stay unique across + /// iterations of the same skill. #[serde(skip_serializing_if = "Option::is_none")] pub run_nonce: Option, /// The `--runs` value the iteration was built with (provenance; per-eval diff --git a/src/pipeline/fill_transcripts.rs b/src/pipeline/fill_transcripts.rs index 06bd0b6..aed05cb 100644 --- a/src/pipeline/fill_transcripts.rs +++ b/src/pipeline/fill_transcripts.rs @@ -2,9 +2,10 @@ //! //! Walks the iteration's `eval-*` //! directories and, for each `(eval, condition)` `run.json`, populates -//! `tool_invocations` from the persisted transcript (Claude Code subagent JSONL -//! resolved by the task's `agent_description`, or Codex `codex-events.jsonl`). -//! Records that already carry invocations are skipped unless `overwrite`. +//! `tool_invocations` from the events file the harness CLI wrote under the task's +//! `outputs_dir` (e.g. Codex's `codex-events.jsonl`, Claude Code's +//! `claude-events.jsonl`). Records that already carry invocations are skipped +//! unless `overwrite`. 
use std::collections::HashMap; use std::fs; @@ -12,8 +13,8 @@ use std::path::Path; use serde::Deserialize; -use crate::adapters::{adapter_for, find_by_description}; -use crate::core::{ConditionsRecord, DispatchMechanism, Harness, RunRecord, ToolInvocation}; +use crate::adapters::adapter_for; +use crate::core::{ConditionsRecord, Harness, RunRecord, ToolInvocation}; use crate::pipeline::error::PipelineError; use crate::pipeline::io::write_json; use crate::pipeline::slots::{run_key, run_slots}; @@ -40,45 +41,14 @@ struct DispatchRef { #[serde(default)] run_index: Option, #[serde(default)] - agent_description: Option, - #[serde(default)] outputs_dir: Option, } -/// The canonical dispatch description for an `(eval, condition, run)` run. -/// -/// The runner writes a unique `agent_description` per task into `dispatch.json` -/// (namespaced with the iteration + run nonce); reading it back binds each run to -/// the exact agent that produced it. Falls back to the -/// `:[:r]` reconstruction when `dispatch.json` is absent, -/// malformed, or missing the task (hand-authored/operator runs). -pub fn resolve_agent_description( - iteration_dir: &Path, - eval_id: &str, - condition: &str, - run_index: Option, -) -> String { - let dispatch_path = iteration_dir.join("dispatch.json"); - if let Ok(raw) = fs::read_to_string(&dispatch_path) - && let Ok(env) = serde_json::from_str::(&raw) - && let Some(tasks) = env.tasks - && let Some(task) = tasks - .iter() - .find(|t| t.eval_id == eval_id && t.condition == condition && t.run_index == run_index) - && let Some(desc) = &task.agent_description - { - return desc.clone(); - } - run_key(eval_id, condition, run_index) -} - /// Populate `tool_invocations` for every `run.json` under `iteration_dir`. See /// the module docs for the transcript sources and overwrite semantics. 
pub fn fill_transcripts( iteration_dir: &Path, harness: Harness, - mechanism: DispatchMechanism, - subagents_dir: Option<&Path>, overwrite: bool, ) -> Result { let conditions_path = iteration_dir.join("conditions.json"); @@ -131,18 +101,8 @@ pub fn fill_transcripts( .cloned() .unwrap_or_else(|| slot.dir.join("outputs").to_string_lossy().into_owned()); - // Resolve the in-session description lazily — only the InSession - // branch needs it, so a Cli run skips the dispatch.json re-read. - let description = (mechanism == DispatchMechanism::InSession).then(|| { - resolve_agent_description(iteration_dir, eval_id, cond, slot.run_index) - }); - let Some(invocations) = invocations_for_run( - harness, - mechanism, - subagents_dir, - description.as_deref(), - Path::new(&outputs_dir), - ) else { + let Some(invocations) = invocations_for_run(harness, Path::new(&outputs_dir)) + else { result.missing += 1; continue; }; @@ -174,34 +134,15 @@ fn outputs_dirs_by_key(iteration_dir: &Path) -> HashMap { out } -/// Parse the invocations for one run, keyed on the dispatch mechanism: a -/// `Cli`-mechanism harness reads the events file its CLI wrote under -/// `outputs_dir` (e.g. Codex's `codex-events.jsonl`, Claude Code hybrid's -/// `claude-events.jsonl`); an `InSession` harness reads the subagent transcript -/// matched by `description` (resolved by the caller). 
-fn invocations_for_run( - harness: Harness, - mechanism: DispatchMechanism, - subagents_dir: Option<&Path>, - description: Option<&str>, - outputs_dir: &Path, -) -> Option> { - match mechanism { - DispatchMechanism::Cli => { - let events_path = outputs_dir.join(adapter_for(harness).cli_events_filename()?); - if !events_path.exists() { - return None; - } - adapter_for(harness).parse_cli_events(&events_path).ok() - } - DispatchMechanism::InSession => { - let subagent = - find_by_description(subagents_dir.unwrap_or_else(|| Path::new("")), description?)?; - adapter_for(harness) - .parse_transcript(&subagent.jsonl_path) - .ok() - } +/// Parse the invocations for one run: read the events file the harness CLI wrote +/// under `outputs_dir` (e.g. Codex's `codex-events.jsonl`, Claude Code's +/// `claude-events.jsonl`). Returns `None` when no events file is found. +fn invocations_for_run(harness: Harness, outputs_dir: &Path) -> Option> { + let events_path = outputs_dir.join(adapter_for(harness).cli_events_filename()?); + if !events_path.exists() { + return None; } + adapter_for(harness).parse_cli_events(&events_path).ok() } #[cfg(test)] @@ -244,66 +185,6 @@ mod tests { fs::write(path, serde_json::to_string_pretty(&record).unwrap()).unwrap(); } - // --- resolveAgentDescription --- - - #[test] - fn returns_the_namespaced_agent_description_from_dispatch() { - let root = TempDir::new().unwrap(); - let dir = root.path().join("iter-canonical"); - write_dispatch( - &dir, - json!([ - {"eval_id": "crash", "condition": "with_skill", "agent_description": "crash:with_skill:i3-abc123"}, - {"eval_id": "crash", "condition": "without_skill", "agent_description": "crash:without_skill:i3-abc123"} - ]), - ); - assert_eq!( - resolve_agent_description(&dir, "crash", "with_skill", None), - "crash:with_skill:i3-abc123" - ); - assert_eq!( - resolve_agent_description(&dir, "crash", "without_skill", None), - "crash:without_skill:i3-abc123" - ); - } - - #[test] - fn 
falls_back_to_legacy_reconstruction_when_dispatch_absent() { - let root = TempDir::new().unwrap(); - let dir = root.path().join("iter-no-dispatch"); - fs::create_dir_all(&dir).unwrap(); - assert_eq!( - resolve_agent_description(&dir, "crash", "with_skill", None), - "crash:with_skill" - ); - } - - #[test] - fn falls_back_when_task_missing_from_dispatch() { - let root = TempDir::new().unwrap(); - let dir = root.path().join("iter-partial"); - write_dispatch( - &dir, - json!([{"eval_id": "other", "condition": "with_skill", "agent_description": "other:with_skill:i1-x"}]), - ); - assert_eq!( - resolve_agent_description(&dir, "crash", "with_skill", None), - "crash:with_skill" - ); - } - - #[test] - fn falls_back_when_dispatch_malformed() { - let root = TempDir::new().unwrap(); - let dir = root.path().join("iter-malformed"); - fs::create_dir_all(&dir).unwrap(); - fs::write(dir.join("dispatch.json"), "{ not valid json").unwrap(); - assert_eq!( - resolve_agent_description(&dir, "crash", "with_skill", None), - "crash:with_skill" - ); - } - // --- fillTranscripts --- #[test] @@ -342,14 +223,7 @@ mod tests { ) .unwrap(); - let result = fill_transcripts( - &iteration_dir, - Harness::ClaudeCode, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = fill_transcripts(&iteration_dir, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.filled, 1); assert_eq!(result.missing, 0); @@ -393,14 +267,7 @@ mod tests { ) .unwrap(); - let result = fill_transcripts( - &iteration_dir, - Harness::Codex, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = fill_transcripts(&iteration_dir, Harness::Codex, false).unwrap(); assert_eq!(result.filled, 1); assert_eq!(result.missing, 0); @@ -443,14 +310,7 @@ mod tests { .unwrap(); } - let result = fill_transcripts( - &iteration_dir, - Harness::Codex, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = fill_transcripts(&iteration_dir, Harness::Codex, false).unwrap(); 
assert_eq!(result.filled, 2); assert_eq!(result.missing, 0); diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index 82c889d..1417b3c 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -21,7 +21,7 @@ pub use detect_stray_writes::{ detect_stray_writes_report, }; pub use error::PipelineError; -pub use fill_transcripts::{FillTranscriptsResult, fill_transcripts, resolve_agent_description}; +pub use fill_transcripts::{FillTranscriptsResult, fill_transcripts}; pub use grade::{GradeContext, emit_judge_tasks, finalize}; pub use record_runs::{RecordRunsResult, record_runs}; pub use slots::{RunSlot, run_slots}; diff --git a/src/pipeline/record_runs.rs b/src/pipeline/record_runs.rs index c97034b..b0cc3e2 100644 --- a/src/pipeline/record_runs.rs +++ b/src/pipeline/record_runs.rs @@ -5,8 +5,8 @@ //! from sources already on disk: carry-over fields from the dispatch task, the //! `final_message` (from `/final-message.md`, falling back to the //! transcript's last assistant text), and `tool_invocations`/tokens/duration from -//! the persisted transcript (Claude Code subagent JSONL or Codex -//! `codex-events.jsonl`). +//! each task's events file (`outputs/-events.jsonl` — Claude Code's +//! `claude -p` stream-json or Codex's `codex-events.jsonl`). //! //! Existing records always win: an agent/operator-written `run.json` is skipped //! 
without `overwrite`, and `timing.json` is backfill-only — completion-event @@ -18,8 +18,8 @@ use std::path::Path; use serde::Deserialize; -use crate::adapters::{TranscriptSummary, adapter_for, find_by_description}; -use crate::core::{DispatchMechanism, Harness, RunRecord, TimingRecord, TimingSource}; +use crate::adapters::{TranscriptSummary, adapter_for}; +use crate::core::{Harness, RunRecord, TimingRecord, TimingSource}; use crate::pipeline::error::PipelineError; use crate::pipeline::io::write_json; use crate::validation::{SchemaName, validate_against_schema}; @@ -44,7 +44,6 @@ struct DispatchTask { outputs_dir: String, run_record_path: String, timing_path: String, - agent_description: String, #[serde(default)] dispatch_prompt_path: String, } @@ -63,14 +62,9 @@ impl RecordRunsResult { /// A loud, actionable warning when runs were recorded from `final-message.md` /// but their transcripts didn't link — leaving `tool_invocations`/tokens/ /// duration empty so `transcript_check` assertions silently grade - /// unverifiable. `None` when every run matched its transcript. The hint is - /// tailored to how the harness correlates transcripts (description match vs. - /// the Codex events file). - pub fn transcript_warning( - &self, - harness: Harness, - mechanism: DispatchMechanism, - ) -> Option { + /// unverifiable. `None` when every run matched its transcript. The hint names + /// the per-task events file the harness CLI was expected to write. + pub fn transcript_warning(&self, harness: Harness) -> Option { if self.missing_transcript == 0 { return None; } @@ -82,23 +76,10 @@ impl RecordRunsResult { } else { format!("⚠ {n} run{plural} missing a transcript") }; - // The cause is keyed on the dispatch mechanism, not the harness: a - // Cli-dispatch run (Codex, or Claude Code in hybrid/headless) misses the - // per-task events file; an in-session run misses the subagent transcript. 
- let cause = match mechanism { - DispatchMechanism::Cli => { - let file = adapter_for(harness) - .cli_events_filename() - .unwrap_or("the events file"); - format!("expected `outputs/{file}` was not found") - } - DispatchMechanism::InSession => { - "did you pass each task's `agent_description` verbatim as the subagent \ - description? If so, confirm `--subagents-dir` points at the parent session's \ - subagents dir" - .to_string() - } - }; + let file = adapter_for(harness) + .cli_events_filename() + .unwrap_or("the events file"); + let cause = format!("expected `outputs/{file}` was not found"); Some(format!( "{lead} — {cause}; tool_invocations/tokens/duration are empty, so transcript_check \ assertions will grade unverifiable." @@ -130,8 +111,6 @@ impl RecordRunsResult { pub fn record_runs( iteration_dir: &Path, harness: Harness, - mechanism: DispatchMechanism, - subagents_dir: Option<&Path>, overwrite: bool, ) -> Result { let dispatch_path = iteration_dir.join("dispatch.json"); @@ -148,7 +127,7 @@ pub fn record_runs( let mut result = RecordRunsResult::default(); for task in &tasks { - let summary = transcript_summary_for_task(harness, mechanism, subagents_dir, task); + let summary = transcript_summary_for_task(harness, task); if summary.is_none() { result.missing_transcript += 1; } @@ -230,7 +209,7 @@ pub fn record_runs( /// call returned the prompt's content (its distinctive first-line `sentinel`). /// /// A run that never references the prompt path is NOT flagged — absence is not -/// proof of failure (an in-session subagent can receive the prompt another way), +/// proof of failure (the agent can receive the prompt another way), /// and requiring positive evidence keeps the check free of false positives. /// Returns `false` when `sentinel` is empty (the prompt file was missing or /// unreadable, so the read cannot be judged). 
@@ -277,38 +256,18 @@ fn prompt_sentinel(prompt_path: &str) -> String { .unwrap_or_default() } -/// Resolve a task's transcript summary, keyed on the dispatch mechanism: a -/// `Cli`-mechanism harness reads the events file its CLI wrote under the task's -/// outputs dir (e.g. Codex's `codex-events.jsonl`); an `InSession` harness reads -/// the subagent transcript matched by the task's `agent_description`. Returns -/// `None` when no transcript is found. -fn transcript_summary_for_task( - harness: Harness, - mechanism: DispatchMechanism, - subagents_dir: Option<&Path>, - task: &DispatchTask, -) -> Option { - match mechanism { - DispatchMechanism::Cli => { - let events_path = - Path::new(&task.outputs_dir).join(adapter_for(harness).cli_events_filename()?); - if !events_path.exists() { - return None; - } - adapter_for(harness) - .parse_cli_events_full(&events_path) - .ok() - } - DispatchMechanism::InSession => { - let subagent = find_by_description( - subagents_dir.unwrap_or_else(|| Path::new("")), - &task.agent_description, - )?; - adapter_for(harness) - .parse_transcript_full(&subagent.jsonl_path) - .ok() - } - } +/// Resolve a task's transcript summary: read the events file the harness CLI +/// wrote under the task's outputs dir (e.g. Codex's `codex-events.jsonl`, Claude +/// Code's `claude-events.jsonl`). Returns `None` when no transcript is found. +fn transcript_summary_for_task(harness: Harness, task: &DispatchTask) -> Option { + let events_path = + Path::new(&task.outputs_dir).join(adapter_for(harness).cli_events_filename()?); + if !events_path.exists() { + return None; + } + adapter_for(harness) + .parse_cli_events_full(&events_path) + .ok() } #[cfg(test)] @@ -318,12 +277,6 @@ mod tests { use std::path::PathBuf; use tempfile::TempDir; - /// Token math for `transcript_lines`: msg_1 (100+20+30+50) + msg_2 - /// (200+40+0+60) = 500. - const TRANSCRIPT_TOKENS: i64 = 500; - /// 10:00:00.000 → 10:01:00.000. 
- const TRANSCRIPT_DURATION_MS: i64 = 60_000; - fn jsonl(lines: &[Value]) -> String { let body = lines .iter() @@ -333,33 +286,6 @@ mod tests { format!("{body}\n") } - /// A minimal transcript with usage, timestamps, one tool call, and final text. - fn transcript_lines(final_text: &str) -> Vec { - vec![ - json!({"type": "user", "timestamp": "2026-06-04T10:00:00.000Z", "message": {"role": "user", "content": "go"}}), - json!({"type": "assistant", "timestamp": "2026-06-04T10:00:10.000Z", "message": { - "id": "msg_1", "role": "assistant", - "usage": {"input_tokens": 100, "output_tokens": 20, "cache_creation_input_tokens": 30, "cache_read_input_tokens": 50}, - "content": [{"type": "tool_use", "id": "toolu_1", "name": "Bash", "input": {"command": "ls"}}] - }}), - json!({"type": "user", "timestamp": "2026-06-04T10:00:12.000Z", "message": {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "toolu_1", "content": "ok"}]}}), - json!({"type": "assistant", "timestamp": "2026-06-04T10:01:00.000Z", "message": { - "id": "msg_2", "role": "assistant", - "usage": {"input_tokens": 200, "output_tokens": 40, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 60}, - "content": [{"type": "text", "text": final_text}] - }}), - ] - } - - fn write_subagent(subagents_dir: &Path, name: &str, description: &str, lines: &[Value]) { - fs::write( - subagents_dir.join(format!("{name}.meta.json")), - json!({"agentType": "general-purpose", "description": description}).to_string(), - ) - .unwrap(); - fs::write(subagents_dir.join(format!("{name}.jsonl")), jsonl(lines)).unwrap(); - } - fn write_codex_events(outputs_dir: &Path, final_text: &str) { let lines = vec![ json!({"type": "thread.started", "timestamp": "2026-06-04T10:00:00.000Z"}), @@ -438,14 +364,7 @@ mod tests { "I could not read the prompt file.", ); - let result = record_runs( - iter, - Harness::ClaudeCode, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = record_runs(iter, Harness::ClaudeCode, 
false).unwrap(); assert_eq!(result.skipped_prompt_unread, 1); assert_eq!(result.recorded, 0); @@ -482,14 +401,7 @@ mod tests { "Done.", ); - let result = record_runs( - iter, - Harness::ClaudeCode, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = record_runs(iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.recorded, 1); assert_eq!(result.skipped_prompt_unread, 0); @@ -591,20 +503,18 @@ mod tests { .exists() } - /// `(iteration_dir, subagents_dir)` under a fresh temp root. - fn dirs(root: &TempDir) -> (PathBuf, PathBuf) { + /// The iteration dir under a fresh temp root. + fn dirs(root: &TempDir) -> PathBuf { let iteration_dir = root.path().join("iter"); - let subagents_dir = root.path().join("sub"); fs::create_dir_all(&iteration_dir).unwrap(); - fs::create_dir_all(&subagents_dir).unwrap(); - (iteration_dir, subagents_dir) + iteration_dir } #[test] fn assembles_run_and_timing_for_every_task_from_disk() { let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); - write_iteration( + let iter = dirs(&root); + let paths = write_iteration( &iter, &[ FixtureTask { @@ -619,27 +529,10 @@ mod tests { }, ], ); - write_subagent( - &sub, - "agent-a", - "crash:with_skill:i1-nonce1", - &transcript_lines("unused"), - ); - write_subagent( - &sub, - "agent-b", - "crash:without_skill:i1-nonce1", - &transcript_lines("unused"), - ); + write_claude_events(&paths[0].outputs_dir, "unused"); + write_claude_events(&paths[1].outputs_dir, "unused"); - let result = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap(); + let result = record_runs(&iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.recorded, 2); assert_eq!(result.missing_transcript, 0); @@ -661,15 +554,15 @@ mod tests { ); let timing = read_timing_value(&iter, "crash", "with_skill"); - assert_eq!(timing["total_tokens"], json!(TRANSCRIPT_TOKENS)); - assert_eq!(timing["duration_ms"], 
json!(TRANSCRIPT_DURATION_MS)); + assert_eq!(timing["total_tokens"], json!(125)); + assert_eq!(timing["duration_ms"], json!(30_000)); assert_eq!(timing["source"], json!("transcript")); } #[test] fn carries_run_index_from_dispatch_task_into_each_run_record() { let root = TempDir::new().unwrap(); - let (iter, _sub) = dirs(&root); + let iter = dirs(&root); let cond_dir = iter.join("eval-crash").join("with_skill"); let mut serialized = Vec::new(); for k in [1u32, 2] { @@ -702,8 +595,7 @@ mod tests { ) .unwrap(); - let result = - record_runs(&iter, Harness::Codex, DispatchMechanism::Cli, None, false).unwrap(); + let result = record_runs(&iter, Harness::Codex, false).unwrap(); assert_eq!(result.recorded, 2); for k in [1u32, 2] { @@ -718,7 +610,7 @@ mod tests { #[test] fn assembles_codex_records_from_each_tasks_events() { let root = TempDir::new().unwrap(); - let (iter, _sub) = dirs(&root); + let iter = dirs(&root); let paths = write_iteration( &iter, &[FixtureTask { @@ -729,8 +621,7 @@ mod tests { ); write_codex_events(&paths[0].outputs_dir, "Codex final."); - let result = - record_runs(&iter, Harness::Codex, DispatchMechanism::Cli, None, false).unwrap(); + let result = record_runs(&iter, Harness::Codex, false).unwrap(); assert_eq!(result.recorded, 1); assert_eq!(result.missing_transcript, 0); @@ -751,7 +642,7 @@ mod tests { #[test] fn falls_back_to_codex_final_agent_message_when_final_message_md_missing() { let root = TempDir::new().unwrap(); - let (iter, _sub) = dirs(&root); + let iter = dirs(&root); let paths = write_iteration( &iter, &[FixtureTask { @@ -762,8 +653,7 @@ mod tests { ); write_codex_events(&paths[0].outputs_dir, "Closing summary from Codex."); - let result = - record_runs(&iter, Harness::Codex, DispatchMechanism::Cli, None, false).unwrap(); + let result = record_runs(&iter, Harness::Codex, false).unwrap(); assert_eq!(result.recorded, 1); assert_eq!( read_run(&iter, "crash", "with_skill").final_message, @@ -774,7 +664,7 @@ mod tests { #[test] fn 
skips_existing_run_without_overwrite_then_replaces_with_it() { let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); + let iter = dirs(&root); let paths = write_iteration( &iter, &[FixtureTask { @@ -783,12 +673,7 @@ mod tests { final_message: Some("New."), }], ); - write_subagent( - &sub, - "agent-a", - "crash:with_skill:i1-nonce1", - &transcript_lines("unused"), - ); + write_claude_events(&paths[0].outputs_dir, "unused"); let hand_written = json!({ "eval_id": "crash", "condition": "with_skill", "skill_path": "/staged/skill/SKILL.md", "prompt": "Do the crash task", @@ -796,14 +681,7 @@ mod tests { }); fs::write(&paths[0].run_record_path, hand_written.to_string()).unwrap(); - let skipped = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap(); + let skipped = record_runs(&iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(skipped.recorded, 0); assert_eq!(skipped.skipped_existing, 1); assert_eq!( @@ -811,14 +689,7 @@ mod tests { "Agent-authored." 
); - let replaced = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - true, - ) - .unwrap(); + let replaced = record_runs(&iter, Harness::ClaudeCode, true).unwrap(); assert_eq!(replaced.recorded, 1); assert_eq!(read_run(&iter, "crash", "with_skill").final_message, "New."); } @@ -826,7 +697,7 @@ mod tests { #[test] fn backfills_timing_only_when_absent() { let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); + let iter = dirs(&root); let paths = write_iteration( &iter, &[FixtureTask { @@ -835,26 +706,14 @@ mod tests { final_message: Some("Done."), }], ); - write_subagent( - &sub, - "agent-a", - "crash:with_skill:i1-nonce1", - &transcript_lines("unused"), - ); + write_claude_events(&paths[0].outputs_dir, "unused"); fs::write( &paths[0].timing_path, json!({"total_tokens": 12345, "duration_ms": 9000}).to_string(), ) .unwrap(); - record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap(); + record_runs(&iter, Harness::ClaudeCode, false).unwrap(); // Agent-captured completion-event timing wins; not overwritten. let timing = read_timing_value(&iter, "crash", "with_skill"); @@ -863,44 +722,10 @@ mod tests { assert!(timing.get("source").is_none()); } - #[test] - fn falls_back_to_transcript_final_assistant_text_when_final_message_md_missing() { - let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); - write_iteration( - &iter, - &[FixtureTask { - eval_id: "crash", - condition: "with_skill", - final_message: None, - }], - ); - write_subagent( - &sub, - "agent-a", - "crash:with_skill:i1-nonce1", - &transcript_lines("Closing summary from transcript."), - ); - - let result = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap(); - assert_eq!(result.recorded, 1); - assert_eq!( - read_run(&iter, "crash", "with_skill").final_message, - "Closing summary from transcript." 
- ); - } - #[test] fn skips_the_slot_entirely_when_no_final_message_source_exists() { let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); + let iter = dirs(&root); write_iteration( &iter, &[FixtureTask { @@ -911,14 +736,7 @@ mod tests { ); // No final-message.md, no transcript. - let result = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap(); + let result = record_runs(&iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.recorded, 0); assert_eq!(result.skipped_no_final_message, 1); assert!(!run_exists(&iter, "crash", "with_skill")); @@ -928,7 +746,7 @@ mod tests { #[test] fn writes_empty_invocations_and_no_timing_when_transcript_missing() { let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); + let iter = dirs(&root); write_iteration( &iter, &[FixtureTask { @@ -937,16 +755,9 @@ mod tests { final_message: Some("Done."), }], ); - // final-message.md exists but no subagent transcript matches. + // final-message.md exists but no events file is present. - let result = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap(); + let result = record_runs(&iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.recorded, 1); assert_eq!(result.missing_transcript, 1); @@ -959,16 +770,9 @@ mod tests { #[test] fn errors_when_dispatch_json_is_absent() { let root = TempDir::new().unwrap(); - let (iter, sub) = dirs(&root); + let iter = dirs(&root); // Hand-authored/operator runs have no dispatch.json — the manual path owns them. 
- let err = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::InSession, - Some(&sub), - false, - ) - .unwrap_err(); + let err = record_runs(&iter, Harness::ClaudeCode, false).unwrap_err(); assert!( err.to_string().contains("dispatch.json"), "error was: {err}" @@ -982,36 +786,7 @@ mod tests { missing_transcript: 0, ..Default::default() }; - assert!( - result - .transcript_warning(Harness::ClaudeCode, DispatchMechanism::InSession) - .is_none() - ); - } - - #[test] - fn claude_code_warning_names_agent_description_when_all_runs_miss() { - let result = RecordRunsResult { - recorded: 8, - missing_transcript: 8, - ..Default::default() - }; - let warning = result - .transcript_warning(Harness::ClaudeCode, DispatchMechanism::InSession) - .unwrap(); - assert!(warning.contains("8"), "names the count: {warning}"); - assert!( - warning.contains("agent_description"), - "points at the load-bearing key: {warning}" - ); - assert!( - warning.to_lowercase().contains("verbatim"), - "says pass it verbatim: {warning}" - ); - assert!( - warning.contains("--subagents-dir"), - "offers the other likely cause: {warning}" - ); + assert!(result.transcript_warning(Harness::ClaudeCode).is_none()); } #[test] @@ -1021,9 +796,7 @@ mod tests { missing_transcript: 1, ..Default::default() }; - let warning = result - .transcript_warning(Harness::ClaudeCode, DispatchMechanism::InSession) - .unwrap(); + let warning = result.transcript_warning(Harness::ClaudeCode).unwrap(); assert!(warning.contains('1'), "names the count: {warning}"); } @@ -1034,9 +807,7 @@ mod tests { missing_transcript: 2, ..Default::default() }; - let warning = result - .transcript_warning(Harness::Codex, DispatchMechanism::Cli) - .unwrap(); + let warning = result.transcript_warning(Harness::Codex).unwrap(); assert!( warning.contains("codex-events.jsonl"), "names the Codex source: {warning}" @@ -1050,7 +821,7 @@ mod tests { #[test] fn assembles_claude_hybrid_records_from_each_tasks_events() { let root = 
TempDir::new().unwrap(); - let (iter, _sub) = dirs(&root); + let iter = dirs(&root); let paths = write_iteration( &iter, &[FixtureTask { @@ -1061,14 +832,7 @@ mod tests { ); write_claude_events(&paths[0].outputs_dir, "Closing summary."); - let result = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = record_runs(&iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.recorded, 1); assert_eq!(result.missing_transcript, 0); @@ -1091,7 +855,7 @@ mod tests { // Claude `-p` has no --output-last-message, so the result event's text is // the primary final-message source. let root = TempDir::new().unwrap(); - let (iter, _sub) = dirs(&root); + let iter = dirs(&root); let paths = write_iteration( &iter, &[FixtureTask { @@ -1102,14 +866,7 @@ mod tests { ); write_claude_events(&paths[0].outputs_dir, "Closing summary from claude -p."); - let result = record_runs( - &iter, - Harness::ClaudeCode, - DispatchMechanism::Cli, - None, - false, - ) - .unwrap(); + let result = record_runs(&iter, Harness::ClaudeCode, false).unwrap(); assert_eq!(result.recorded, 1); assert_eq!( read_run(&iter, "crash", "with_skill").final_message, @@ -1124,9 +881,7 @@ mod tests { missing_transcript: 2, ..Default::default() }; - let warning = result - .transcript_warning(Harness::ClaudeCode, DispatchMechanism::Cli) - .unwrap(); + let warning = result.transcript_warning(Harness::ClaudeCode).unwrap(); assert!( warning.contains("claude-events.jsonl"), "names the Claude hybrid source: {warning}" diff --git a/tests/run/claude_cli.rs b/tests/run/claude_cli.rs index 55a167b..69f551f 100644 --- a/tests/run/claude_cli.rs +++ b/tests/run/claude_cli.rs @@ -1,7 +1,6 @@ //! Claude Code CLI run modes (`--run-mode hybrid` / `headless`): `claude -p` //! stream-json dispatch guidance, run-mode persistence + defaulting, the -//! human-followed runbook, the write guard under Cli dispatch, and the remaining -//! 
run-mode combo rejections (Codex interactive). +//! human-followed runbook, and the write guard under Cli dispatch. use crate::helpers::*; use predicates::str::contains; @@ -72,31 +71,6 @@ fn claude_hybrid_dispatch_guidance_includes_agent_model_when_provided() { assert!(stdout.contains("--model opus")); } -#[test] -fn claude_defaults_to_interactive_handoff() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args([ - "--skill", - "mr-review", - "--harness", - "claude-code", - "--dry-run", - ]) - .assert() - .success(); - - // No --run-mode → interactive default; no CLI recipe in the manifest. - let conditions = read_json(&iteration_dir(&cwd).join("conditions.json")); - assert_eq!(conditions["run_mode"], "interactive"); - let manifest = read_str(&iteration_dir(&cwd).join("dispatch-manifest.md")); - assert!(!manifest.contains("claude -p")); -} - #[test] fn claude_hybrid_runbook_is_human_followed_cli_recipe() { let tmp = tempfile::TempDir::new().unwrap(); @@ -226,29 +200,6 @@ fn claude_hybrid_record_runs_does_not_require_a_session_id() { .stdout(contains("Recorded:")); } -#[test] -fn codex_rejects_run_mode_interactive() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args([ - "--skill", - "mr-review", - "--harness", - "codex", - "--run-mode", - "interactive", - "--dry-run", - ]) - .assert() - .failure() - .stderr(contains("interactive")) - .stderr(contains("codex")); -} - #[test] fn claude_cli_guard_installs_project_hook() { let tmp = tempfile::TempDir::new().unwrap(); diff --git a/tests/run/env_layout.rs b/tests/run/env_layout.rs index f0f6073..86dd8e4 100644 --- a/tests/run/env_layout.rs +++ b/tests/run/env_layout.rs @@ -1,6 +1,7 @@ -//! 
Isolated-run env builder: staging redirects into the per-iteration -//! `env/` dir, fixtures are copied in like a real repo, and `RUNBOOK.md` lives in -//! the env. eval-magic meta stays above the env in `iteration-N/`. +//! Isolated-run env builder: staging redirects into the per-`(group, condition)` +//! `env-<group>-<condition>/` dirs, fixtures are copied into each like a real repo, +//! and `RUNBOOK.md` lives above them in `iteration-N/`. eval-magic meta stays above +//! the envs in `iteration-N/`. use crate::helpers::*; use serde_json::json; @@ -19,7 +20,8 @@ fn stages_into_env_not_cwd() { .assert() .success(); - // The staged skill lands under env/.claude/skills, not the invocation cwd. + // The staged skill lands under env-g1-with_skill/.claude/skills, not the + // invocation cwd. assert_eq!( env_staged_entries(&cwd), vec!["slow-powers-eval-1-with_skill__mr-review"] @@ -30,7 +32,11 @@ fn stages_into_env_not_cwd() { ); // eval-magic meta stays above the env, in iteration-N/. assert!(iteration_dir(&cwd).join("dispatch.json").exists()); - assert!(!env_dir(&cwd).join("dispatch.json").exists()); + assert!( + !cli_env_dir(&cwd, "g1", "with_skill") + .join("dispatch.json") + .exists() + ); } #[test] @@ -52,8 +58,10 @@ fn env_dir_created_even_with_no_stage() { .assert() .success(); - // Even with staging disabled, the env must exist for fixtures + RUNBOOK. - assert!(env_dir(&cwd).is_dir()); + // Even with staging disabled, each per-(group, condition) env must exist for + // fixtures + the per-env guard. + assert!(cli_env_dir(&cwd, "g1", "with_skill").is_dir()); + assert!(cli_env_dir(&cwd, "g1", "without_skill").is_dir()); } #[test] @@ -77,10 +85,14 @@ fn fixtures_copied_into_env_like_a_real_repo() { .assert() .success(); - // Structure preserved under env/, not flattened into an inputs/ bucket. 
- assert_eq!(read_str(&env_dir(&cwd).join("src/main.rs")), "fn main() {}"); - assert_eq!(read_str(&env_dir(&cwd).join("data/x.json")), "{}"); - assert!(!env_dir(&cwd).join("inputs").exists()); + // Structure preserved under each per-condition env, not flattened into an + // inputs/ bucket. Fixtures are copied into every relevant env (per its group). + for cond in ["with_skill", "without_skill"] { + let env = cli_env_dir(&cwd, "g1", cond); + assert_eq!(read_str(&env.join("src/main.rs")), "fn main() {}"); + assert_eq!(read_str(&env.join("data/x.json")), "{}"); + assert!(!env.join("inputs").exists()); + } // The dispatch prompt lists fixtures env-relative — the agent's cwd is env. let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json")); @@ -154,15 +166,17 @@ fn dispatch_outputs_live_under_env() { // Canonicalize to compare across the macOS /var → /private/var symlink: // dispatch.json stores resolved paths, but the test roots come from the raw // tempdir, so a lexical starts_with would mismatch. - let env = fs::canonicalize(env_dir(&cwd)).unwrap(); let iter = fs::canonicalize(iteration_dir(&cwd)).unwrap(); - let outputs_root = env.join(".eval-magic-outputs"); for task in tasks { - // The agent-under-test (cwd = env/) writes only inside its env. + // The agent-under-test (cwd = its per-(group, condition) env) writes only + // inside that env's .eval-magic-outputs/. + let cond = task["condition"].as_str().unwrap(); + let env = fs::canonicalize(cli_env_dir(&cwd, "g1", cond)).unwrap(); + let outputs_root = env.join(".eval-magic-outputs"); let outputs_dir = fs::canonicalize(task["outputs_dir"].as_str().unwrap()).unwrap(); assert!( outputs_dir.starts_with(&outputs_root), - "outputs_dir under env/.eval-magic-outputs/: {}", + "outputs_dir under env-g1-{cond}/.eval-magic-outputs/: {}", outputs_dir.display() ); // run.json / timing.json are eval-magic meta: above the env, in iteration-N/. 
@@ -208,8 +222,14 @@ fn shared_fixture_copied_once_across_conditions_and_runs() { .assert() .success(); - // One shared copy in env, referenced env-relative by every condition × run. - assert_eq!(read_str(&env_dir(&cwd).join("fixture.txt")), "DATA"); + // One copy per env, shared env-relative by that env's runs. Each condition env + // carries its own copy, referenced env-relative ("fixture.txt") by every task. + for cond in ["with_skill", "without_skill"] { + assert_eq!( + read_str(&cli_env_dir(&cwd, "g1", cond).join("fixture.txt")), + "DATA" + ); + } let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json")); let tasks = dispatch["tasks"].as_array().unwrap(); assert_eq!(tasks.len(), 4, "1 eval × 2 conditions × 2 runs"); @@ -238,8 +258,12 @@ fn two_evals_sharing_a_fixture_declaration_succeeds() { .assert() .success(); - // Two evals declaring the same fixture from the same source is an idempotent share. - assert_eq!(read_str(&env_dir(&cwd).join("shared.txt")), "SHARED"); + // Two evals declaring the same fixture from the same source is an idempotent + // share: the with_skill env carries a single copy. + assert_eq!( + read_str(&cli_env_dir(&cwd, "g1", "with_skill").join("shared.txt")), + "SHARED" + ); let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json")); for id in ["e1", "e2"] { let task = dispatch["tasks"] @@ -276,14 +300,14 @@ fn env_contains_only_the_staged_skill_no_repo_leakage() { .assert() .success(); - // env/.claude/skills holds only the staged skill-under-test. + // env-g1-with_skill/.claude/skills holds only the staged skill-under-test. assert_eq!( env_staged_entries(&cwd), vec!["slow-powers-eval-1-with_skill__mr-review"] ); - // The unrelated cwd skill is absent from env. + // The unrelated cwd skill is absent from the env. 
assert!( - !env_dir(&cwd) + !cli_env_dir(&cwd, "g1", "with_skill") .join(".claude/skills/unrelated-skill") .exists() ); @@ -301,9 +325,12 @@ fn guard_marker_allowed_roots_cover_meta_above_env() { .assert() .success(); - // The guard arms inside env, but its allowedRoots include the workspace root above env, - // so eval-magic can still write meta (benchmark.json, dispatch.json) into iteration-N/. - let marker = read_json(&env_dir(&cwd).join(".claude/skills/.slow-powers-eval-guard.json")); + // The guard arms inside each env, but its allowedRoots include the workspace root + // above env, so eval-magic can still write meta (benchmark.json, dispatch.json) + // into iteration-N/. + let marker = read_json( + &cli_env_dir(&cwd, "g1", "with_skill").join(".claude/skills/.slow-powers-eval-guard.json"), + ); let roots = marker["allowedRoots"].as_array().unwrap(); let iter = iteration_dir(&cwd); assert!( diff --git a/tests/run/grouping.rs b/tests/run/grouping.rs index e941409..d018a34 100644 --- a/tests/run/grouping.rs +++ b/tests/run/grouping.rs @@ -1,7 +1,8 @@ //! Isolation-group batching during `run`: how the setup phase groups evals into -//! environments and records the plan in `dispatch.json`. Covers the in-session -//! single-env (byte-compat) path, the Cli per-(group, condition) split that closes -//! the condition-isolation gap, and the explicit `isolation: isolated` hint. +//! environments and records the plan in `dispatch.json`. Covers the per-(group, +//! condition) env split that closes the condition-isolation gap — emitted for every +//! run now, including the bare default invocation — and the explicit +//! `isolation: isolated` hint that fans a second group out into its own envs. 
use crate::helpers::*; use serde_json::json; @@ -17,7 +18,7 @@ fn write_fixtures(skill_dir: &std::path::Path) { } #[test] -fn insession_single_group_omits_groups_key_and_stays_bare_env() { +fn single_group_emits_groups_key_and_per_condition_envs() { let tmp = tempfile::TempDir::new().unwrap(); let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); skill_eval() @@ -28,17 +29,27 @@ fn insession_single_group_omits_groups_key_and_stays_bare_env() { .assert() .success(); - // The common no-conflict in-session case is byte-identical to the pre-grouping - // shape: a bare env/, no `groups` summary, and no per-task group/eval_root keys. - assert!(env_dir(&cwd).exists()); + // Even the bare default invocation now splits the env per (group, condition) and + // always records a `groups` summary — the single-env, no-groups shape is gone. + assert!(cli_env_dir(&cwd, "g1", "with_skill").exists()); + assert!(cli_env_dir(&cwd, "g1", "without_skill").exists()); let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json")); - assert!( - dispatch.get("groups").is_none(), - "single-group in-session omits the groups summary: {dispatch}" - ); + let groups = dispatch["groups"] + .as_array() + .expect("groups summary present even for a single group"); + assert_eq!(groups.len(), 1); + assert_eq!(groups[0]["id"], "g1"); + + // A single group means no per-task group tag, but each task still carries the + // per-condition env it runs in via eval_root. 
for task in dispatch["tasks"].as_array().unwrap() { - assert!(task.get("group").is_none(), "no group tag: {task}"); - assert!(task.get("eval_root").is_none(), "no eval_root: {task}"); + assert!(task.get("group").is_none(), "single group: no tag: {task}"); + let cond = task["condition"].as_str().unwrap(); + let eval_root = task["eval_root"].as_str().expect("task carries eval_root"); + assert!( + eval_root.ends_with(&format!("env-g1-{cond}")), + "eval_root points at the per-condition env: {eval_root}" + ); } } @@ -105,7 +116,7 @@ fn cli_single_group_emits_groups_and_splits_env_per_condition() { } #[test] -fn isolated_hint_splits_into_two_groups_in_session() { +fn isolated_hint_splits_into_two_groups() { let tmp = tempfile::TempDir::new().unwrap(); let (skill_dir, cwd) = setup(tmp.path(), TWO_EVALS_ONE_ISOLATED); write_fixtures(&skill_dir); @@ -133,7 +144,7 @@ fn isolated_hint_splits_into_two_groups_in_session() { groups[1]["rationale"] ); - // Tasks are tagged with their group. + // With two groups, tasks are tagged with their group. let e2_task = dispatch["tasks"] .as_array() .unwrap() @@ -142,12 +153,23 @@ fn isolated_hint_splits_into_two_groups_in_session() { .unwrap(); assert_eq!(e2_task["group"], "g2"); - // In-session stages only the FIRST group's fixtures into the one env up front; - // the isolated group's fixtures are swapped in later by reset-batch. - assert_eq!(read_str(&env_dir(&cwd).join("a.txt")), "AAA"); + // Each group gets its own per-condition envs, holding only that group's fixtures — + // g1's a.txt never leaks into g2's env and vice versa. 
+ assert_eq!( + read_str(&cli_env_dir(&cwd, "g1", "with_skill").join("a.txt")), + "AAA" + ); + assert!( + !cli_env_dir(&cwd, "g1", "with_skill").join("b.txt").exists(), + "the isolated group's fixture is not staged into g1's env" + ); + assert_eq!( + read_str(&cli_env_dir(&cwd, "g2", "with_skill").join("b.txt")), + "BBB" + ); assert!( - !env_dir(&cwd).join("b.txt").exists(), - "the isolated group's fixture is not staged into the shared env up front" + !cli_env_dir(&cwd, "g2", "with_skill").join("a.txt").exists(), + "g1's fixture is not staged into the isolated group's env" ); } diff --git a/tests/run/helpers.rs b/tests/run/helpers.rs index 1c5caab..5e3a96d 100644 --- a/tests/run/helpers.rs +++ b/tests/run/helpers.rs @@ -36,24 +36,20 @@ pub fn iteration_dir(cwd: &Path) -> PathBuf { .join("iteration-1") } -/// The isolated env dir that becomes the agent-under-test's cwd (in-session -/// dispatch): staging, fixtures, and `RUNBOOK.md` all land under here, below -/// `iteration_dir`. -pub fn env_dir(cwd: &Path) -> PathBuf { - iteration_dir(cwd).join("env") -} - -/// A per-`(group, condition)` Cli env dir — the cwd each `claude -p`/`codex exec` +/// A per-`(group, condition)` env dir — the cwd each `claude -p`/`codex exec` /// subprocess runs from: `iteration-N/env-<group>-<condition>/`. Each holds only /// that condition's skill (or none, for the control arm) and its group's fixtures. +/// Staging, fixtures, and the guard marker all land under here, below +/// `iteration_dir`; `RUNBOOK.md` lives above it in `iteration_dir`. pub fn cli_env_dir(cwd: &Path, group: &str, condition: &str) -> PathBuf { iteration_dir(cwd).join(format!("env-{group}-{condition}")) } -/// Staged skill names under the env's harness skills dir (`env/.claude/skills`), -/// excluding the staging manifest, sorted. +/// Staged skill names under the default single-group `with_skill` env's harness +/// skills dir (`env-g1-with_skill/.claude/skills`), excluding the staging +/// manifest, sorted. 
pub fn env_staged_entries(cwd: &Path) -> Vec { - staged_entries(&env_dir(cwd).join(".claude/skills")) + staged_entries(&cli_env_dir(cwd, "g1", "with_skill").join(".claude/skills")) } pub fn read_json(path: &Path) -> Value { diff --git a/tests/run/lifecycle.rs b/tests/run/lifecycle.rs index f7a1b15..42f69d5 100644 --- a/tests/run/lifecycle.rs +++ b/tests/run/lifecycle.rs @@ -11,8 +11,8 @@ use std::path::Path; fn guard_installs_pretooluse_hook_under_env() { let tmp = tempfile::TempDir::new().unwrap(); let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // The guard arms inside the isolated env — the agent-under-test's cwd. - let settings = env_dir(&cwd).join(".claude/settings.local.json"); + // The guard arms inside each per-(group, condition) env — the agent-under-test's cwd. + let settings = cli_env_dir(&cwd, "g1", "with_skill").join(".claude/settings.local.json"); skill_eval() .current_dir(&cwd) @@ -48,13 +48,16 @@ fn guard_installs_pretooluse_hook_under_env() { } #[test] -fn finalize_does_not_warn_about_env_scoped_guard_from_cwd() { +fn finalize_warns_about_armed_per_env_guard_for_default_run() { let tmp = tempfile::TempDir::new().unwrap(); let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // The guard arms inside the env; `finalize` checks the invocation cwd, where no - // guard lives, so it does not warn. The env-scoped guard is harmless to the operator's - // cwd (it only loads when cwd = env); the in-env loop handles it within the session. - let marker = env_dir(&cwd).join(".claude/skills/.slow-powers-eval-guard.json"); + // The bare default run is hybrid: `--guard` arms a marker in each per-(group, + // condition) env. `finalize` runs from the invocation cwd, not inside any env, but + // the reworked finalize walks the per-env markers, so it reminds the operator the + // guard is still armed. (finalize only warns; `teardown` disarms — the marker + // survives finalize.) 
+ let marker = + cli_env_dir(&cwd, "g1", "with_skill").join(".claude/skills/.slow-powers-eval-guard.json"); skill_eval() .current_dir(&cwd) @@ -72,7 +75,7 @@ fn finalize_does_not_warn_about_env_scoped_guard_from_cwd() { .args(["--skill", "mr-review", "--iteration", "1"]) .assert() .success() - .stdout(contains("Guard still armed").not()); + .stdout(contains("Guard still armed")); assert!(marker.exists()); } @@ -105,8 +108,8 @@ fn finalize_does_not_warn_when_guard_is_not_armed() { fn teardown_reclaims_workspace_and_env_guard() { let tmp = tempfile::TempDir::new().unwrap(); let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - let settings = env_dir(&cwd).join(".claude/settings.local.json"); - let staged = env_dir(&cwd).join(".claude/skills"); + let settings = cli_env_dir(&cwd, "g1", "with_skill").join(".claude/settings.local.json"); + let staged = cli_env_dir(&cwd, "g1", "with_skill").join(".claude/skills"); skill_eval() .current_dir(&cwd) @@ -355,8 +358,8 @@ fn runs_flag_expands_dispatches_into_run_dirs() { .join(cond) .join(format!("run-{k}")); assert!(run_dir.is_dir(), "missing meta run dir {run_dir:?}"); - // Per-run outputs dir inside the env. - let out_dir = env_dir(&cwd) + // Per-run outputs dir inside the condition's env. + let out_dir = cli_env_dir(&cwd, "g1", cond) .join(".eval-magic-outputs") .join(format!("eval-{eval}")) .join(cond) @@ -396,8 +399,10 @@ fn runs_one_keeps_flat_single_run_layout() { let cond_dir = iteration_dir(&cwd).join("eval-e1").join("with_skill"); assert!(cond_dir.is_dir()); assert!(!cond_dir.join("run-1").exists()); - // Outputs live inside the env, flat (no run-1/ segment) for a single-run cell. - let out_dir = env_dir(&cwd).join(".eval-magic-outputs/eval-e1/with_skill"); + // Outputs live inside the condition's env, flat (no run-1/ segment) for a + // single-run cell. 
+ let out_dir = + cli_env_dir(&cwd, "g1", "with_skill").join(".eval-magic-outputs/eval-e1/with_skill"); assert!(out_dir.is_dir()); assert!(!out_dir.join("run-1").exists()); } diff --git a/tests/run/main.rs b/tests/run/main.rs index 74ef8d2..42cd0f7 100644 --- a/tests/run/main.rs +++ b/tests/run/main.rs @@ -15,7 +15,5 @@ mod env_layout; mod grouping; mod lifecycle; mod opencode; -mod reset_batch; mod runbook; mod staging; -mod switch_condition; diff --git a/tests/run/reset_batch.rs b/tests/run/reset_batch.rs deleted file mode 100644 index f2b7247..0000000 --- a/tests/run/reset_batch.rs +++ /dev/null @@ -1,118 +0,0 @@ -//! `reset-batch`: the per-group isolation barrier for a single-session (in-session) -//! isolated run. Between eval-group batches it wipes the shared `env/` working tree -//! — keeping the staged skills and the outputs tree — and re-seeds it with the next -//! group's fixtures, so a prior batch's fixtures and stray writes can't leak. - -use crate::helpers::*; -use predicates::str::contains; -use std::fs; -use std::path::Path; - -const WITH_SLUG: &str = "slow-powers-eval-1-with_skill__mr-review"; - -/// Two evals routed into two groups: e2's `isolation: isolated` hint forces its own -/// group, so the in-session env stages group g1 (e1/a.txt) up front and swaps in -/// group g2 (e2/b.txt) via reset-batch. -const TWO_GROUPS: &str = r#"{ "skill_name": "mr-review", "evals": [ - { "id": "e1", "prompt": "p1", "expected_output": "o", "files": ["a.txt"] }, - { "id": "e2", "prompt": "p2", "expected_output": "o", "files": ["b.txt"], "isolation": "isolated" } ] }"#; - -/// Stage a two-group interactive iteration; returns `(skill_dir, cwd)` with `env/` -/// holding group g1's fixtures. 
-fn setup_two_groups(root: &Path) -> (std::path::PathBuf, std::path::PathBuf) { - let (skill_dir, cwd) = setup(root, TWO_GROUPS); - fs::write(skill_dir.join("mr-review/evals/a.txt"), "AAA").unwrap(); - fs::write(skill_dir.join("mr-review/evals/b.txt"), "BBB").unwrap(); - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"]) - .assert() - .success(); - (skill_dir, cwd) -} - -/// Run `reset-batch` the way the runbook prescribes: from inside `env/`, carrying -/// only the self-sufficient `--skill-dir/--skill/--workspace-dir` selector. -fn reset_to(cwd: &Path, skill_dir: &Path, group: &str) -> assert_cmd::assert::Assert { - skill_eval() - .current_dir(env_dir(cwd)) - .args(["reset-batch", "--skill-dir"]) - .arg(skill_dir) - .args(["--skill", "mr-review", "--workspace-dir"]) - .arg(cwd.join(".eval-magic")) - .args(["--iteration", "1", "--group", group]) - .assert() -} - -#[test] -fn reset_batch_wipes_working_tree_and_reseeds_group_fixtures() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup_two_groups(tmp.path()); - - // Up front the env holds group g1's fixture only. - assert_eq!(read_str(&env_dir(&cwd).join("a.txt")), "AAA"); - assert!(!env_dir(&cwd).join("b.txt").exists()); - // Simulate a stray file the g1 batch's agent wrote into the env. - fs::write(env_dir(&cwd).join("stray.txt"), "STRAY").unwrap(); - - reset_to(&cwd, &skill_dir, "g2").success(); - - // The env is now seeded for g2: its fixture present, g1's gone, the stray write - // gone — a clean tree. - assert_eq!(read_str(&env_dir(&cwd).join("b.txt")), "BBB"); - assert!(!env_dir(&cwd).join("a.txt").exists()); - assert!(!env_dir(&cwd).join("stray.txt").exists()); - - // The staged skill and the outputs tree survive the wipe. 
- assert!( - env_dir(&cwd) - .join(".claude/skills") - .join(WITH_SLUG) - .is_dir(), - "the staged skill survives reset-batch" - ); - assert!(env_dir(&cwd).join(".eval-magic-outputs").exists()); -} - -#[test] -fn reset_batch_can_restore_the_first_group() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup_two_groups(tmp.path()); - - // Move to g2, then back to g1 (as condition B's loop does after condition A left - // the env on the last group). - reset_to(&cwd, &skill_dir, "g2").success(); - reset_to(&cwd, &skill_dir, "g1").success(); - assert_eq!(read_str(&env_dir(&cwd).join("a.txt")), "AAA"); - assert!(!env_dir(&cwd).join("b.txt").exists()); -} - -#[test] -fn reset_batch_rejects_unknown_group() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup_two_groups(tmp.path()); - reset_to(&cwd, &skill_dir, "g99") - .failure() - .stderr(contains("unknown --group")); -} - -#[test] -fn reset_batch_on_single_group_run_explains_it_is_unneeded() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"]) - .assert() - .success(); - - // A single-group run tags no task with a group, so reset-batch has nothing to do - // and says so rather than silently wiping. - reset_to(&cwd, &skill_dir, "g1") - .failure() - .stderr(contains("single group")); -} diff --git a/tests/run/runbook.rs b/tests/run/runbook.rs index e4c4a62..b1eb688 100644 --- a/tests/run/runbook.rs +++ b/tests/run/runbook.rs @@ -2,65 +2,6 @@ //! artifact, and the post-run pointer at it. 
use crate::helpers::*; -use predicates::prelude::PredicateBooleanExt; -use predicates::str::contains; - -#[test] -fn run_writes_interactive_runbook_and_points_at_it() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // A real run (not --dry-run) so the post-run "Next:" handoff prints; --dry-run - // stops before next steps by contract. - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review"]) - .assert() - .success() - // The summary hands off to a fresh isolated session: cd into env/, then - // "Read and follow RUNBOOK.md". It must not re-print the dispatch loop — - // that lives only in RUNBOOK.md now (the session-juggling apparatus is gone). - // (The exact env path in the handoff is locked by the util.rs unit test; - // here we just confirm the handoff is wired into stdout.) - .stdout(contains("Read and follow RUNBOOK.md")) - .stdout(contains("1. cd ")) - .stdout(contains("one batch at a time").not()); - - // The runbook lives inside the isolated env — the session's cwd reads it. - assert!(!iteration_dir(&cwd).join("RUNBOOK.md").exists()); - let book = read_str(&env_dir(&cwd).join("RUNBOOK.md")); - assert!(book.contains("mr-review"), "names the skill: {book}"); - assert!( - book.contains("with_skill") && book.contains("without_skill"), - "names both conditions: {book}" - ); - assert!( - book.contains("agent_description"), - "carries the in-session dispatch guidance: {book}" - ); - // The per-condition batch loop: a switch-condition barrier between the two - // batches, carrying the absolute --workspace-dir so it resolves from env/. 
- assert!( - book.contains("eval-magic switch-condition --skill-dir") - && book.contains("--workspace-dir") - && book.contains("--condition without_skill"), - "carries the switch-condition barrier between batches: {book}" - ); - assert!( - book.contains("eval-magic ingest --skill-dir"), - "carries the ingest command: {book}" - ); - assert!( - book.contains("eval-magic finalize --skill-dir"), - "carries the finalize command: {book}" - ); - assert!( - book.contains("benchmark.json"), - "points at the result: {book}" - ); - assert!(!book.contains("{{"), "no unsubstituted tokens: {book}"); -} #[test] fn run_writes_headless_runbook_for_codex() { @@ -75,8 +16,12 @@ fn run_writes_headless_runbook_for_codex() { .success(); // Cli dispatches from per-(group, condition) envs, so the human-followed runbook - // lives in the iteration dir (there is no single env/). - assert!(!env_dir(&cwd).join("RUNBOOK.md").exists()); + // lives in the iteration dir, not inside any env. + assert!( + !cli_env_dir(&cwd, "g1", "with_skill") + .join("RUNBOOK.md") + .exists() + ); let book = read_str(&iteration_dir(&cwd).join("RUNBOOK.md")); assert!( book.contains("human driving"), diff --git a/tests/run/staging.rs b/tests/run/staging.rs index aafc2b5..e0f3bc0 100644 --- a/tests/run/staging.rs +++ b/tests/run/staging.rs @@ -64,9 +64,9 @@ fn run_from_skill_dir_defaults_to_new_skill_without_staging_siblings() { .assert() .success() .stdout(contains("Preparing mr-review iteration-1 (new-skill)")) - // The run summary now hands off to the isolated session; the pipeline - // commands live in the RUNBOOK (asserted below), not the printed summary. - .stdout(contains("Read and follow RUNBOOK.md")); + // The run summary points at the human-followed RUNBOOK (a copy of the dispatch + // steps); the auto-derived pipeline commands are threaded into it (asserted below). 
+ .stdout(contains("RUNBOOK.md")); assert!( direct_iteration_dir(&skill_sub) @@ -79,13 +79,10 @@ fn run_from_skill_dir_defaults_to_new_skill_without_staging_siblings() { ); // Run from inside the skill dir with no args: the auto-derived target selector - // (`command_target_args`) is threaded into the RUNBOOK's pipeline commands. - let runbook = read_str( - &direct_iteration_dir(&skill_sub) - .join("env") - .join("RUNBOOK.md"), - ); - assert!(runbook.contains("eval-magic ingest --skill-dir")); + // (`command_target_args`) is threaded into the RUNBOOK's pipeline commands. The + // RUNBOOK lives in the iteration dir (Cli dispatch has no single env/). + let runbook = read_str(&direct_iteration_dir(&skill_sub).join("RUNBOOK.md")); + assert!(runbook.contains("ingest --skill-dir")); assert!(runbook.contains("--skill mr-review --workspace-dir")); assert!(runbook.contains("--iteration 1")); @@ -194,7 +191,7 @@ fn stage_name_threads_verbatim_name_and_registers_cleanup() { .assert() .success(); - let skills_dir = env_dir(&cwd).join(".claude/skills"); + let skills_dir = cli_env_dir(&cwd, "g1", "with_skill").join(".claude/skills"); assert_eq!(staged_entries(&skills_dir), vec!["mr-review"]); let conditions = read_json(&iteration_dir(&cwd).join("conditions.json")); @@ -231,10 +228,10 @@ fn stage_name_threads_verbatim_name_and_registers_cleanup() { fn stage_name_refuses_to_clobber_preexisting_dir() { let tmp = tempfile::TempDir::new().unwrap(); let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // Staging now lands in env/.claude/skills, which is fresh per iteration. - // The clobber guard still matters on a re-run (--iteration 1) where the env - // already holds an untracked skill dir; pre-seed that and confirm it is preserved. - let preexisting = env_dir(&cwd).join(".claude/skills/my-real-skill"); + // Staging now lands in env-g1-with_skill/.claude/skills, which is fresh per + // iteration. 
The clobber guard still matters on a re-run (--iteration 1) where the + // env already holds an untracked skill dir; pre-seed that and confirm it is preserved. + let preexisting = cli_env_dir(&cwd, "g1", "with_skill").join(".claude/skills/my-real-skill"); fs::create_dir_all(&preexisting).unwrap(); fs::write(preexisting.join("SKILL.md"), "USER OWNED").unwrap(); diff --git a/tests/run/switch_condition.rs b/tests/run/switch_condition.rs deleted file mode 100644 index cd39279..0000000 --- a/tests/run/switch_condition.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! `switch-condition`: the per-condition read-isolation barrier for a -//! single-session isolated run. It removes the off-condition's staged skill from -//! `env/.claude/skills/` between dispatch batches, and must resolve the iteration -//! tree while invoked from `cwd = env/`. - -use crate::helpers::*; -use std::path::{Path, PathBuf}; - -const WITH_SLUG: &str = "slow-powers-eval-1-with_skill__mr-review"; - -fn env_skills_dir(cwd: &Path) -> PathBuf { - env_dir(cwd).join(".claude/skills") -} - -/// Run `switch-condition` the way the runbook prescribes: from inside `env/`, -/// carrying only the self-sufficient `--skill-dir/--skill/--workspace-dir` selector. -fn switch_to(cwd: &Path, skill_dir: &Path, condition: &str) -> assert_cmd::assert::Assert { - skill_eval() - .current_dir(env_dir(cwd)) - .args(["switch-condition", "--skill-dir"]) - .arg(skill_dir) - .args(["--skill", "mr-review", "--workspace-dir"]) - .arg(cwd.join(".eval-magic")) - .args(["--iteration", "1", "--condition", condition]) - .assert() -} - -#[test] -fn switch_condition_removes_off_condition_slug_from_env_cwd() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // Build the env (staging happens even under --dry-run). 
- skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"]) - .assert() - .success(); - - let with_slug = env_skills_dir(&cwd).join(WITH_SLUG); - assert!(with_slug.is_dir(), "with_skill staged before switch"); - - // Move to the without_skill batch: the off-condition (with_skill) staged skill - // is removed so the control arm cannot read it. - switch_to(&cwd, &skill_dir, "without_skill").success(); - - assert!(!with_slug.exists(), "with_skill slug removed after switch"); -} - -#[test] -fn switch_condition_is_idempotent() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"]) - .assert() - .success(); - - // Two switches in a row: the second is a no-op, not an error (a re-run after a - // fix, or an over-eager operator, must stay safe). - switch_to(&cwd, &skill_dir, "without_skill").success(); - switch_to(&cwd, &skill_dir, "without_skill").success(); - assert!(!env_skills_dir(&cwd).join(WITH_SLUG).exists()); -} - -#[test] -fn switch_condition_preserves_guard_marker() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // A guarded run arms the write guard; --guard requires a real (non-dry) run. - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "new-skill", "--guard"]) - .assert() - .success(); - - // The guard marker is a sibling file of the slug subtree inside the skills dir. 
- let marker = env_skills_dir(&cwd).join(".slow-powers-eval-guard.json"); - assert!(marker.exists(), "guard armed before switch"); - - switch_to(&cwd, &skill_dir, "without_skill").success(); - - assert!( - !env_skills_dir(&cwd).join(WITH_SLUG).exists(), - "slug removed" - ); - assert!(marker.exists(), "guard marker survives the switch"); -} - -#[test] -fn switch_condition_rejects_unknown_condition() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"]) - .assert() - .success(); - - switch_to(&cwd, &skill_dir, "bogus_condition") - .failure() - .stderr(predicates::str::contains( - "unknown --condition 'bogus_condition'", - )); - // A typo must not silently leave the staged skill in place under a false sense - // of isolation. - assert!(env_skills_dir(&cwd).join(WITH_SLUG).is_dir()); -} - -#[test] -fn switch_condition_revision_removes_old_skill_keeps_new() { - let tmp = tempfile::TempDir::new().unwrap(); - let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS); - // Revision mode compares a baseline snapshot (old_skill) against the working - // SKILL.md (new_skill); both arms stage a skill. Seed the baseline snapshot. 
- let snapshot = iteration_dir(&cwd) - .parent() - .unwrap() - .join("snapshots") - .join("baseline"); - std::fs::create_dir_all(&snapshot).unwrap(); - std::fs::write( - snapshot.join("SKILL.md"), - "---\nname: mr-review\ndescription: review merge requests\n---\n\nold body\n", - ) - .unwrap(); - - skill_eval() - .current_dir(&cwd) - .args(["run", "--skill-dir"]) - .arg(&skill_dir) - .args(["--skill", "mr-review", "--mode", "revision", "--dry-run"]) - .assert() - .success(); - - let old_slug = env_skills_dir(&cwd).join("slow-powers-eval-1-old_skill__mr-review"); - let new_slug = env_skills_dir(&cwd).join("slow-powers-eval-1-new_skill__mr-review"); - assert!(old_slug.is_dir() && new_slug.is_dir(), "both arms staged"); - - // Switch to the new_skill batch: only the old_skill slug is removed. - switch_to(&cwd, &skill_dir, "new_skill").success(); - - assert!(!old_slug.exists(), "old_skill slug removed"); - assert!(new_slug.is_dir(), "new_skill slug kept"); -}