From 7dfe1a2eb771069a9c26f0cb4962f29a79db17ad Mon Sep 17 00:00:00 2001 From: Max Haarhaus Date: Wed, 24 Jun 2026 22:18:46 -0400 Subject: [PATCH] fix(cli-mode): fix runbook framing and permission issues for cli mode --- src/adapters/claude_cli.rs | 19 ++- src/adapters/codex_cli.rs | 22 +++- src/adapters/harness.rs | 15 ++- src/cli/commands/pipeline.rs | 17 ++- src/cli/run/dispatch.rs | 8 +- src/cli/run/orchestrate/build.rs | 10 +- src/cli/run/runbook.rs | 5 +- src/pipeline/record_runs.rs | 200 ++++++++++++++++++++++++++++++- 8 files changed, 266 insertions(+), 30 deletions(-) diff --git a/src/adapters/claude_cli.rs b/src/adapters/claude_cli.rs index 2377902..2924b49 100644 --- a/src/adapters/claude_cli.rs +++ b/src/adapters/claude_cli.rs @@ -10,6 +10,7 @@ //! block on a TTY and piped task data cannot become extra prompt context. use super::cli_command::render_cli_model_arg; +use std::path::Path; /// Copy/pasteable Claude Code dispatch command template. pub(crate) fn claude_exec_command_template( @@ -56,8 +57,13 @@ pub(crate) fn claude_parallel_dispatch_recipe( } /// Judge dispatch recipe over `judge-tasks.json`, one `claude -p` per task. -pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String { +/// +/// Judges run from `judge_cwd` (the iteration dir) — a common ancestor of every +/// judge prompt, verdict `response_path`, and agent `outputs_dir`, and a dir with +/// no write-guard hook. +pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>, judge_cwd: &Path) -> String { let model_flag = model_flag.unwrap_or("--model"); + let cwd = judge_cwd.display(); [ "Dispatch each judge task from judge-tasks.json with:".to_string(), String::new(), @@ -73,7 +79,9 @@ pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String { " model_arg=\"\"; [ -n \"$model\" ] && model_arg=\"".to_string() + model_flag + " $model\"", - " cd && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\".to_string(), + format!( + " cd \"{cwd}\" && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\" + ), " \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(), " \"$response_base.claude-events.jsonl\" \\".to_string(), @@ -89,6 +97,7 @@ mod tests { use super::{ claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe, }; + use std::path::Path; #[test] fn exec_template_carries_required_stream_json_flags() { @@ -127,10 +136,12 @@ mod tests { } #[test] - fn judge_recipe_drives_claude_p() { - let recipe = claude_judge_dispatch_recipe(Some("--model")); + fn judge_recipe_runs_from_iteration_dir() { + let recipe = claude_judge_dispatch_recipe(Some("--model"), Path::new("/work/iter-1")); assert!(recipe.contains("claude -p"), "{recipe}"); assert!(recipe.contains("judge-tasks.json"), "{recipe}"); assert!(recipe.contains("response_path"), "{recipe}"); + assert!(!recipe.contains(""), "{recipe}"); + assert!(recipe.contains("cd \"/work/iter-1\" &&"), "{recipe}"); } } diff --git a/src/adapters/codex_cli.rs b/src/adapters/codex_cli.rs index 8b6935a..de64db0 100644 --- a/src/adapters/codex_cli.rs +++ b/src/adapters/codex_cli.rs @@ -1,6 +1,7 @@ //! Codex CLI command rendering for `DispatchMechanism::Cli` guidance. use super::cli_command::render_cli_model_arg; +use std::path::Path; /// Copy/pasteable Codex dispatch command template. Stdin is detached so a /// surrounding `xargs`/pipe cannot be treated as extra prompt context. @@ -60,13 +61,20 @@ pub(crate) fn codex_parallel_dispatch_recipe( .join("\n") } -pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool) -> String { +/// Judges run from `judge_cwd` (the iteration dir) — a common ancestor of every +/// judge prompt, verdict `response_path`, and agent `outputs_dir`. +pub(crate) fn codex_judge_dispatch_recipe( + model_flag: Option<&str>, + guard: bool, + judge_cwd: &Path, +) -> String { let hook_trust = if guard { " --dangerously-bypass-hook-trust" } else { "" }; let model_flag = model_flag.unwrap_or("-m"); + let cwd = judge_cwd.display(); [ "Dispatch each judge task from judge-tasks.json with:".to_string(), String::new(), @@ -81,7 +89,7 @@ pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool) " mkdir -p \"$(dirname \"$response_path\")\"".to_string(), " if [ -n \"$model\" ]; then".to_string(), format!( - " codex --ask-for-approval never exec --cd --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\" + " codex --ask-for-approval never exec --cd \"{cwd}\" --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\" ), " \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(), " , guard: bool) " 2> \"$response_base.codex-stderr.log\"".to_string(), " else".to_string(), format!( - " codex --ask-for-approval never exec --cd --sandbox workspace-write{hook_trust} --json \\" + " codex --ask-for-approval never exec --cd \"{cwd}\" --sandbox workspace-write{hook_trust} --json \\" ), " \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(), " --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\" + " codex --ask-for-approval never exec --cd \"/work/iter-1\" --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\" ), "{recipe}" ); assert!( recipe.contains( - " codex --ask-for-approval never exec --cd --sandbox workspace-write --dangerously-bypass-hook-trust --json \\" + " codex --ask-for-approval never exec --cd \"/work/iter-1\" --sandbox workspace-write --dangerously-bypass-hook-trust --json \\" ), "{recipe}" ); + assert!(!recipe.contains(""), "{recipe}"); } } diff --git a/src/adapters/harness.rs b/src/adapters/harness.rs index 725a737..07426c3 100644 --- a/src/adapters/harness.rs +++ b/src/adapters/harness.rs @@ -120,7 +120,7 @@ pub trait HarnessAdapter { /// The post-`grade` / post-`ingest` judge dispatch guidance for a /// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness. `None` /// leaves the generic in-session-style judge handoff in place. - fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option { + fn cli_judge_next_steps(&self, _ctx: CliJudgeContext<'_>) -> Option { None } @@ -195,8 +195,9 @@ pub struct CliManifestContext<'a> { /// Context for rendering a harness's one-shot CLI judge-dispatch guidance. #[derive(Debug, Clone, Copy)] -pub struct CliJudgeContext { +pub struct CliJudgeContext<'a> { pub guard: bool, + pub iteration_dir: &'a Path, } impl HarnessAdapter for ClaudeCodeAdapter { @@ -261,8 +262,11 @@ impl HarnessAdapter for ClaudeCodeAdapter { String::new(), ]) } - fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option { - Some(claude_judge_dispatch_recipe(self.cli_model_flag())) + fn cli_judge_next_steps(&self, ctx: CliJudgeContext<'_>) -> Option { + Some(claude_judge_dispatch_recipe( + self.cli_model_flag(), + ctx.iteration_dir, + )) } fn parse_transcript(&self, path: &Path) -> io::Result> { parse_transcript(path) @@ -350,10 +354,11 @@ impl HarnessAdapter for CodexAdapter { String::new(), ]) } - fn cli_judge_next_steps(&self, ctx: CliJudgeContext) -> Option { + fn cli_judge_next_steps(&self, ctx: CliJudgeContext<'_>) -> Option { Some(codex_judge_dispatch_recipe( self.cli_model_flag(), ctx.guard, + ctx.iteration_dir, )) } fn parse_transcript(&self, path: &Path) -> io::Result> { diff --git a/src/cli/commands/pipeline.rs b/src/cli/commands/pipeline.rs index 501ae96..ab185e0 100644 --- a/src/cli/commands/pipeline.rs +++ b/src/cli/commands/pipeline.rs @@ -18,7 +18,11 @@ use crate::validation; const JUDGE_WORKER_PROMPT: &str = "Read the file at and follow it exactly. You are a judge worker only: write the JSON verdict to , then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers."; -fn judge_dispatch_guidance(ctx: &RunContext) -> String { +fn judge_dispatch_guidance(ctx: &RunContext, iteration: u32) -> String { + let iteration_dir = ctx + .workspace_root + .join(&ctx.skill_name) + .join(format!("iteration-{iteration}")); match ctx.run_mode.mechanism() { DispatchMechanism::InSession => { format!("Dispatch each task as a judge subagent with:\n {JUDGE_WORKER_PROMPT}") @@ -26,6 +30,7 @@ fn judge_dispatch_guidance(ctx: &RunContext) -> String { DispatchMechanism::Cli => adapter_for(ctx.harness) .cli_judge_next_steps(CliJudgeContext { guard: sandbox::guard_is_armed(&ctx.stage_root), + iteration_dir: &iteration_dir, }) .unwrap_or_else(|| { format!( @@ -107,7 +112,7 @@ pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> { .and_then(|s| serde_json::from_str::(&s).ok()) .and_then(|v| v.get("total_tasks").and_then(serde_json::Value::as_u64)); let target_args = command_target_args(&ctx); - let judge_guidance = judge_dispatch_guidance(&ctx); + let judge_guidance = judge_dispatch_guidance(&ctx, iteration); match total_tasks { Some(0) => println!( "\n✅ Ingest complete — no judge dispatches needed.\nNext: eval-magic finalize{target_args} --iteration {iteration}" @@ -351,15 +356,19 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> { pipeline::record_runs(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?; println!( - "\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, missing transcript: {}", + "\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, skipped (prompt unread): {}, missing transcript: {}", result.recorded, result.skipped_existing, result.skipped_no_final_message, + result.skipped_prompt_unread, result.missing_transcript ); if let Some(warning) = result.transcript_warning(ctx.harness, mechanism) { eprintln!("{warning}"); } + if let Some(warning) = result.prompt_unread_warning() { + eprintln!("{warning}"); + } Ok(()) } @@ -502,7 +511,7 @@ pub(crate) fn run_grade(args: GradeArgs) -> anyhow::Result<()> { ); } let target_args = command_target_args(&ctx); - let judge_guidance = judge_dispatch_guidance(&ctx); + let judge_guidance = judge_dispatch_guidance(&ctx, iteration); println!( "\nNext: {judge_guidance}\nThen run: eval-magic grade{target_args} --iteration {iteration} --finalize" ); diff --git a/src/cli/run/dispatch.rs b/src/cli/run/dispatch.rs index 87ddf94..e610166 100644 --- a/src/cli/run/dispatch.rs +++ b/src/cli/run/dispatch.rs @@ -270,7 +270,7 @@ pub fn build_dispatch_task(opts: &DispatchTaskOpts) -> Result format!("eval-{}/{cond_name}", ev.id), Some(k) => format!("eval-{}/{cond_name}/run-{k}", ev.id), diff --git a/src/cli/run/runbook.rs b/src/cli/run/runbook.rs index fb76b90..ac0b921 100644 --- a/src/cli/run/runbook.rs +++ b/src/cli/run/runbook.rs @@ -172,7 +172,10 @@ pub(crate) fn build_runbook(ctx: &RunbookContext) -> String { agent_model: ctx.agent_model, }); judge_recipe = adapter - .cli_judge_next_steps(CliJudgeContext { guard: ctx.guard }) + .cli_judge_next_steps(CliJudgeContext { + guard: ctx.guard, + iteration_dir: ctx.iteration_dir, + }) .unwrap_or_else(|| { "Dispatch each judge task `ingest` listed through the same harness CLI, \ capturing its transcript output, then finalize." diff --git a/src/pipeline/record_runs.rs b/src/pipeline/record_runs.rs index 3d84277..c97034b 100644 --- a/src/pipeline/record_runs.rs +++ b/src/pipeline/record_runs.rs @@ -31,8 +31,7 @@ struct DispatchFile { } /// The subset of a dispatch task record-runs consumes. `dispatch.json` carries -/// more fields (e.g. `staged_skill_slug`, `dispatch_prompt_path`); serde ignores -/// the extras. +/// more fields (e.g. `staged_skill_slug`); serde ignores the extras. #[derive(Debug, Deserialize)] struct DispatchTask { eval_id: String, @@ -46,6 +45,8 @@ struct DispatchTask { run_record_path: String, timing_path: String, agent_description: String, + #[serde(default)] + dispatch_prompt_path: String, } /// Tally of what record-runs did across the dispatch's tasks. @@ -55,6 +56,7 @@ pub struct RecordRunsResult { pub skipped_existing: usize, pub skipped_no_final_message: usize, pub missing_transcript: usize, + pub skipped_prompt_unread: usize, } impl RecordRunsResult { @@ -102,6 +104,24 @@ impl RecordRunsResult { assertions will grade unverifiable." )) } + + /// A loud, actionable warning when one or more dispatches were excluded + /// because their transcript shows a failed read of the dispatch prompt — the + /// agent never received its instructions, so the result is a no-op, not data. + /// `None` when none were flagged. + pub fn prompt_unread_warning(&self) -> Option { + if self.skipped_prompt_unread == 0 { + return None; + } + let n = self.skipped_prompt_unread; + let plural = if n == 1 { "" } else { "es" }; + Some(format!( + "⚠ {n} dispatch{plural} skipped — the transcript shows a failed read of the dispatch \ + prompt (the agent never received its instructions). These are NOT recorded, so they \ + cannot be graded as data. Check the env/sandbox can reach each task's \ + `dispatch_prompt_path`, then re-dispatch." + )) + } } /// Assemble `run.json` + `timing.json` for every task in @@ -137,6 +157,19 @@ pub fn record_runs( if run_record_path.exists() && !overwrite { // An agent/operator already wrote this run.json — leave it untouched. result.skipped_existing += 1; + } else if let Some(summary) = &summary + && prompt_read_failed( + summary, + &task.dispatch_prompt_path, + &prompt_sentinel(&task.dispatch_prompt_path), + ) + { + // The transcript shows the agent tried to read its prompt and the + // read returned an error, not the prompt — it never received its + // instructions. Skip both run.json and timing so the no-op can't be + // graded as data. + result.skipped_prompt_unread += 1; + continue; } else { let final_message_path = Path::new(&task.outputs_dir).join("final-message.md"); let final_message = if final_message_path.exists() { @@ -192,6 +225,58 @@ pub fn record_runs( Ok(result) } +/// Positive evidence that the agent tried to read its dispatch prompt and +/// failed: the transcript has a tool call referencing `prompt_path`, yet no such +/// call returned the prompt's content (its distinctive first-line `sentinel`). +/// +/// A run that never references the prompt path is NOT flagged — absence is not +/// proof of failure (an in-session subagent can receive the prompt another way), +/// and requiring positive evidence keeps the check free of false positives. +/// Returns `false` when `sentinel` is empty (the prompt file was missing or +/// unreadable, so the read cannot be judged). +fn prompt_read_failed(summary: &TranscriptSummary, prompt_path: &str, sentinel: &str) -> bool { + if sentinel.is_empty() { + return false; + } + let mut referenced = false; + let mut delivered = false; + for inv in &summary.tool_invocations { + let mentions_prompt = inv + .args + .as_ref() + .is_some_and(|a| a.to_string().contains(prompt_path)); + if !mentions_prompt { + continue; + } + referenced = true; + if inv + .result + .as_ref() + .and_then(serde_json::Value::as_str) + .is_some_and(|r| r.contains(sentinel)) + { + delivered = true; + } + } + referenced && !delivered +} + +/// The dispatch prompt's distinctive first non-empty line, used as the sentinel +/// for [`prompt_read_failed`]. Empty when the prompt file is missing/unreadable. +fn prompt_sentinel(prompt_path: &str) -> String { + if prompt_path.is_empty() { + return String::new(); + } + fs::read_to_string(prompt_path) + .ok() + .and_then(|p| { + p.lines() + .find(|l| !l.trim().is_empty()) + .map(|l| l.trim().to_string()) + }) + .unwrap_or_default() +} + /// Resolve a task's transcript summary, keyed on the dispatch mechanism: a /// `Cli`-mechanism harness reads the events file its CLI wrote under the task's /// outputs dir (e.g. Codex's `codex-events.jsonl`); an `InSession` harness reads @@ -299,6 +384,117 @@ mod tests { fs::write(outputs_dir.join("claude-events.jsonl"), jsonl(&lines)).unwrap(); } + /// A `claude -p` events fixture where the agent reads its dispatch prompt: + /// a `Read` tool call whose `input.file_path` is `prompt_path`, a + /// `tool_result` carrying `read_result` (the file content on success, an + /// error string on a denied/out-of-cwd read), and a terminal `result` event. + fn write_claude_events_prompt_read( + outputs_dir: &Path, + prompt_path: &str, + read_result: &str, + final_text: &str, + ) { + let lines = vec![ + json!({"type": "system", "subtype": "init", "cwd": "/env"}), + json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [{"type": "tool_use", "id": "toolu_1", "name": "Read", "input": {"file_path": prompt_path}}]}}), + json!({"type": "user", "message": {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "toolu_1", "content": read_result}]}}), + json!({"type": "result", "subtype": "success", "is_error": false, "result": final_text, "duration_ms": 30_000, "usage": {"input_tokens": 100, "output_tokens": 20, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 5}}), + ]; + fs::write(outputs_dir.join("claude-events.jsonl"), jsonl(&lines)).unwrap(); + } + + const PROMPT_SENTINEL: &str = + "You are executing a single test case for a skill evaluation framework."; + + #[test] + fn flags_dispatch_whose_prompt_read_failed() { + // A dispatch that couldn't read its prompt still exits 0 and emits a + // final message — but the run is a silent no-op, not data (issue #109). + let tmp = TempDir::new().unwrap(); + let iter = tmp.path(); + let paths = write_iteration( + iter, + &[FixtureTask { + eval_id: "e1", + condition: "with_skill", + final_message: Some("I could not read the prompt file."), + }], + ); + let prompt_path = iter + .join("eval-e1") + .join("with_skill") + .join("dispatch-prompt.txt"); + fs::write( + &prompt_path, + format!("{PROMPT_SENTINEL}\n\nUser request:\ndo it"), + ) + .unwrap(); + // The transcript shows a Read of the prompt path that ERRORED — the + // result is a denial, not the prompt content. + write_claude_events_prompt_read( + &paths[0].outputs_dir, + &prompt_path.to_string_lossy(), + "File is outside the allowed working directory.", + "I could not read the prompt file.", + ); + + let result = record_runs( + iter, + Harness::ClaudeCode, + DispatchMechanism::Cli, + None, + false, + ) + .unwrap(); + + assert_eq!(result.skipped_prompt_unread, 1); + assert_eq!(result.recorded, 0); + assert!(!paths[0].run_record_path.exists()); + } + + #[test] + fn records_dispatch_when_prompt_read_succeeded() { + // The same shape, but the Read returned the prompt content (Read echoes + // it with a line-number prefix) — a legitimate run, recorded as data. + let tmp = TempDir::new().unwrap(); + let iter = tmp.path(); + let paths = write_iteration( + iter, + &[FixtureTask { + eval_id: "e1", + condition: "with_skill", + final_message: Some("Done."), + }], + ); + let prompt_path = iter + .join("eval-e1") + .join("with_skill") + .join("dispatch-prompt.txt"); + fs::write( + &prompt_path, + format!("{PROMPT_SENTINEL}\n\nUser request:\ndo it"), + ) + .unwrap(); + write_claude_events_prompt_read( + &paths[0].outputs_dir, + &prompt_path.to_string_lossy(), + &format!(" 1→{PROMPT_SENTINEL}\n 2→\n 3→User request:"), + "Done.", + ); + + let result = record_runs( + iter, + Harness::ClaudeCode, + DispatchMechanism::Cli, + None, + false, + ) + .unwrap(); + + assert_eq!(result.recorded, 1); + assert_eq!(result.skipped_prompt_unread, 0); + } + struct FixtureTask { eval_id: &'static str, condition: &'static str,