slowdini · slowdini · Jun 25, 2026 · Jun 21, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "eval-magic"
-version = "0.4.0"
+version = "0.4.1"
 edition = "2024"
 description = "One-stop CLI for running skill evals — measure whether an agent skill actually shifts behavior."
 license = "MIT"

diff --git a/src/adapters/claude_cli.rs b/src/adapters/claude_cli.rs
@@ -10,6 +10,7 @@
 //! block on a TTY and piped task data cannot become extra prompt context.
 
 use super::cli_command::render_cli_model_arg;
+use std::path::Path;
 
 /// Copy/pasteable Claude Code dispatch command template.
 pub(crate) fn claude_exec_command_template(
@@ -56,8 +57,22 @@ pub(crate) fn claude_parallel_dispatch_recipe(
 }
 
 /// Judge dispatch recipe over `judge-tasks.json`, one `claude -p` per task.
-pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
+///
+/// Judges run from `judge_cwd` (the iteration dir): it is a common ancestor of
+/// every judge prompt, verdict `response_path`, and agent `outputs_dir`, and it
+/// has no write-guard hook.
+///
+/// `judge_cwd` is rendered inside a double-quoted shell word, so the characters
+/// that stay special there (`\`, `"`, `$`, and backtick) are backslash-escaped.
+pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>, judge_cwd: &Path) -> String {
     let model_flag = model_flag.unwrap_or("--model");
+    let cwd = judge_cwd
+        .display()
+        .to_string()
+        .replace('\\', "\\\\")
+        .replace('"', "\\\"")
+        .replace('$', "\\$")
+        .replace('`', "\\`");
     [
         "Dispatch each judge task from judge-tasks.json with:".to_string(),
         String::new(),
@@ -73,7 +79,9 @@ pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
         "    model_arg=\"\"; [ -n \"$model\" ] && model_arg=\"".to_string()
             + model_flag
             + " $model\"",
-        "    cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\".to_string(),
+        format!(
+            "    cd \"{cwd}\" && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\"
+        ),
         "      \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
         "      </dev/null \\".to_string(),
         "      > \"$response_base.claude-events.jsonl\" \\".to_string(),
@@ -89,6 +97,7 @@ mod tests {
     use super::{
         claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe,
     };
+    use std::path::Path;
 
     #[test]
     fn exec_template_carries_required_stream_json_flags() {
@@ -127,10 +136,12 @@ mod tests {
     }
 
     #[test]
-    fn judge_recipe_drives_claude_p() {
-        let recipe = claude_judge_dispatch_recipe(Some("--model"));
+    fn judge_recipe_runs_from_iteration_dir() {
+        let recipe = claude_judge_dispatch_recipe(Some("--model"), Path::new("/work/iter-1"));
         assert!(recipe.contains("claude -p"), "{recipe}");
         assert!(recipe.contains("judge-tasks.json"), "{recipe}");
         assert!(recipe.contains("response_path"), "{recipe}");
+        assert!(!recipe.contains("<eval-root>"), "{recipe}");
+        assert!(recipe.contains("cd \"/work/iter-1\" &&"), "{recipe}");
     }
 }
diff --git a/src/adapters/codex_cli.rs b/src/adapters/codex_cli.rs
@@ -1,6 +1,7 @@
 //! Codex CLI command rendering for `DispatchMechanism::Cli` guidance.
 
 use super::cli_command::render_cli_model_arg;
+use std::path::Path;
 
 /// Copy/pasteable Codex dispatch command template. Stdin is detached so a
 /// surrounding `xargs`/pipe cannot be treated as extra prompt context.
@@ -60,13 +61,32 @@ pub(crate) fn codex_parallel_dispatch_recipe(
     .join("\n")
 }
 
-pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool) -> String {
+/// Judge dispatch recipe over `judge-tasks.json`, one `codex exec` per task.
+///
+/// Judges run from `judge_cwd` (the iteration dir) — a common ancestor of every
+/// judge prompt, verdict `response_path`, and agent `outputs_dir`.
+///
+/// `judge_cwd` is rendered inside a double-quoted shell word, so the characters
+/// that stay special there (`\`, `"`, `$`, and backtick) are backslash-escaped.
+/// `guard` adds `--dangerously-bypass-hook-trust` to the rendered command.
+pub(crate) fn codex_judge_dispatch_recipe(
+    model_flag: Option<&str>,
+    guard: bool,
+    judge_cwd: &Path,
+) -> String {
     let hook_trust = if guard {
         " --dangerously-bypass-hook-trust"
     } else {
         ""
     };
     let model_flag = model_flag.unwrap_or("-m");
+    let cwd = judge_cwd
+        .display()
+        .to_string()
+        .replace('\\', "\\\\")
+        .replace('"', "\\\"")
+        .replace('$', "\\$")
+        .replace('`', "\\`");
     [
         "Dispatch each judge task from judge-tasks.json with:".to_string(),
         String::new(),
@@ -81,15 +89,15 @@ pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool)
         "    mkdir -p \"$(dirname \"$response_path\")\"".to_string(),
         "    if [ -n \"$model\" ]; then".to_string(),
         format!(
-            "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\"
+            "      codex --ask-for-approval never exec --cd \"{cwd}\" --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\"
         ),
         "        \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
         "        </dev/null \\".to_string(),
         "        > \"$response_base.codex-events.jsonl\" \\".to_string(),
         "        2> \"$response_base.codex-stderr.log\"".to_string(),
         "    else".to_string(),
         format!(
-            "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust} --json \\"
+            "      codex --ask-for-approval never exec --cd \"{cwd}\" --sandbox workspace-write{hook_trust} --json \\"
         ),
         "        \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
         "        </dev/null \\".to_string(),
@@ -107,6 +115,7 @@ mod tests {
     use super::{
         codex_exec_command_template, codex_judge_dispatch_recipe, codex_parallel_dispatch_recipe,
     };
+    use std::path::Path;
 
     #[test]
     fn exec_template_places_approval_policy_before_exec() {
@@ -133,19 +142,20 @@ mod tests {
 
     #[test]
     fn judge_recipe_places_approval_policy_before_exec() {
-        let recipe = codex_judge_dispatch_recipe(Some("-m"), true);
+        let recipe = codex_judge_dispatch_recipe(Some("-m"), true, Path::new("/work/iter-1"));
 
         assert!(
             recipe.contains(
-                "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\"
+                "      codex --ask-for-approval never exec --cd \"/work/iter-1\" --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\"
             ),
             "{recipe}"
         );
         assert!(
             recipe.contains(
-                "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust --json \\"
+                "      codex --ask-for-approval never exec --cd \"/work/iter-1\" --sandbox workspace-write --dangerously-bypass-hook-trust --json \\"
             ),
             "{recipe}"
         );
+        assert!(!recipe.contains("<eval-root>"), "{recipe}");
     }
 }
diff --git a/src/adapters/harness.rs b/src/adapters/harness.rs
@@ -120,7 +120,7 @@ pub trait HarnessAdapter {
     /// The post-`grade` / post-`ingest` judge dispatch guidance for a
     /// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness. `None`
     /// leaves the generic in-session-style judge handoff in place.
-    fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option<String> {
+    fn cli_judge_next_steps(&self, _ctx: CliJudgeContext<'_>) -> Option<String> {
         None
     }
 
@@ -195,8 +195,9 @@ pub struct CliManifestContext<'a> {
 
 /// Context for rendering a harness's one-shot CLI judge-dispatch guidance.
 #[derive(Debug, Clone, Copy)]
-pub struct CliJudgeContext {
+pub struct CliJudgeContext<'a> {
     pub guard: bool,
+    pub iteration_dir: &'a Path,
 }
 
 impl HarnessAdapter for ClaudeCodeAdapter {
@@ -261,8 +262,11 @@ impl HarnessAdapter for ClaudeCodeAdapter {
             String::new(),
         ])
     }
-    fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option<String> {
-        Some(claude_judge_dispatch_recipe(self.cli_model_flag()))
+    fn cli_judge_next_steps(&self, ctx: CliJudgeContext<'_>) -> Option<String> {
+        Some(claude_judge_dispatch_recipe(
+            self.cli_model_flag(),
+            ctx.iteration_dir,
+        ))
     }
     fn parse_transcript(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {
         parse_transcript(path)
@@ -350,10 +354,11 @@ impl HarnessAdapter for CodexAdapter {
             String::new(),
         ])
     }
-    fn cli_judge_next_steps(&self, ctx: CliJudgeContext) -> Option<String> {
+    fn cli_judge_next_steps(&self, ctx: CliJudgeContext<'_>) -> Option<String> {
         Some(codex_judge_dispatch_recipe(
             self.cli_model_flag(),
             ctx.guard,
+            ctx.iteration_dir,
         ))
     }
     fn parse_transcript(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {

diff --git a/src/cli/commands/pipeline.rs b/src/cli/commands/pipeline.rs
@@ -18,14 +18,19 @@ use crate::validation;
 
 const JUDGE_WORKER_PROMPT: &str = "Read the file at <dispatch_prompt_path> and follow it exactly. You are a judge worker only: write the JSON verdict to <response_path>, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.";
 
-fn judge_dispatch_guidance(ctx: &RunContext) -> String {
+fn judge_dispatch_guidance(ctx: &RunContext, iteration: u32) -> String {
+    let iteration_dir = ctx
+        .workspace_root
+        .join(&ctx.skill_name)
+        .join(format!("iteration-{iteration}"));
     match ctx.run_mode.mechanism() {
         DispatchMechanism::InSession => {
             format!("Dispatch each task as a judge subagent with:\n  {JUDGE_WORKER_PROMPT}")
         }
         DispatchMechanism::Cli => adapter_for(ctx.harness)
             .cli_judge_next_steps(CliJudgeContext {
                 guard: sandbox::guard_is_armed(&ctx.stage_root),
+                iteration_dir: &iteration_dir,
             })
             .unwrap_or_else(|| {
                 format!(
@@ -107,7 +112,7 @@ pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> {
         .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
         .and_then(|v| v.get("total_tasks").and_then(serde_json::Value::as_u64));
     let target_args = command_target_args(&ctx);
-    let judge_guidance = judge_dispatch_guidance(&ctx);
+    let judge_guidance = judge_dispatch_guidance(&ctx, iteration);
     match total_tasks {
         Some(0) => println!(
             "\n✅ Ingest complete — no judge dispatches needed.\nNext: eval-magic finalize{target_args} --iteration {iteration}"
@@ -351,15 +356,19 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> {
         pipeline::record_runs(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?;
 
     println!(
-        "\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, missing transcript: {}",
+        "\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, skipped (prompt unread): {}, missing transcript: {}",
         result.recorded,
         result.skipped_existing,
         result.skipped_no_final_message,
+        result.skipped_prompt_unread,
         result.missing_transcript
     );
     if let Some(warning) = result.transcript_warning(ctx.harness, mechanism) {
         eprintln!("{warning}");
     }
+    if let Some(warning) = result.prompt_unread_warning() {
+        eprintln!("{warning}");
+    }
     Ok(())
 }
 
@@ -502,7 +511,7 @@ pub(crate) fn run_grade(args: GradeArgs) -> anyhow::Result<()> {
             );
         }
         let target_args = command_target_args(&ctx);
-        let judge_guidance = judge_dispatch_guidance(&ctx);
+        let judge_guidance = judge_dispatch_guidance(&ctx, iteration);
         println!(
             "\nNext: {judge_guidance}\nThen run: eval-magic grade{target_args} --iteration {iteration} --finalize"
         );

diff --git a/src/cli/run/dispatch.rs b/src/cli/run/dispatch.rs
@@ -270,7 +270,7 @@ pub fn build_dispatch_task(opts: &DispatchTaskOpts) -> Result<DispatchTask, RunE
         run_record_path: cond_dir.join("run.json").to_string_lossy().into_owned(),
         timing_path: cond_dir.join("timing.json").to_string_lossy().into_owned(),
         agent_description,
-        dispatch_prompt_path: cond_dir
+        dispatch_prompt_path: Path::new(opts.outputs_dir)
             .join("dispatch-prompt.txt")
             .to_string_lossy()
             .into_owned(),
@@ -681,9 +681,11 @@ mod tests {
     }
 
     #[test]
-    fn dispatch_prompt_path_under_cond_dir() {
+    fn dispatch_prompt_path_under_outputs_dir() {
         let task = build_dispatch_task(&base_opts()).unwrap();
-        assert_eq!(task.dispatch_prompt_path, "/tmp/cond/dispatch-prompt.txt");
+        assert_eq!(task.dispatch_prompt_path, "/tmp/out/dispatch-prompt.txt");
+        assert_eq!(task.run_record_path, "/tmp/cond/run.json");
+        assert_eq!(task.timing_path, "/tmp/cond/timing.json");
     }
 
     const SAMPLE_DIRECTORY: &str = "## Active Skills Directory\n\n* **`test-driven-development`**\n  * *Trigger:* Use whenever implementing code.\n* **`systematic-debugging`**\n  * *Trigger:* Use when debugging.";

diff --git a/src/cli/run/orchestrate/build.rs b/src/cli/run/orchestrate/build.rs
@@ -173,13 +173,13 @@ pub(super) fn write_dispatch(
                     } else {
                         (cond_dir.join(format!("run-{run_idx}")), Some(run_idx))
                     };
-                    // Create the per-run meta dir (run.json / timing.json /
-                    // dispatch-prompt.txt), which lives above the env.
+                    // Create the per-run meta dir (run.json / timing.json), which
+                    // lives above the env.
                     fs::create_dir_all(&run_dir)?;
                     // The agent-under-test's cwd is its env, so its outputs land
-                    // *inside* it — never above its sandbox.
-                    // A hidden, per-(eval, condition, run) subtree keeps concurrent
-                    // same-env subagents from colliding.
+                    // *inside* it — never above its sandbox. A hidden,
+                    // per-(eval, condition, run) subtree keeps concurrent same-env
+                    // subagents from colliding.
                     let outputs_rel = match run_index {
                         None => format!("eval-{}/{cond_name}", ev.id),
                         Some(k) => format!("eval-{}/{cond_name}/run-{k}", ev.id),

diff --git a/src/cli/run/runbook.rs b/src/cli/run/runbook.rs
@@ -172,7 +172,10 @@ pub(crate) fn build_runbook(ctx: &RunbookContext) -> String {
                 agent_model: ctx.agent_model,
             });
             judge_recipe = adapter
-                .cli_judge_next_steps(CliJudgeContext { guard: ctx.guard })
+                .cli_judge_next_steps(CliJudgeContext {
+                    guard: ctx.guard,
+                    iteration_dir: ctx.iteration_dir,
+                })
                 .unwrap_or_else(|| {
                     "Dispatch each judge task `ingest` listed through the same harness CLI, \
                      capturing its transcript output, then finalize."