slowdini · slowdini · Jun 21, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 /target
 .DS_Store
+
+# eval-magic run artifacts (workspace root + per-env outputs) — churn every run
+.eval-magic/
+.eval-magic-outputs/
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "eval-magic"
-version = "0.3.4"
+version = "0.4.0"
 edition = "2024"
 description = "One-stop CLI for running skill evals — measure whether an agent skill actually shifts behavior."
 license = "MIT"

diff --git a/README.md b/README.md
diff --git a/docs/harness-parity.md b/docs/harness-parity.md
diff --git a/profiles/claude-code/runbook.md b/profiles/claude-code/runbook.md
@@ -0,0 +1,71 @@
+# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}})
+
+You are an agent in a **fresh, isolated** session. Follow this runbook top to bottom to run
+the eval and produce `benchmark.json`. Everything you need is in this iteration directory —
+you should not need anything from the surrounding repo.
+
+- **Skill under test:** {{SKILL_NAME}}
+- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}`
+- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`)
+
+The two conditions run as **separate batches** in this one session: dispatch every subagent of
+one batch, wait for them **all** to return, then switch conditions before dispatching the next.
+Never interleave the batches — `switch-condition` removes the off-condition's staged skill, and a
+subagent still in flight could observe a half-removed skill or read the wrong one.
+
+## 1. Dispatch the `{{COND_A}}` batch
+
+{{DISPATCH_COND_A}}
+
+Wait for **every** one of these subagents to return before continuing.
+
+## 2. Switch to the `{{COND_B}}` condition
+
+This removes the `{{COND_A}}` staged skill so the `{{COND_B}}` batch cannot read it:
+
+```
+{{SWITCH_CMD}}
+```
+
+## 3. Dispatch the `{{COND_B}}` batch
+
+{{DISPATCH_COND_B}}
+
+Wait for **every** one of these subagents to return before continuing.
+
+## 4. Ingest
+
+```
+{{INGEST_CMD}}
+```
+
+`ingest` records each run, backfills transcripts, scans for stray writes, and grades every
+mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself.
+
+## 5. Dispatch the judge subagents, then finalize
+
+Dispatch each judge task that `ingest` listed as its own subagent, the same way as the eval
+dispatches above — pass its `agent_description` verbatim — then merge the verdicts and aggregate:
+
+```
+{{FINALIZE_CMD}}
+```
+
+## 6. Read the result
+
+`finalize` writes the cross-condition benchmark to:
+
+```
+{{BENCHMARK_PATH}}
+```
+
+Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas. This is
+the artifact the prep session resumes on.
+
+## 7. Tear down
+
+When you are done, remove the staged skills (and the write guard, if armed):
+
+```
+{{TEARDOWN_CMD}}
+```
diff --git a/profiles/shared/runbook-headless.md b/profiles/shared/runbook-headless.md
@@ -0,0 +1,40 @@
+# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}}, {{HARNESS}})
+
+This runbook is for a human driving the run from a terminal. Work from this iteration directory
+and copy-paste each step. The workspace is self-contained — you should not need the surrounding
+repo.
+
+- **Skill under test:** {{SKILL_NAME}}
+- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}`
+- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`)
+
+## 1. Dispatch the eval agents, then ingest
+{{DISPATCH_RECIPE}}
+
+`ingest` records each run, backfills transcripts, scans for stray writes, and grades every
+mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself.
+
+## 2. Dispatch the judge agents, then finalize
+{{JUDGE_RECIPE}}
+
+Then merge the verdicts and aggregate:
+
+```
+{{FINALIZE_CMD}}
+```
+
+## 3. Read the result
+
+`finalize` writes the cross-condition benchmark to:
+
+```
+{{BENCHMARK_PATH}}
+```
+
+Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas.
+
+## 4. Tear down
+
+```
+{{TEARDOWN_CMD}}
+```
diff --git a/schema/evals.schema.json b/schema/evals.schema.json
@@ -48,6 +48,12 @@
           "minimum": 1,
           "description": "Runs per condition for this eval, for variance reduction; overrides the --runs flag. Defaults to the flag's value (1 unless raised)."
         },
+        "isolation": {
+          "type": "string",
+          "enum": ["shared", "isolated"],
+          "default": "shared",
+          "description": "Isolation hint for run batching. 'isolated' forces this eval into its own group so it never shares an env with another eval (for confounds the framework can't detect from fixture conflicts, e.g. the agent mutates a shared fixture). Defaults to 'shared'. Evals whose fixtures conflict are auto-isolated regardless."
+        },
         "skill_should_trigger": {
           "type": "boolean",
           "default": true,

diff --git a/src/adapters/claude_cli.rs b/src/adapters/claude_cli.rs
@@ -0,0 +1,136 @@
+//! Claude Code `claude -p` command rendering for `DispatchMechanism::Cli`
+//! guidance (hybrid / headless run modes).
+//!
+//! Differences from the Codex recipe, all forced by the `claude` CLI:
+//! `--output-format stream-json` requires `--verbose` in `-p` mode; there is no
+//! `--cd` flag, so the dispatch runs from the env dir (`cd <eval-root> &&`);
+//! and there is no `--output-last-message`, so the final message is recovered
+//! from the stream-json `result` event by the transcript adapter rather than
+//! written to a file. `</dev/null` detaches stdin so a permission prompt cannot
+//! block on a TTY and piped task data cannot become extra prompt context.
+
+use super::cli_command::render_cli_model_arg;
+
+/// Renders the copy/pasteable single-task `claude -p` dispatch command.
+///
+/// The result is five lines joined by `\n`, each but the last ending in a shell
+/// line continuation. `<eval-root>`, `<dispatch_prompt_path>` and `<outputs_dir>`
+/// are emitted as literal placeholders. `model_flag` and `agent_model` are passed
+/// to `render_cli_model_arg`; whatever it returns is appended directly after
+/// `--permission-mode acceptEdits`.
+pub(crate) fn claude_exec_command_template(
+    model_flag: Option<&str>,
+    agent_model: Option<&str>,
+) -> String {
+    let model_suffix = render_cli_model_arg(model_flag, agent_model);
+
+    // Opening line: enter the env dir, then launch claude in print mode.
+    let mut command = format!(
+        "cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits{model_suffix} \\"
+    );
+
+    // Continuation lines: the prompt, detached stdin, then stdout/stderr capture.
+    let continuations = [
+        "  \"Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response your closing summary.\" \\",
+        "  </dev/null \\",
+        "  > <outputs_dir>/claude-events.jsonl \\",
+        "  2> <outputs_dir>/claude-stderr.log",
+    ];
+    for continuation in continuations {
+        command.push('\n');
+        command.push_str(continuation);
+    }
+    command
+}
+
+/// Renders the shell recipe that fans `dispatch.json` tasks out in parallel,
+/// running one `claude -p` per task.
+///
+/// The recipe pipes each task's `eval_root`, `dispatch_prompt_path` and
+/// `outputs_dir` (tab-separated, NUL-terminated) from `jq` into `xargs -P "$JOBS"`,
+/// with `JOBS` defaulting to 4. `model_flag` and `agent_model` are passed to
+/// `render_cli_model_arg`; whatever it returns is appended directly after
+/// `--permission-mode acceptEdits`.
+pub(crate) fn claude_parallel_dispatch_recipe(
+    model_flag: Option<&str>,
+    agent_model: Option<&str>,
+) -> String {
+    let model_suffix = render_cli_model_arg(model_flag, agent_model);
+    // The only line that varies with the arguments; bound first so the script
+    // below can be a plain array of borrowed lines.
+    let launch = format!(
+        "    cd \"$eval_root\" && claude -p --output-format stream-json --verbose --permission-mode acceptEdits{model_suffix} \\"
+    );
+
+    let script: [&str; 13] = [
+        "JOBS=${JOBS:-4}",
+        "jq -j '.tasks[] | [.eval_root, .dispatch_prompt_path, .outputs_dir] | @tsv + \"\\u0000\"' dispatch.json | \\",
+        "  xargs -0 -P \"$JOBS\" -I{} sh -c '",
+        "    eval_root=\"$(printf \"%s\" \"$1\" | cut -f1)\"",
+        "    prompt_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"",
+        "    outputs_dir=\"$(printf \"%s\" \"$1\" | cut -f3)\"",
+        "    mkdir -p \"$outputs_dir\"",
+        launch.as_str(),
+        "      \"Read the file at $prompt_path and follow its instructions exactly. When you finish, make your final response your closing summary.\" \\",
+        "      </dev/null \\",
+        "      > \"$outputs_dir/claude-events.jsonl\" \\",
+        "      2> \"$outputs_dir/claude-stderr.log\"",
+        "  ' sh {}",
+    ];
+    script.join("\n")
+}
+
+/// Renders the judge dispatch guidance: a one-line lead-in followed by a fenced
+/// `bash` recipe that runs one `claude -p` per task in `judge-tasks.json`.
+///
+/// `model_flag` is the CLI flag used to pass a per-task model and falls back to
+/// `--model` when `None`. Inside the recipe the flag is only applied when the
+/// task's `.model` field is non-empty; otherwise `$model_arg` stays empty.
+/// `<eval-root>` is emitted as a literal placeholder.
+pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
+    let flag = model_flag.unwrap_or("--model");
+    // The only line that depends on the argument.
+    let model_assignment =
+        format!("    model_arg=\"\"; [ -n \"$model\" ] && model_arg=\"{flag} $model\"");
+
+    let guidance: [&str; 19] = [
+        "Dispatch each judge task from judge-tasks.json with:",
+        "",
+        "```bash",
+        "JOBS=${JOBS:-4}",
+        "jq -j '.tasks[] | [.dispatch_prompt_path, .response_path, (.model // \"\")] | @tsv + \"\\u0000\"' judge-tasks.json | \\",
+        "  xargs -0 -P \"$JOBS\" -I{} sh -c '",
+        "    prompt_path=\"$(printf \"%s\" \"$1\" | cut -f1)\"",
+        "    response_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"",
+        "    model=\"$(printf \"%s\" \"$1\" | cut -f3)\"",
+        "    response_base=\"${response_path%.json}\"",
+        "    mkdir -p \"$(dirname \"$response_path\")\"",
+        model_assignment.as_str(),
+        "    cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\",
+        "      \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\",
+        "      </dev/null \\",
+        "      > \"$response_base.claude-events.jsonl\" \\",
+        "      2> \"$response_base.claude-stderr.log\"",
+        "  ' sh {}",
+        "```",
+    ];
+    guidance.join("\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{
+        claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe,
+    };
+
+    /// The single-task template must carry every flag and redirect the `claude`
+    /// CLI needs, and none of the Codex-only pieces.
+    #[test]
+    fn exec_template_carries_required_stream_json_flags() {
+        let cmd = claude_exec_command_template(Some("--model"), None);
+
+        let required = [
+            "claude -p",
+            "--output-format stream-json",
+            // stream-json requires --verbose in -p mode.
+            "--verbose",
+            "--permission-mode acceptEdits",
+            "> <outputs_dir>/claude-events.jsonl",
+            "2> <outputs_dir>/claude-stderr.log",
+            "</dev/null",
+            // claude has no --cd flag; the dispatch runs from the env dir.
+            "cd <eval-root>",
+            "<dispatch_prompt_path>",
+        ];
+        for needle in required {
+            assert!(cmd.contains(needle), "{cmd}");
+        }
+
+        // claude has no --output-last-message; final text comes from the result event.
+        let forbidden = ["--output-last-message", "final-message.md"];
+        for needle in forbidden {
+            assert!(!cmd.contains(needle), "{cmd}");
+        }
+    }
+
+    /// A model argument appears only when an agent model is supplied.
+    #[test]
+    fn exec_template_includes_model_only_when_declared() {
+        let declared = claude_exec_command_template(Some("--model"), Some("opus"));
+        assert!(declared.contains("--model opus"), "{declared}");
+
+        let undeclared = claude_exec_command_template(Some("--model"), None);
+        assert!(!undeclared.contains("--model "), "{undeclared}");
+    }
+
+    /// The parallel recipe reads `dispatch.json` and runs `claude -p` with the model.
+    #[test]
+    fn parallel_recipe_drives_claude_p_per_task() {
+        let recipe = claude_parallel_dispatch_recipe(Some("--model"), Some("sonnet"));
+        for needle in [
+            "claude -p",
+            "claude-events.jsonl",
+            "dispatch.json",
+            "--model sonnet",
+        ] {
+            assert!(recipe.contains(needle), "{recipe}");
+        }
+    }
+
+    /// The judge recipe reads `judge-tasks.json` and runs `claude -p`.
+    #[test]
+    fn judge_recipe_drives_claude_p() {
+        let recipe = claude_judge_dispatch_recipe(Some("--model"));
+        for needle in ["claude -p", "judge-tasks.json", "response_path"] {
+            assert!(recipe.contains(needle), "{recipe}");
+        }
+    }
+}
diff --git a/src/adapters/claude_code_transcript.rs b/src/adapters/claude_code_transcript.rs
@@ -16,11 +16,11 @@ use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 
 #[derive(Debug, Deserialize)]
-struct UsageRecord {
-    input_tokens: Option<i64>,
-    output_tokens: Option<i64>,
-    cache_creation_input_tokens: Option<i64>,
-    cache_read_input_tokens: Option<i64>,
+pub(crate) struct UsageRecord {
+    pub(crate) input_tokens: Option<i64>,
+    pub(crate) output_tokens: Option<i64>,
+    pub(crate) cache_creation_input_tokens: Option<i64>,
+    pub(crate) cache_read_input_tokens: Option<i64>,
 }
 
 #[derive(Debug, Deserialize)]
@@ -32,9 +32,9 @@ struct Message {
 }
 
 #[derive(Debug, Deserialize)]
-struct TranscriptRecord {
+pub(crate) struct TranscriptRecord {
     #[serde(rename = "type")]
-    record_type: Option<String>,
+    pub(crate) record_type: Option<String>,
     timestamp: Option<String>,
     message: Option<Message>,
 }
@@ -83,7 +83,7 @@ fn stringify_result(content: Option<&Value>) -> String {
     }
 }
 
-fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
+pub(crate) fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
     let raw = fs::read_to_string(jsonl_path)?;
     let mut records = Vec::new();
     for line in raw.split('\n') {
@@ -98,7 +98,7 @@ fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
     Ok(records)
 }
 
-fn extract_invocations(records: &[TranscriptRecord]) -> Vec<ToolInvocation> {
+pub(crate) fn extract_invocations(records: &[TranscriptRecord]) -> Vec<ToolInvocation> {
     let mut invocations: Vec<ToolInvocation> = Vec::new();
     let mut index_by_id: HashMap<String, usize> = HashMap::new();
 
@@ -151,6 +151,27 @@ pub fn parse_transcript(jsonl_path: &Path) -> io::Result<Vec<ToolInvocation>> {
     Ok(extract_invocations(&read_records(jsonl_path)?))
 }
 
+/// Returns the text of the last `assistant` record that contains any text block,
+/// with that record's text blocks joined by `\n`.
+///
+/// Records whose type is not `assistant`, and assistant records with no text
+/// blocks, are skipped. Returns `None` when no record qualifies. Shared with the
+/// `-p` stream-json parser, which uses it as the final-message fallback when the
+/// terminal `result` event is absent or errored.
+pub(crate) fn last_assistant_text(records: &[TranscriptRecord]) -> Option<String> {
+    // Walk backwards so the first qualifying record found is the last one in
+    // the transcript.
+    records
+        .iter()
+        .rev()
+        .filter(|record| record.record_type.as_deref() == Some("assistant"))
+        .find_map(|record| {
+            let pieces: Vec<&str> = content_blocks(&record.message)
+                .iter()
+                .filter_map(|block| {
+                    let is_text = block.get("type").and_then(Value::as_str) == Some("text");
+                    if is_text {
+                        block.get("text").and_then(Value::as_str)
+                    } else {
+                        None
+                    }
+                })
+                .collect();
+            (!pieces.is_empty()).then(|| pieces.join("\n"))
+        })
+}
+
 /// A transcript boiled down to the artifacts the pipeline needs.
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct TranscriptSummary {
@@ -179,7 +200,6 @@ pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result<TranscriptSummary>
     let mut first_ts: Option<i64> = None;
     let mut last_ts: Option<i64> = None;
     let mut timestamp_count = 0usize;
-    let mut final_text: Option<String> = None;
 
     for record in &records {
         if let Some(ts_str) = &record.timestamp
@@ -201,17 +221,10 @@ pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result<TranscriptSummary>
         {
             usage_by_id.insert(id, usage);
         }
-
-        let texts: Vec<&str> = content_blocks(&record.message)
-            .iter()
-            .filter(|b| b.get("type").and_then(Value::as_str) == Some("text"))
-            .filter_map(|b| b.get("text").and_then(Value::as_str))
-            .collect();
-        if !texts.is_empty() {
-            final_text = Some(texts.join("\n"));
-        }
     }
 
+    let final_text = last_assistant_text(&records);
+
     let total_tokens = if usage_by_id.is_empty() {
         None
     } else {