diff --git a/.gitignore b/.gitignore
index 0592392..24b97b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 /target
 .DS_Store
+
+# eval-magic run artifacts (workspace root + per-env outputs) — churn every run
+.eval-magic/
+.eval-magic-outputs/
diff --git a/Cargo.lock b/Cargo.lock
index 31f8217..dcc9b44 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -275,7 +275,7 @@ dependencies = [
 
 [[package]]
 name = "eval-magic"
-version = "0.3.4"
+version = "0.4.0"
 dependencies = [
  "anyhow",
  "assert_cmd",
diff --git a/Cargo.toml b/Cargo.toml
index 2b2ed8a..8c0b826 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "eval-magic"
-version = "0.3.4"
+version = "0.4.0"
 edition = "2024"
 description = "One-stop CLI for running skill evals — measure whether an agent skill actually shifts behavior."
 license = "MIT"
diff --git a/README.md b/README.md
index 4edb9d2..4d8ec07 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ cargo build --release          # binary at target/release/eval-magic
 
 ## How an eval works
 
-For each test case, the runner sets up two conditions and your agent dispatches a fresh subagent into each, with clean context:
+For each test case, the runner sets up two conditions, and a fresh subagent runs each one with clean context. *How* that subagent is dispatched (in-session vs. one-shot CLI) is the run-mode axis covered under [Harnesses](#harnesses). Which two conditions are compared depends on the mode:
 
 - **Mode A — new skill:** `with_skill` vs `without_skill`. Validates a brand-new skill beats baseline behavior with no skill loaded.
 - **Mode B — revision (the common case):** `old_skill` vs `new_skill`. Tests a language change to an existing skill — you snapshot the old `SKILL.md`, then run both variants against the same prompts. A negative or zero `delta.pass_rate` is a signal to revert.
@@ -85,24 +85,35 @@ environment.
 ### Mode A — new skill (with vs. without)
 
 ```bash
-# 1. Build the iteration workspace (arm --guard — see Cost & confirmation).
+# 1. Build the iteration's isolated env (arm --guard — see Cost & confirmation).
+#    run stages skills into .eval-magic/my-skill/iteration-1/env/, copies
+#    fixtures in, and writes RUNBOOK.md. It does NOT dispatch — it prints a handoff.
 #    Add --runs <N> to dispatch every eval N times per condition for variance
 #    reduction (a per-eval "runs" field in evals.json overrides the flag).
 eval-magic run --guard
 
-# 2. Your agent dispatches each task in skills-workspace/my-skill/iteration-1/dispatch.json
-#    as a fresh subagent (each reads its dispatch_prompt_path and follows it).
+# 2. Enter the isolated env and follow the runbook. cd into iteration-1/env/ and
+#    start a fresh agent session there (interactive Claude Code: the staged skills
+#    must be present at session start), then say: "Read and follow RUNBOOK.md".
+#    That session drives the whole loop below — dispatch → switch-condition →
+#    ingest → finalize — and writes benchmark.json into iteration-1/. (Headless:
+#    you, a human, follow the same RUNBOOK.md top to bottom; hybrid: the session
+#    shells out a `claude -p` / `codex exec` recipe per task.) See Claude Code
+#    below for the plugin-isolation and transcript specifics.
 
-# 3. Assemble records, detect stray writes, grade. ingest auto-resolves the
-#    subagents dir from CLAUDE_CODE_SESSION_ID; outside that session pass
-#    --session-id <id> or --subagents-dir <path>.
+# Steps 3–5 are driven from inside the runbook — shown here for reference:
+
+# 3. ingest assembles records, detects stray writes, and grades, stopping at the
+#    judge hand-off. In-session it auto-resolves transcripts from
+#    CLAUDE_CODE_SESSION_ID; hybrid/headless read each task's events file instead.
 eval-magic ingest
 
 # 4. Dispatch the judge tasks ingest lists, then finalize. If --guard is still
 #    armed, finalize reminds you to run teardown-guard before editing source.
 eval-magic finalize
 
-# 5. Read skills-workspace/my-skill/iteration-1/benchmark.json, then clean up:
+# 5. Read .eval-magic/my-skill/iteration-1/benchmark.json (the prep session
+#    resumes here), then clean up:
 eval-magic teardown
 ```
 
@@ -120,20 +131,35 @@ If you snapshot *before* editing, omit `--ref` (it then reads the working tree)
 
 ## The run loop
 
-A run is one canonical workflow, the same in both modes:
+A run is one canonical workflow. `run` *prepares* an isolated env and hands off; a session entered in that env drives the rest of the loop to `benchmark.json`:
 
 ```
-run  →  dispatch agents  →  ingest  →  dispatch judges  →  finalize  →  teardown
+run (prepare env/ + RUNBOOK.md)
+  └─► [in env/, runbook-driven] dispatch batch A → switch-condition → dispatch batch B
+        → ingest → dispatch judges → finalize  ──►  benchmark.json
+teardown
 ```
 
-1. **`run`** builds the iteration workspace, snapshots the `SKILL.md`, stages skills, and emits `dispatch.json` (machine-readable) alongside `dispatch-manifest.md` (human-readable).
-2. **Dispatch agents.** Read `dispatch.json`. Each task object points at a `dispatch_prompt_path` (the full prompt lives in a file so you never reproduce kilobytes inline), an `agent_description` to pass through *verbatim* as the dispatch description, and the exact `run_record_path` / `timing_path`. For each task, dispatch a fresh subagent told to read the file at `dispatch_prompt_path` and follow it exactly. The `agent_description` is namespaced with the iteration and a per-run nonce (`<eval_id>:<condition>[:r<k>]:i<N>-<nonce>`; the `r<k>` segment appears only in multi-run cells, see `run --help` on `--runs`) — passing it through unchanged is what lets transcripts correlate to runs.
-3. **`ingest`** (a fixed-order chain: record-runs → fill-transcripts → detect-stray-writes → grade) assembles each task's `run.json` and `timing.json` from `dispatch.json` + the subagent's `outputs/final-message.md` + the persisted transcript, scans for stray writes, and grades the `transcript_check` assertions. It stops at the judge hand-off, listing a judge task per `llm_judge` assertion.
-4. **Dispatch judges.** Same pattern as step 2: dispatch a fresh subagent for each judge task to read its prompt file and write its verdict back.
-5. **`finalize`** (grade `--finalize` → aggregate) merges the judge verdicts and writes `benchmark.json`. Read it. If a `--guard` marker is still live, it also reminds you to run `teardown-guard` before editing source.
-6. **`teardown`** disarms the guard, removes the staged skill set, and reclaims the workspace artifacts that are safe to delete.
+1. **`run` prepares — it does not dispatch.** It builds the iteration workspace (`iteration-N/`), snapshots the `SKILL.md`, stages skills into the isolated env `iteration-N/env/` (the agent's cwd), copies fixtures in so it reads like a real repo, emits `dispatch.json` (machine-readable) alongside `dispatch-manifest.md` (human-readable), and writes `RUNBOOK.md` into `env/`. Then it prints a handoff, not a dispatch.
+2. **Enter the isolated env.** `cd` into `iteration-N/env/`, begin a run session there, and say *Read and follow `RUNBOOK.md`*. How you enter differs by run mode (see [Run modes](#run-modes)):
+   - **Interactive (Claude Code):** start a *fresh* Claude Code session in `env/` so the staged skills are present at session start; it dispatches in-session subagents and runs the rest of the loop itself.
+   - **Hybrid (Claude Code / Codex):** an orchestrating session follows `RUNBOOK.md`, shelling out a `claude -p` / `codex exec` recipe per task.
+   - **Headless (Claude Code / Codex):** no session — a human follows the same `RUNBOOK.md`, pasting each recipe and command top to bottom.
+3. **Dispatch agents (runbook-driven).** Read `dispatch.json`. Each task object points at a `dispatch_prompt_path` (the full prompt lives in a file so you never reproduce kilobytes inline), an `agent_description` to pass through *verbatim* as the dispatch description, and the exact `run_record_path` / `timing_path`. For each task, dispatch a fresh subagent told to read the file at `dispatch_prompt_path` and follow it exactly. The `agent_description` is namespaced with the iteration and a per-run nonce (`<eval_id>:<condition>[:r<k>]:i<N>-<nonce>`; the `r<k>` segment appears only in multi-run cells, see `run --help` on `--runs`) — passing it through unchanged is what lets transcripts correlate to runs.
+4. **`switch-condition` between condition batches.** Conditions run as sequential batches, never interleaved. After joining *all* of the first batch's subagents, run `eval-magic switch-condition --condition <next>` to remove the off-condition's staged skill from `env/.claude/skills/`, so the next batch can't read it — the read-isolation barrier. When a run has more than one **isolation group** (see below), the runbook also dispatches each group as its own batch and runs `eval-magic reset-batch --group <g>` between groups — it wipes the shared `env/` working tree and re-seeds it with that group's fixtures, the per-group isolation barrier. The runbook spells out the exact `switch-condition` / `reset-batch` sequence; you never work it out yourself.
+5. **`ingest`** (a fixed-order chain: record-runs → fill-transcripts → detect-stray-writes → grade), run from inside `env/`, assembles each task's `run.json` and `timing.json` from `dispatch.json` + the subagent's `outputs/final-message.md` + the persisted transcript, scans for stray writes, and grades the `transcript_check` assertions. It stops at the judge hand-off, listing a judge task per `llm_judge` assertion.
+6. **Dispatch judges.** Same pattern as step 3: dispatch a fresh subagent for each judge task to read its prompt file and write its verdict back.
+7. **`finalize`** (grade `--finalize` → aggregate) merges the judge verdicts and writes `benchmark.json` into `iteration-N/`, *above* `env/`. Read it. If a `--guard` marker is still live, it also reminds you to run `teardown-guard` before editing source.
+8. **`teardown`** disarms the guard, removes the staged skill set, and reclaims the workspace artifacts that are safe to delete.
+
+The chains run in-process and stop at the first failure; re-running after a fix is safe — every sub-step skips work that's already done. The individual steps (`record-runs`, `fill-transcripts`, `detect-stray-writes`, `grade`, `aggregate`) remain callable for inspection or recovery. Under the `Cli` mechanism (Claude Code hybrid/headless, Codex), the per-task dispatch recipe lives in `RUNBOOK.md` and `ingest` reads each task's events file (`claude-events.jsonl` / `codex-events.jsonl`) instead of an in-session transcript; un-wired harnesses still write records by hand until their adapters land.
+
+### Isolation grouping (which agents batch together)
 
-The chains run in-process and stop at the first failure; re-running after a fix is safe — every sub-step skips work that's already done. The individual steps (`record-runs`, `fill-transcripts`, `detect-stray-writes`, `grade`, `aggregate`) remain callable for inspection or recovery. Codex uses the same chain with `--harness codex` after each task captures `outputs/codex-events.jsonl`; un-wired harnesses still write records by hand until their adapters land.
+`run` decides at **setup** time which evals can share an environment and which need their own, writes the plan into `dispatch.json` (a `groups[]` summary plus a per-task `group`/`eval_root`), and the runbook follows it — the executing session does no isolation reasoning itself. By default every eval shares one group (the pre-existing behavior, unchanged). Two things create a separate group: evals whose fixtures would clobber each other, and an eval that opts out explicitly with `"isolation": "isolated"` in `evals.json` (use it when an eval's agent *mutates* a fixture another eval reads). How groups are realized depends on the run mode:
+
+- **Interactive (in-session):** one `env/`; groups are dispatched as sequential batches with a `reset-batch` barrier between them.
+- **Hybrid / headless (Cli):** one env per `(group, condition)` — `iteration-N/env-<group>-<condition>/` — so each subprocess `cd`s into a fully-isolated cwd. This also gives the Cli path real per-condition isolation: the control arm's env contains no skill at all.
 
 ## Cost & confirmation
 
@@ -175,7 +201,7 @@ Read `validity_warnings` **before** trusting any delta — a low skill-invocatio
 Per skill being evaluated, the runner produces this tree (everything but `evals/evals.json` is generated):
 
 ```
-skills-workspace/<skill>/                # outside the skill directory, gitignore it
+.eval-magic/<skill>/                     # outside the skill directory, gitignore it
   snapshots/                             # Mode B baselines, persist across iterations
     <label>/SKILL.md
   iteration-N/
@@ -202,7 +228,7 @@ independently and the benchmark's per-condition `mean`/`stddev`/`n` cover all of
         run-2/  outputs/  run.json  timing.json  grading.json
 ```
 
-The only source file you author for evals is `<skill>/evals/evals.json` (or create it with `eval-magic init`). Keep `skills-workspace/` out of version control — it churns on every run. Snapshot retention is manual: delete `<workspace>/<skill>/snapshots/<label>/` when no longer needed.
+The only source file you author for evals is `<skill>/evals/evals.json` (or create it with `eval-magic init`). Keep `.eval-magic/` out of version control — it churns on every run. Snapshot retention is manual: delete `<workspace>/<skill>/snapshots/<label>/` when no longer needed.
 
 ## Version-controlled baselines
 
@@ -252,8 +278,8 @@ Support today:
 
 | Harness | Headless | Fully interactive | Hybrid |
 |---------|:--------:|:-----------------:|:------:|
-| **Claude Code** | ❌ not yet | ✅ | ❌ not yet |
-| **Codex** | ❌ not yet | ❔ likely N/A¹ | ✅ |
+| **Claude Code** | ✅ | ✅ | ✅ |
+| **Codex** | ✅ | ❔ likely N/A¹ | ✅ |
 | **OpenCode** | ❌ | ❌ | ❌² |
 
 ¹ Codex dispatches via subprocess (`codex exec`), not in-session subagents, so a "fully interactive" Codex mode may not translate. ² OpenCode foundational harness support is wired: `--harness opencode` stages skills under `.opencode/skills/` and emits native dispatch prompts. Transcript ingest, auto-record, and `--guard` are pending.
@@ -264,34 +290,48 @@ Support today:
 - **Claude Code, headless / hybrid** (`claude -p`) — same token-based pricing, but on **subscription plans, starting June 15 2026**, `claude -p` (Agent SDK) usage draws from a **separate monthly Agent SDK credit pool**, distinct from interactive limits. Headless JSON output exposes `total_cost_usd` per invocation, so the runner can record per-task cost — something the in-session Task-tool path can't easily capture.
 - **Codex, hybrid** (`codex exec`) — billed under your Codex usage.
 
-**Intended end state:** all three modes first-class on Claude Code and Codex (where the mode translates), and OpenCode wired as a third harness. Progress is tracked in [GitHub issues](https://github.com/slowdini/eval-magic/issues).
+**Intended end state:** all three modes first-class on Claude Code and Codex (where the mode translates), and OpenCode wired as a third harness. As of this release the first part is reached; wiring OpenCode is what remains. Progress is tracked in [GitHub issues](https://github.com/slowdini/eval-magic/issues).
 
 ### Claude Code (fully wired)
 
-The run loop above *is* the Claude Code loop. Today this is the **fully-interactive** run mode (see [Run modes](#run-modes)) — subagents are dispatched in-session via the Task tool; the **headless** and **hybrid** (`claude -p`) modes are not yet wired. These are the Claude-Code-specific details:
+The run loop above *is* the Claude Code loop. By default this is the **fully-interactive** run mode (see [Run modes](#run-modes)) — subagents are dispatched in-session via the Task tool; the **hybrid** and **headless** (`claude -p`) modes are now wired too (pass `--run-mode hybrid` or `--run-mode headless`, see below). `eval-magic run` itself only *prepares* the isolated env (`.eval-magic/<skill>/iteration-N/env/`) and writes `RUNBOOK.md` into it, then prints a handoff: `cd` into `env/`, start a **fresh** Claude Code session there, and say *Read and follow RUNBOOK.md*. That fresh session — clean cwd, staged skills present at session start — drives the whole dispatch → switch-condition → ingest → finalize loop and writes `benchmark.json`; the prep session then resumes from that file. These are the Claude-Code-specific details:
 
-**Isolating from installed plugins.** Read this first if the skill you're evaluating shares a name with one an installed, enabled plugin provides. Subagents are dispatched via the **Task tool**, so they inherit *this session's* enabled plugins — the staging slug avoids an on-disk collision but does not stop the installed copy from being discoverable, contaminating both arms (the `without_skill` arm is then not truly skill-absent). Plugins load at session start and can't be unloaded mid-session, so the runner only *detects and warns* (the plugin-shadow banner). To actually isolate, launch the session you run the eval from one of these ways — subagents inherit it:
+**Isolating from installed plugins.** Read this first if the skill you're evaluating shares a name with one an installed, enabled plugin provides. Subagents are dispatched via the **Task tool**, so they inherit *this session's* enabled plugins — the staging slug avoids an on-disk collision but does not stop the installed copy from being discoverable, contaminating both arms (the `without_skill` arm is then not truly skill-absent). Plugins load at session start and can't be unloaded mid-session, so the runner only *detects and warns* (the plugin-shadow banner). The isolated env gives a clean *cwd* but does not unload user/global plugins, so this still applies. To actually isolate, launch the **fresh session you start in `env/`** one of these ways — subagents inherit it:
 
 - **Drop user-scope plugins, keep auth:** `claude --setting-sources project,local`. User-scope `enabledPlugins` isn't loaded; auth is unaffected.
 - **Disable the specific plugin, then restart:** set `"enabledPlugins": { "<plugin>@<marketplace>": false }` in a settings source that loads at startup, and start a fresh session.
 - **Clean config dir (strips everything):** `CLAUDE_CONFIG_DIR="$(mktemp -d)" claude`. No installed plugins or global skills load at all. Auth caveat: OAuth lives in `~/.claude.json`, which a relocated config dir may not carry — set `ANTHROPIC_API_KEY` or re-authenticate once in the fresh dir.
 
-Project-local staged skills live in `<cwd>/.claude/skills/`, independent of installed plugins, so they still load and the meta-check still resolves the slug under all three.
+Project-local staged skills live in the isolated env at `env/.claude/skills/`, independent of installed plugins, so they still load and the meta-check still resolves the slug under all three.
 
-**Same-session staging gotcha.** Claude Code applies *live change detection* to skill directories that existed when the session started, so whether subagents discover the *mid-session*-staged eval skills hinges on one question: did `<cwd>/.claude/skills/` already exist when your session started? If it did, the staged skills are surfaced in-session and subagents dispatched afterward discover them (a freshly-staged skill can lag the watcher by a moment). If `run` had to *create* `.claude/skills/`, that new top-level directory isn't watched until the session re-scans — a restart, or a plugin reload / other refresh event — so until then subagents won't discover the staged copies and every with-skill arm falls back. `run` detects which case applies and prints either a confirmation note or the actionable warning. To guarantee discovery either way: restart (or start a *fresh* session) once `.claude/skills/` exists, or run with `--no-stage` (each `SKILL.md` is inlined into its dispatch prompt, so there is no staged discovery to miss). Regardless, run `detect-stray-writes` (folded into `ingest`) before trusting a staged result.
+**Discovery is structural now.** Claude Code only watches skill directories that existed when the session started. Because `eval-magic run` builds `env/.claude/skills/` *before* you start the fresh session in `env/`, the staged skills are present at session start and discovered normally — there is no mid-session staging, so the old "did the dir exist when your session started?" hazard (and the build-time warning it once printed) no longer applies. `--no-stage` remains for harnesses without project-local skill discovery: each `SKILL.md` is inlined into its dispatch prompt instead of staged. Regardless, run `detect-stray-writes` (folded into `ingest`) before trusting a result.
 
 **Where transcripts live.** Claude Code persists subagent transcripts under `~/.claude/projects/<project-slug>/<parent-session-id>/subagents/`. `ingest` auto-resolves this from the `CLAUDE_CODE_SESSION_ID` the orchestrating session exports (deriving `<project-slug>` from the cwd and scanning `projects/*` if needed), so you normally don't pass `--subagents-dir` at all. When running outside that session — or to target a past session — pass `--session-id <id>`, or override the lookup entirely with `--subagents-dir <path>`. Besides out-of-bounds writes, `detect-stray-writes` also flags **live-source reads**: an arm whose subagent read the live skill source instead of its staged copy. That usually means the Skill tool couldn't resolve the staged slug yet and the agent improvised — fatal in revision mode, where the `old_skill` arm then sees new-skill content. Treat a flagged cell's arm as contaminated.
 
 **Dispatching via the Task tool.** `dispatch.json` is a top-level object (`{ skill_name, iteration, run_nonce, …, tasks: [...] }`); iterate `tasks[]`. For each task, dispatch a fresh subagent via the Task tool with the prompt `Read the file at <dispatch_prompt_path> and follow its instructions exactly.` (substituting the task's `dispatch_prompt_path`), and pass `agent_description` *verbatim* as the description — it's namespaced `<eval_id>:<condition>:i<N>-<nonce>`, and passing it unchanged is what lets transcript correlation work. (The Task tool documents `description` as "short", but pass the full string regardless — correlation depends on the exact value.) You do **not** write `run.json`/`timing.json` yourself; the subagent writes `outputs/final-message.md`, and `ingest` (`record-runs`) assembles both records from disk. For a plan-mode-relevant skill, add `--plan-mode` to inject Claude Code's verbatim plan-mode procedure as a `<system-reminder>` operating-context layer.
 
+**Hybrid mode (`--run-mode hybrid`).** Pass `--run-mode hybrid` to dispatch each task through the `claude -p` one-shot CLI instead of in-session subagents — the same shape as Codex's hybrid flow, where an agent session orchestrates while each test/judge shells out to the CLI. `run` then prints (and `dispatch-manifest.md` / `RUNBOOK.md` carry) a `claude -p` recipe per task:
+
+```bash
+cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits \
+  "Read the file at <dispatch_prompt_path> and follow its instructions exactly. …" \
+  </dev/null \
+  > <outputs_dir>/claude-events.jsonl \
+  2> <outputs_dir>/claude-stderr.log
+```
+
+Three details differ from Codex's `codex exec`: `--output-format stream-json` **requires `--verbose`** in `-p` mode; `claude` has **no `--cd` flag**, so each dispatch must run from the env dir (`cd <eval-root> &&`) — staged-skill discovery is cwd-relative, so getting this wrong makes the `with_skill` arm behave like `without_skill`; and there is **no `--output-last-message`**, so the final message is recovered from the stream-json `result` event rather than a file. Detach stdin with `</dev/null` so a permission prompt can't block on a TTY. Then `eval-magic ingest --harness claude-code --run-mode hybrid` reads each task's `outputs/claude-events.jsonl` (the `-p` stream-json transcript) to populate `tool_invocations`, tokens, duration, and the final message. `--run-mode` is recorded in `conditions.json`; pass it to each post-dispatch command (the printed next-step commands already carry it). `--guard` works under hybrid and headless too: the `PreToolUse` hook is staged in `env/.claude/settings.local.json`, and because each `claude -p` dispatch runs from `env/` (`cd <eval-root>`), it loads and enforces the hook exactly as an interactive session would (the recipe never passes `--bare`, which would skip hook discovery). A deny aborts the offending dispatch; `detect-stray-writes` (folded into `ingest`) remains the after-the-fact backstop.
+
+**Headless mode (`--run-mode headless`).** The same `claude -p` dispatch as hybrid, but no agent session drives the loop — you (a human) paste each `eval-magic` command and the `claude -p` recipe yourself, ending in a written `benchmark.json`. `run` writes the same human-followed `RUNBOOK.md` (the shared headless template) into `env/`; work from that directory and copy-paste top to bottom: dispatch the tests → `ingest` → dispatch the judges → `finalize` → read the result → `teardown`. Pass `--run-mode headless` to every command of the run (the printed next steps and the runbook already carry it). `--guard` behaves exactly as it does under hybrid.
+
 ### Codex
 
-Codex's `codex exec --json` flow is the **hybrid** run mode today (see [Run modes](#run-modes)) — an agent session orchestrates while each dispatch shells out to the CLI. A fully **headless** Codex mode (eval-magic driving the whole loop with no session) is not yet wired, and a **fully-interactive** mode likely doesn't translate, since Codex dispatches via subprocess rather than in-session subagents.
+Codex's `codex --ask-for-approval never exec --json` flow powers both CLI run modes (see [Run modes](#run-modes)): **hybrid** — an agent session orchestrates while each dispatch shells out to the CLI — and **headless** (`--run-mode headless`), where no agent session drives the loop: `run` writes the same human-followed `RUNBOOK.md`, and a human pastes each recipe and command from it. A **fully-interactive** mode likely doesn't translate, since Codex dispatches via subprocess rather than in-session subagents.
 
-Pass `--harness codex`: skills stage under repo-local `.agents/skills/` (the staged skill-under-test's frontmatter `name:` is rewritten to the eval slug so Codex's repo-local discovery sees it), and `conditions.json` / `dispatch.json` record `"harness": "codex"`. Dispatch each task with a fresh `codex exec --json` execution, capturing the event stream:
+Pass `--harness codex`: skills stage under repo-local `.agents/skills/` (the staged skill-under-test's frontmatter `name:` is rewritten to the eval slug so Codex's repo-local discovery sees it), and `conditions.json` / `dispatch.json` record `"harness": "codex"`. Dispatch each task with a fresh `codex --ask-for-approval never exec --json` execution, capturing the event stream:
 
 ```bash
-codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never --json \
+codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --json \
   --output-last-message <outputs_dir>/final-message.md \
   "Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response exactly the same text you wrote to <outputs_dir>/final-message.md." \
   </dev/null \
@@ -302,7 +342,7 @@ codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never -
 When `run --agent-model <id>` is set, the generated Codex recipes insert `-m <id>` before `--json`:
 
 ```bash
-codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never -m <agent-model> --json \
+codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write -m <agent-model> --json \
   --output-last-message <outputs_dir>/final-message.md \
   "Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response exactly the same text you wrote to <outputs_dir>/final-message.md." \
   </dev/null \
@@ -310,10 +350,10 @@ codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never -
   2> <outputs_dir>/codex-stderr.log
 ```
 
-When the run was armed with `--guard`, add `--dangerously-bypass-hook-trust` to that `codex exec` command so the vetted project-local `PreToolUse` hook staged in `.codex/hooks.json` actually runs:
+When the run was armed with `--guard`, add `--dangerously-bypass-hook-trust` to that `codex --ask-for-approval never exec` command so the vetted project-local `PreToolUse` hook staged in `.codex/hooks.json` actually runs:
 
 ```bash
-codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never --dangerously-bypass-hook-trust --json \
+codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust --json \
   --output-last-message <outputs_dir>/final-message.md \
   "Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response exactly the same text you wrote to <outputs_dir>/final-message.md." \
   </dev/null \
diff --git a/docs/harness-parity.md b/docs/harness-parity.md
index a977c7a..15be4bf 100644
--- a/docs/harness-parity.md
+++ b/docs/harness-parity.md
@@ -14,7 +14,7 @@ Parity is organized around **run mode** — *how* an eval is dispatched — as t
    - **`InSession`** — the runner hands tasks to in-session subagents (Claude Code's Task tool). **Claude Code** is the reference.
    - **`Cli`** — each task is dispatched through a one-shot harness CLI subprocess (`codex exec`). **Codex** is the reference.
 
-   These two mechanisms underpin the three *user-facing* run modes in the README's [Run modes](../README.md#run-modes) section: **fully-interactive** rides on `InSession`; **headless** and **hybrid** both ride on `Cli`. `capabilities_for(harness)` (same file) maps each harness to the mechanism it wires today plus the narrow run-option capabilities the generic `run` preflight validates; `mechanism_for(harness)` reads from that table. That table is the single place the run-mode harness↔mechanism coupling lives.
+   These two mechanisms underpin the three *user-facing* run modes (the `RunMode` enum, same file) in the README's [Run modes](../README.md#run-modes) section: **fully-interactive** rides on `InSession`; **headless** and **hybrid** both ride on `Cli` (`RunMode::mechanism()`). The mechanism is selected per run by the `--run-mode` flag, not fixed by the harness: `resolve_run_mode(harness, requested)` defaults the mode per harness (Claude Code → interactive, Codex/OpenCode → hybrid) and rejects combinations a harness doesn't support. A single harness can therefore wire more than one mechanism — Claude Code wires both `InSession` (interactive) and `Cli` (hybrid + headless). `capabilities_for(harness)` (same file) carries the narrow run-option capabilities the generic `run` preflight validates.
 
 2. **Harness-adapter feature parity (the plug-in surface).** Each harness plugs into the runner through one impl of the **`HarnessAdapter`** trait in `src/adapters/harness.rs`, resolved by `adapter_for(harness)`. The trait's methods *are* the feature surface: skill-list rendering, transcript parsing, staged-skills dir, plan-mode profile, the write-guard hook, and the `Cli`-mechanism dispatch guidance. A harness reaches parity for a mechanism by implementing the trait methods that mechanism consumes.
 
@@ -42,9 +42,9 @@ Read these in order. Paths are relative to the repository root.
 
 | Source | What to look for |
 |--------|------------------|
-| `src/core/run_mode.rs` | `DispatchMechanism`, `capabilities_for`, and `mechanism_for`. The two dispatch mechanisms, which one your harness maps to today, and the focused run-option capabilities generic preflight validates |
-| `src/adapters/harness.rs` | The `HarnessAdapter` trait (the feature surface), the three impls, and `adapter_for`. The reference impls are `ClaudeCodeAdapter` (`InSession`) and `CodexAdapter` (`Cli`) — read the one that matches your mechanism |
-| `src/adapters/claude_code_transcript.rs` and `src/adapters/codex_transcript.rs` | The reference transcript parsers (`parse_transcript*` / `parse_codex_events*`) that the trait's `parse_transcript` / `parse_transcript_full` delegate to. A second harness translates its transcript shape into the same `ToolInvocation` list / `TranscriptSummary` |
+| `src/core/run_mode.rs` | `DispatchMechanism`, `RunMode`, `resolve_run_mode`, and `capabilities_for`. The two dispatch mechanisms, the user-facing run modes that select them (and which a harness supports), and the focused run-option capabilities generic preflight validates |
+| `src/adapters/harness.rs` | The `HarnessAdapter` trait (the feature surface), the three impls, and `adapter_for`. The reference impls are `ClaudeCodeAdapter` (`InSession` *and* `Cli` via hybrid/headless) and `CodexAdapter` (`Cli`) — read the one that matches your mechanism |
+| `src/adapters/claude_code_transcript.rs`, `src/adapters/claude_stream_json.rs`, and `src/adapters/codex_transcript.rs` | The reference transcript parsers. The `InSession` branch uses `parse_transcript*`; the `Cli` branch uses `parse_cli_events*` (default-delegating to `parse_transcript*`, so Codex's events parser serves both). Claude Code splits them: `parse_transcript*` parses the in-session subagent JSONL, `parse_cli_events*` parses `claude -p` stream-json (`claude_stream_json.rs`). A second harness translates its transcript shape into the same `ToolInvocation` list / `TranscriptSummary` |
 | `eval-magic --help` and the README's [Environment parity](../README.md#environment-parity) / [Harnesses](../README.md#harnesses) sections | The cross-harness breadcrumbs and the flag-by-flag reference. Treat the breadcrumbs as starting points, not specifications |
 
 Do not skim. The parity report you produce in Step 4 is only as good as the reference you internalized here.
@@ -55,7 +55,7 @@ Do not skim. The parity report you produce in Step 4 is only as good as the refe
 
 Enumerate, using ordinary file search, what already exists for your harness. Do not rely on memory — search the working tree. Useful heuristics:
 
-- Your harness's arm in `adapter_for` and `mechanism_for`, and its `HarnessAdapter` impl in `src/adapters/harness.rs`
+- Your harness's arm in `adapter_for`, its mode support in `resolve_run_mode`, and its `HarnessAdapter` impl in `src/adapters/harness.rs`
 - The harness name anywhere inside `src/` (especially `src/adapters/`, `src/core/context.rs`) and `profiles/`
 - A per-harness section in the README, or tests exercising the runner for the harness (`tests/`, e.g. `tests/run/codex.rs`, `tests/run/opencode.rs`)
 
@@ -65,10 +65,10 @@ Record every path you find. You will reference them in Step 4.
 
 ## Step 4a — Audit run-mode / mechanism parity
 
-State which `DispatchMechanism`(s) your harness supports today, per `mechanism_for`, and whether the mechanism's path is end-to-end:
+State which `DispatchMechanism`(s) your harness supports today, per `resolve_run_mode` (the run modes it accepts), and whether each mechanism's path is end-to-end:
 
-- **`InSession`** consumes: a subagents-dir transcript source (no `cli_events_filename`), and the in-session next-steps guidance (mechanism-level, not adapter-supplied).
-- **`Cli`** consumes these `HarnessAdapter` methods: `cli_events_filename` (the per-task transcript file the CLI writes), `cli_model_flag` (the harness-native model-selection flag, when supported), `cli_next_steps` (the post-`run` dispatch guidance), `cli_manifest_section` (the dispatch-manifest recipe), and `cli_judge_next_steps` (the post-`grade` / post-`ingest` judge dispatch recipe).
+- **`InSession`** consumes: `parse_transcript` / `parse_transcript_full` against a subagents-dir transcript source (no `cli_events_filename`), and the in-session next-steps guidance (mechanism-level, not adapter-supplied).
+- **`Cli`** consumes these `HarnessAdapter` methods: `parse_cli_events` / `parse_cli_events_full` (the events parser, defaulting to `parse_transcript*`), `cli_events_filename` (the per-task transcript file the CLI writes), `cli_model_flag` (the harness-native model-selection flag, when supported), `cli_next_steps` (the post-`run` dispatch guidance), `cli_manifest_section` (the dispatch-manifest recipe, gated on the resolved mechanism), and `cli_judge_next_steps` (the post-`grade` / post-`ingest` judge dispatch recipe).
 
 A harness reaches mechanism parity when its mechanism's path runs end-to-end: dispatch guidance is emitted, the transcript is found and parsed, and `record-runs` / `fill-transcripts` assemble records. The intended end state is each harness supporting every mechanism that translates to it (the README's [Run modes](../README.md#run-modes) matrix tracks current support).
 
@@ -79,16 +79,16 @@ For each `HarnessAdapter` method below, compare your harness's impl against the
 | Adapter capability | Trait method(s) | Reference behavior |
 |--------------------|-----------------|--------------------|
 | Realistic eval environment (skill staging) | `skills_dir`, `render_available_skills_block`, `rewrites_frontmatter_name`, `advertises_staged_slug_name`, `skill_surface_phrase`, `skill_unresolved_phrase` | Stage skills under the harness's project-local dir and render the discoverable-skills block in the harness's **native** presentation, so a dispatch reads like a real session in that harness, not an eval. Claude Code: `.claude/skills/` + `The following skills are available for use with the Skill tool:`. The `--bootstrap` `<session-start-context>` wrapper and the slug-disambiguation framing are shared in `src/cli/run/dispatch.rs` |
-| Skill-eval transcript adapter | `parse_transcript`, `parse_transcript_full` | Translate the harness's persisted transcript into the same `ToolInvocation` list and `TranscriptSummary` (final message, tool invocations, deduped usage/timing) every downstream stage consumes |
-| Skill-eval auto-record (run/timing assembly) | (consumes `parse_transcript_full` + `cli_events_filename`) | `src/pipeline/record_runs.rs` assembles each task's `run.json` + `timing.json` from disk: carry-over fields from `dispatch.json`, `final_message` from `outputs/final-message.md`, and tool invocations/tokens/duration from the parsed transcript. A harness closes this gap by supplying the transcript its `parse_transcript_full` consumes (the portable fallback — hand-authored records against `run-record.schema.json` — is unchanged) |
-| Cli model selection | `cli_model_flag`, `cli_next_steps`, `cli_manifest_section`, `cli_judge_next_steps` | For one-shot CLI dispatch, `run --agent-model` is rendered into the agent command recipe and `run --judge-model` becomes the default model in `judge-tasks.json`; assertion-level `llm_judge.model` remains the most specific override. Codex is the reference: `cli_model_flag` returns `-m`, agent recipes render `codex exec -m <model>`, and judge recipes read each task's resolved `model` and pass `-m "$model"` only when present |
+| Skill-eval transcript adapter | `parse_transcript`, `parse_transcript_full` (InSession); `parse_cli_events`, `parse_cli_events_full` (Cli) | Translate the harness's persisted transcript into the same `ToolInvocation` list and `TranscriptSummary` (final message, tool invocations, deduped usage/timing) every downstream stage consumes. The Cli pair defaults to the InSession pair, so a Cli-only harness implements one parser; a harness wiring **both** mechanisms (Claude Code) implements each — `parse_transcript*` for its in-session transcript, `parse_cli_events*` for its CLI events file |
+| Skill-eval auto-record (run/timing assembly) | (consumes `parse_transcript_full` / `parse_cli_events_full` + `cli_events_filename`) | `src/pipeline/record_runs.rs` assembles each task's `run.json` + `timing.json` from disk: carry-over fields from `dispatch.json`, `final_message` from `outputs/final-message.md` (falling back to the transcript's final text — the primary path for `claude -p`, which has no `--output-last-message`), and tool invocations/tokens/duration from the parsed transcript. A harness closes this gap by supplying the transcript its parser consumes (the portable fallback — hand-authored records against `run-record.schema.json` — is unchanged) |
+| Cli model selection | `cli_model_flag`, `cli_next_steps`, `cli_manifest_section`, `cli_judge_next_steps` | For one-shot CLI dispatch, `run --agent-model` is rendered into the agent command recipe and `run --judge-model` becomes the default model in `judge-tasks.json`; assertion-level `llm_judge.model` remains the most specific override. Codex is the reference: `cli_model_flag` returns `-m`, agent recipes render `codex --ask-for-approval never exec ... -m <model>`, and judge recipes read each task's resolved `model` and pass `-m "$model"` only when present |
 | Eval subagent write enforcement | `install_guard` | Opt-in `--guard` stages a pre-tool hook (`src/sandbox/`) that *denies* subagent writes/installs outside the eval sandbox while dispatches run. Portable fallback for every harness: the `eval-magic detect-stray-writes` post-pass (`src/pipeline/detect_stray_writes.rs`) flags out-of-bounds writes from the parsed transcript after the fact |
 | Eval plan-mode operating context | `plan_mode_profile`, `render_plan_mode_context` | Opt-in `--plan-mode` injects the harness's `profiles/<harness>/plan-mode.md` (embedded at compile time) as a `<system-reminder>` operating-context layer in every dispatch. Claude Code and Codex profiles exist today; a harness with no profile has no `--plan-mode` and an unchanged dispatch contract |
 | Harness-details operator guide | (docs, not a trait method) | The README's per-harness section, e.g. [Claude Code](../README.md#claude-code-fully-wired) |
 
 **Note on the transcript adapter (raised bar).** Baseline eval suites use `transcript_check` assertions — deterministic regex checks against a run's tool invocations (e.g. "a test command ran", "the sibling skill was loaded"). These only grade when `parse_transcript` is implemented for your harness. A harness without it still functions: those assertions grade as *unverifiable* and the `llm_judge` assertions carry the substantive measurement. But adapter richness is an explicit parity target, not optional polish — implementing or enriching `parse_transcript*` lets more of a baseline suite grade mechanically. Treat it as a goal to aim at, not a box already checked.
 
-**Note on write enforcement (parity goal).** Eval subagents are instructed to write only inside their `outputs/` dir, but nothing in the portable contract *enforces* it — a misbehaving subagent can edit the real repo or install packages, silently tainting the run. Two layers address this: the portable `detect-stray-writes` post-pass (available to every harness, since it works off the same parsed transcript) and an opt-in harness-native `install_guard` that stages a pre-tool hook to *block* the write before it happens. Claude Code and Codex both wire this through their `PreToolUse` hook surfaces. **Harness-level tool enforcement is an explicit parity goal, not optional polish.** A harness that can express a pre-tool guard (a hook, a permission rule, a sandboxed cwd) should wire `install_guard`; until then, `detect-stray-writes` is the honest fallback.
+**Note on write enforcement (parity goal).** Eval subagents are instructed to write only inside their `outputs/` dir, but nothing in the portable contract *enforces* it — a misbehaving subagent can edit the real repo or install packages, silently tainting the run. Two layers address this: the portable `detect-stray-writes` post-pass (available to every harness, since it works off the same parsed transcript) and an opt-in harness-native `install_guard` that stages a pre-tool hook to *block* the write before it happens. Claude Code and Codex both wire this through their `PreToolUse` hook surfaces — for Claude Code across *both* mechanisms, since `claude -p` loads the same project-local `settings.local.json` hook from the env cwd each dispatch runs in. **Harness-level tool enforcement is an explicit parity goal, not optional polish.** A harness that can express a pre-tool guard (a hook, a permission rule, a sandboxed cwd) should wire `install_guard`; until then, `detect-stray-writes` is the honest fallback.
 
 **Note on plan-mode fidelity (residual parity goal).** `--plan-mode` injects a harness's *verbatim* plan-mode procedure as operating context, the closest a harness's eval runner can get to reproducing the wild failure where a real plan mode makes loading a skill feel redundant. It is **not** the real mode: it is still text the dispatched subagent reads, not a state the harness places it under, so a pass remains necessary-not-sufficient (the seeding ceiling is explained in the [`slow-powers`](https://github.com/slowdini/slow-powers) `evaluating-skills` skill). A harness that can dispatch an eval subagent *into* its own plan/research mode would close this gap; `--plan-mode` (a profile + renderer) is the approximation every harness can reach in the meantime.
 
@@ -98,7 +98,7 @@ Surface your findings inline using this template:
 ## Eval-Magic Parity Report: <harness>
 
 ### Run-mode / mechanism parity
-- Mechanism(s) wired: <InSession | Cli | both> (per mechanism_for)
+- Mechanism(s) wired: <InSession | Cli | both> (per resolve_run_mode)
 - End-to-end? <yes / partial — what breaks>
 
 ### Harness-adapter feature parity
diff --git a/profiles/claude-code/runbook.md b/profiles/claude-code/runbook.md
new file mode 100644
index 0000000..467167d
--- /dev/null
+++ b/profiles/claude-code/runbook.md
@@ -0,0 +1,71 @@
+# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}})
+
+You are an agent in a **fresh, isolated** session. Follow this runbook top to bottom to run
+the eval and produce `benchmark.json`. Everything you need is in this iteration directory —
+you should not need anything from the surrounding repo.
+
+- **Skill under test:** {{SKILL_NAME}}
+- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}`
+- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`)
+
+The two conditions run as **separate batches** in this one session: dispatch every subagent of
+one batch, wait for them **all** to return, then switch conditions before dispatching the next.
+Never interleave the batches — `switch-condition` removes the off-condition's staged skill, and a
+subagent still in flight could observe a half-removed skill or read the wrong one.
+
+## 1. Dispatch the `{{COND_A}}` batch
+
+{{DISPATCH_COND_A}}
+
+Wait for **every** one of these subagents to return before continuing.
+
+## 2. Switch to the `{{COND_B}}` condition
+
+This removes the `{{COND_A}}` staged skill so the `{{COND_B}}` batch cannot read it:
+
+```
+{{SWITCH_CMD}}
+```
+
+## 3. Dispatch the `{{COND_B}}` batch
+
+{{DISPATCH_COND_B}}
+
+Wait for **every** one of these subagents to return before continuing.
+
+## 4. Ingest
+
+```
+{{INGEST_CMD}}
+```
+
+`ingest` records each run, backfills transcripts, scans for stray writes, and grades every
+mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself.
+
+## 5. Dispatch the judge subagents, then finalize
+
+Dispatch each judge task that `ingest` listed as its own subagent, the same way you dispatched
+the eval batches — pass its `agent_description` verbatim — then merge the verdicts and aggregate:
+
+```
+{{FINALIZE_CMD}}
+```
+
+## 6. Read the result
+
+`finalize` writes the cross-condition benchmark to:
+
+```
+{{BENCHMARK_PATH}}
+```
+
+Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas. This is
+the artifact the prep session resumes on.
+
+## 7. Tear down
+
+When you are done, remove the staged skills (and the write guard, if armed):
+
+```
+{{TEARDOWN_CMD}}
+```
diff --git a/profiles/shared/runbook-headless.md b/profiles/shared/runbook-headless.md
new file mode 100644
index 0000000..d9af8db
--- /dev/null
+++ b/profiles/shared/runbook-headless.md
@@ -0,0 +1,40 @@
+# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}}, {{HARNESS}})
+
+This runbook is for a human driving the run from a terminal. Work from this iteration directory
+and copy-paste each step. The workspace is self-contained — you should not need the surrounding
+repo.
+
+- **Skill under test:** {{SKILL_NAME}}
+- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}`
+- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`)
+
+## 1. Dispatch the eval agents, then ingest
+{{DISPATCH_RECIPE}}
+
+`ingest` records each run, backfills transcripts, scans for stray writes, and grades every
+mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself.
+
+## 2. Dispatch the judge agents, then finalize
+{{JUDGE_RECIPE}}
+
+Then merge the verdicts and aggregate:
+
+```
+{{FINALIZE_CMD}}
+```
+
+## 3. Read the result
+
+`finalize` writes the cross-condition benchmark to:
+
+```
+{{BENCHMARK_PATH}}
+```
+
+Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas.
+
+## 4. Tear down
+
+```
+{{TEARDOWN_CMD}}
+```
diff --git a/schema/evals.schema.json b/schema/evals.schema.json
index bccaecf..ef9c100 100644
--- a/schema/evals.schema.json
+++ b/schema/evals.schema.json
@@ -48,6 +48,12 @@
           "minimum": 1,
           "description": "Runs per condition for this eval, for variance reduction; overrides the --runs flag. Defaults to the flag's value (1 unless raised)."
         },
+        "isolation": {
+          "type": "string",
+          "enum": ["shared", "isolated"],
+          "default": "shared",
+          "description": "Isolation hint for run batching. 'isolated' forces this eval into its own group so it never shares an env with another eval (for confounds the framework can't detect from fixture conflicts, e.g. the agent mutates a shared fixture). Defaults to 'shared'. Evals whose fixtures conflict are auto-isolated regardless."
+        },
         "skill_should_trigger": {
           "type": "boolean",
           "default": true,
diff --git a/src/adapters/claude_cli.rs b/src/adapters/claude_cli.rs
new file mode 100644
index 0000000..2377902
--- /dev/null
+++ b/src/adapters/claude_cli.rs
@@ -0,0 +1,136 @@
+//! Claude Code `claude -p` command rendering for `DispatchMechanism::Cli`
+//! guidance (hybrid / headless run modes).
+//!
+//! Differences from the Codex recipe, all forced by the `claude` CLI:
+//! `--output-format stream-json` requires `--verbose` in `-p` mode; there is no
+//! `--cd` flag, so the dispatch runs from the env dir (`cd <eval-root> &&`);
+//! and there is no `--output-last-message`, so the final message is recovered
+//! from the stream-json `result` event by the transcript adapter rather than
+//! written to a file. `</dev/null` detaches stdin so a permission prompt cannot
+//! block on a TTY and piped task data cannot become extra prompt context.
+
+use super::cli_command::render_cli_model_arg;
+
+/// Copy/pasteable Claude Code dispatch command template: one `claude -p` run from `<eval-root>`, stdout to `claude-events.jsonl`, stderr to `claude-stderr.log`.
+pub(crate) fn claude_exec_command_template(
+    model_flag: Option<&str>,
+    agent_model: Option<&str>,
+) -> String {
+    let model_arg = render_cli_model_arg(model_flag, agent_model);
+    [
+        format!(
+            "cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits{model_arg} \\"
+        ),
+        "  \"Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response your closing summary.\" \\".to_string(),
+        "  </dev/null \\".to_string(),
+        "  > <outputs_dir>/claude-events.jsonl \\".to_string(),
+        "  2> <outputs_dir>/claude-stderr.log".to_string(),
+    ]
+    .join("\n")
+}
+
+/// Parallel dispatch recipe: `jq` streams `dispatch.json` tasks to `xargs -P "$JOBS"` (default 4), which runs one `claude -p` per task from its `eval_root`.
+pub(crate) fn claude_parallel_dispatch_recipe(
+    model_flag: Option<&str>,
+    agent_model: Option<&str>,
+) -> String {
+    let model_arg = render_cli_model_arg(model_flag, agent_model);
+    [
+        "JOBS=${JOBS:-4}".to_string(),
+        "jq -j '.tasks[] | [.eval_root, .dispatch_prompt_path, .outputs_dir] | @tsv + \"\\u0000\"' dispatch.json | \\".to_string(),
+        "  xargs -0 -P \"$JOBS\" -I{} sh -c '".to_string(),
+        "    eval_root=\"$(printf \"%s\" \"$1\" | cut -f1)\"".to_string(),
+        "    prompt_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"".to_string(),
+        "    outputs_dir=\"$(printf \"%s\" \"$1\" | cut -f3)\"".to_string(),
+        "    mkdir -p \"$outputs_dir\"".to_string(),
+        format!(
+            "    cd \"$eval_root\" && claude -p --output-format stream-json --verbose --permission-mode acceptEdits{model_arg} \\"
+        ),
+        "      \"Read the file at $prompt_path and follow its instructions exactly. When you finish, make your final response your closing summary.\" \\".to_string(),
+        "      </dev/null \\".to_string(),
+        "      > \"$outputs_dir/claude-events.jsonl\" \\".to_string(),
+        "      2> \"$outputs_dir/claude-stderr.log\"".to_string(),
+        "  ' sh {}".to_string(),
+    ]
+    .join("\n")
+}
+
+/// Judge dispatch recipe over `judge-tasks.json`, one `claude -p` per task; `model_flag` falls back to `--model` and is passed only for tasks whose `model` is non-empty.
+pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
+    let model_flag = model_flag.unwrap_or("--model");
+    [
+        "Dispatch each judge task from judge-tasks.json with:".to_string(),
+        String::new(),
+        "```bash".to_string(),
+        "JOBS=${JOBS:-4}".to_string(),
+        "jq -j '.tasks[] | [.dispatch_prompt_path, .response_path, (.model // \"\")] | @tsv + \"\\u0000\"' judge-tasks.json | \\".to_string(),
+        "  xargs -0 -P \"$JOBS\" -I{} sh -c '".to_string(),
+        "    prompt_path=\"$(printf \"%s\" \"$1\" | cut -f1)\"".to_string(),
+        "    response_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"".to_string(),
+        "    model=\"$(printf \"%s\" \"$1\" | cut -f3)\"".to_string(),
+        "    response_base=\"${response_path%.json}\"".to_string(),
+        "    mkdir -p \"$(dirname \"$response_path\")\"".to_string(),
+        "    model_arg=\"\"; [ -n \"$model\" ] && model_arg=\"".to_string()
+            + model_flag
+            + " $model\"",
+        "    cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\".to_string(),
+        "      \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
+        "      </dev/null \\".to_string(),
+        "      > \"$response_base.claude-events.jsonl\" \\".to_string(),
+        "      2> \"$response_base.claude-stderr.log\"".to_string(),
+        "  ' sh {}".to_string(),
+        "```".to_string(),
+    ]
+    .join("\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{
+        claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe,
+    };
+
+    #[test]
+    fn exec_template_carries_required_stream_json_flags() {
+        let cmd = claude_exec_command_template(Some("--model"), None);
+        assert!(cmd.contains("claude -p"), "{cmd}");
+        assert!(cmd.contains("--output-format stream-json"), "{cmd}");
+        // `--output-format stream-json` requires `--verbose` in `-p` mode, so both must appear.
+        assert!(cmd.contains("--verbose"), "{cmd}");
+        assert!(cmd.contains("--permission-mode acceptEdits"), "{cmd}");
+        assert!(cmd.contains("> <outputs_dir>/claude-events.jsonl"), "{cmd}");
+        assert!(cmd.contains("2> <outputs_dir>/claude-stderr.log"), "{cmd}");
+        assert!(cmd.contains("</dev/null"), "{cmd}");
+        // claude has no --cd flag, so the template must `cd` into the env dir first.
+        assert!(cmd.contains("cd <eval-root>"), "{cmd}");
+        assert!(cmd.contains("<dispatch_prompt_path>"), "{cmd}");
+        // claude has no --output-last-message; final text comes from the stream-json `result` event, not a file.
+        assert!(!cmd.contains("--output-last-message"), "{cmd}");
+        assert!(!cmd.contains("final-message.md"), "{cmd}");
+    }
+
+    #[test]
+    fn exec_template_includes_model_only_when_declared() {
+        let with = claude_exec_command_template(Some("--model"), Some("opus"));
+        assert!(with.contains("--model opus"), "{with}");
+        let without = claude_exec_command_template(Some("--model"), None);
+        assert!(!without.contains("--model "), "{without}");
+    }
+
+    #[test]
+    fn parallel_recipe_drives_claude_p_per_task() {
+        let recipe = claude_parallel_dispatch_recipe(Some("--model"), Some("sonnet"));
+        assert!(recipe.contains("claude -p"), "{recipe}");
+        assert!(recipe.contains("claude-events.jsonl"), "{recipe}");
+        assert!(recipe.contains("dispatch.json"), "{recipe}");
+        assert!(recipe.contains("--model sonnet"), "{recipe}");
+    }
+
+    #[test]
+    fn judge_recipe_drives_claude_p() {
+        let recipe = claude_judge_dispatch_recipe(Some("--model"));
+        assert!(recipe.contains("claude -p"), "{recipe}");
+        assert!(recipe.contains("judge-tasks.json"), "{recipe}");
+        assert!(recipe.contains("response_path"), "{recipe}");
+    }
+}
diff --git a/src/adapters/claude_code_transcript.rs b/src/adapters/claude_code_transcript.rs
index e94b1fb..c5edad9 100644
--- a/src/adapters/claude_code_transcript.rs
+++ b/src/adapters/claude_code_transcript.rs
@@ -16,11 +16,11 @@ use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 
 #[derive(Debug, Deserialize)]
-struct UsageRecord {
-    input_tokens: Option<i64>,
-    output_tokens: Option<i64>,
-    cache_creation_input_tokens: Option<i64>,
-    cache_read_input_tokens: Option<i64>,
+pub(crate) struct UsageRecord {
+    pub(crate) input_tokens: Option<i64>,
+    pub(crate) output_tokens: Option<i64>,
+    pub(crate) cache_creation_input_tokens: Option<i64>,
+    pub(crate) cache_read_input_tokens: Option<i64>,
 }
 
 #[derive(Debug, Deserialize)]
@@ -32,9 +32,9 @@ struct Message {
 }
 
 #[derive(Debug, Deserialize)]
-struct TranscriptRecord {
+pub(crate) struct TranscriptRecord {
     #[serde(rename = "type")]
-    record_type: Option<String>,
+    pub(crate) record_type: Option<String>,
     timestamp: Option<String>,
     message: Option<Message>,
 }
@@ -83,7 +83,7 @@ fn stringify_result(content: Option<&Value>) -> String {
     }
 }
 
-fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
+pub(crate) fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
     let raw = fs::read_to_string(jsonl_path)?;
     let mut records = Vec::new();
     for line in raw.split('\n') {
@@ -98,7 +98,7 @@ fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
     Ok(records)
 }
 
-fn extract_invocations(records: &[TranscriptRecord]) -> Vec<ToolInvocation> {
+pub(crate) fn extract_invocations(records: &[TranscriptRecord]) -> Vec<ToolInvocation> {
     let mut invocations: Vec<ToolInvocation> = Vec::new();
     let mut index_by_id: HashMap<String, usize> = HashMap::new();
 
@@ -151,6 +151,27 @@ pub fn parse_transcript(jsonl_path: &Path) -> io::Result<Vec<ToolInvocation>> {
     Ok(extract_invocations(&read_records(jsonl_path)?))
 }
 
+/// Text of the last `assistant` record holding at least one `text` content block, with that
+/// record's text blocks joined by `\n`; `None` when no record qualifies. Shared with the `-p`
+/// stream-json parser as its final-message fallback when `result` is absent or errored.
+pub(crate) fn last_assistant_text(records: &[TranscriptRecord]) -> Option<String> {
+    let mut final_text: Option<String> = None;
+    for record in records {
+        if record.record_type.as_deref() != Some("assistant") {
+            continue;
+        }
+        let texts: Vec<&str> = content_blocks(&record.message)
+            .iter()
+            .filter(|b| b.get("type").and_then(Value::as_str) == Some("text"))
+            .filter_map(|b| b.get("text").and_then(Value::as_str))
+            .collect();
+        if !texts.is_empty() {
+            final_text = Some(texts.join("\n"));
+        }
+    }
+    final_text
+}
+
 /// A transcript boiled down to the artifacts the pipeline needs.
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct TranscriptSummary {
@@ -179,7 +200,6 @@ pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result<TranscriptSummary>
     let mut first_ts: Option<i64> = None;
     let mut last_ts: Option<i64> = None;
     let mut timestamp_count = 0usize;
-    let mut final_text: Option<String> = None;
 
     for record in &records {
         if let Some(ts_str) = &record.timestamp
@@ -201,17 +221,10 @@ pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result<TranscriptSummary>
         {
             usage_by_id.insert(id, usage);
         }
-
-        let texts: Vec<&str> = content_blocks(&record.message)
-            .iter()
-            .filter(|b| b.get("type").and_then(Value::as_str) == Some("text"))
-            .filter_map(|b| b.get("text").and_then(Value::as_str))
-            .collect();
-        if !texts.is_empty() {
-            final_text = Some(texts.join("\n"));
-        }
     }
 
+    let final_text = last_assistant_text(&records);
+
     let total_tokens = if usage_by_id.is_empty() {
         None
     } else {
diff --git a/src/adapters/claude_stream_json.rs b/src/adapters/claude_stream_json.rs
new file mode 100644
index 0000000..9d4c25b
--- /dev/null
+++ b/src/adapters/claude_stream_json.rs
@@ -0,0 +1,232 @@
+//! Claude Code `-p --output-format stream-json` transcript parsing.
+//!
+//! Parses the newline-delimited JSON event stream that `claude -p
+//! --output-format stream-json --verbose` writes (captured per task as
+//! `outputs/claude-events.jsonl`). The `assistant`/`user` events wrap a full
+//! Anthropic Messages object under `message`, so tool-call extraction is shared
+//! with the in-session [`claude_code_transcript`](super::claude_code_transcript)
+//! parser. The differences are all in the envelope: there are no per-line
+//! timestamps, and a terminal `result` event carries the authoritative final
+//! text, wall-clock duration, and token usage. `system`, `rate_limit_event`, and
+//! any other non-message events are ignored (they don't deserialize into an
+//! assistant/user record, so the shared extractor skips them).
+
+use std::fs;
+use std::io;
+use std::path::Path;
+
+use crate::core::ToolInvocation;
+
+use super::TranscriptSummary;
+use super::claude_code_transcript::{
+    TranscriptRecord, UsageRecord, extract_invocations, last_assistant_text, read_records,
+};
+
+/// The terminal `{"type":"result", …}` event of a `-p` stream-json run.
+#[derive(Debug, serde::Deserialize)]
+struct ResultEvent {
+    #[serde(default)]
+    result: Option<String>,
+    #[serde(default)]
+    duration_ms: Option<i64>,
+    #[serde(default)]
+    is_error: Option<bool>,
+    #[serde(default)]
+    usage: Option<UsageRecord>,
+}
+
+/// Extract the ordered tool invocations from a stream-json event file by running
+/// the shared in-session extractor; non-message events yield records it skips.
+pub fn parse_claude_stream_json(path: &Path) -> io::Result<Vec<ToolInvocation>> {
+    read_records(path).map(|records| extract_invocations(&records))
+}
+
+/// Parse the event stream into a full [`TranscriptSummary`]. Duration and token
+/// totals are read from the terminal `result` event when one parses (even an
+/// errored one) and are `None` otherwise; final text is the `result` text of a
+/// non-errored event, falling back to the last assistant message's text.
+pub fn parse_claude_stream_json_full(path: &Path) -> io::Result<TranscriptSummary> {
+    let raw = fs::read_to_string(path)?;
+    let mut records: Vec<TranscriptRecord> = Vec::new();
+    let mut result_event: Option<ResultEvent> = None;
+    for line in raw.split('\n') {
+        if line.trim().is_empty() {
+            continue;
+        }
+        // Skip malformed lines rather than failing the whole parse.
+        let Ok(record) = serde_json::from_str::<TranscriptRecord>(line) else {
+            continue;
+        };
+        if record.record_type.as_deref() == Some("result") {
+            result_event = serde_json::from_str::<ResultEvent>(line).ok();
+        }
+        records.push(record);
+    }
+
+    let total_tokens = result_event
+        .as_ref()
+        .and_then(|e| e.usage.as_ref())
+        .map(|u| {
+            u.input_tokens.unwrap_or(0)
+                + u.output_tokens.unwrap_or(0)
+                + u.cache_creation_input_tokens.unwrap_or(0)
+                + u.cache_read_input_tokens.unwrap_or(0)
+        });
+    let duration_ms = result_event.as_ref().and_then(|e| e.duration_ms);
+    let final_text = match &result_event {
+        Some(ev) if ev.is_error != Some(true) => {
+            ev.result.clone().or_else(|| last_assistant_text(&records))
+        }
+        _ => last_assistant_text(&records),
+    };
+
+    Ok(TranscriptSummary {
+        tool_invocations: extract_invocations(&records),
+        total_tokens,
+        duration_ms,
+        final_text,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use serde_json::{Value, json};
+    use std::fs;
+    use std::path::Path;
+    use tempfile::TempDir;
+
+    fn write_jsonl(path: &Path, lines: &[Value]) {
+        let body = lines
+            .iter()
+            .map(|l| l.to_string())
+            .collect::<Vec<_>>()
+            .join("\n");
+        fs::write(path, format!("{body}\n")).unwrap();
+    }
+
+    fn usage() -> Value {
+        json!({
+            "input_tokens": 4932,
+            "output_tokens": 139,
+            "cache_creation_input_tokens": 8287,
+            "cache_read_input_tokens": 33490,
+        })
+    }
+
+    #[test]
+    fn extracts_tool_use_and_result_skipping_non_message_events() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("events.jsonl");
+        write_jsonl(
+            &path,
+            &[
+                json!({"type": "system", "subtype": "init", "cwd": "/env", "model": "claude-opus-4-8", "tools": ["Bash"]}),
+                json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "usage": usage(), "content": [
+                    {"type": "tool_use", "id": "toolu_1", "name": "Bash", "input": {"command": "ls"}}
+                ]}}),
+                json!({"type": "rate_limit_event", "rate_limit_info": {}}),
+                json!({"type": "user", "message": {"role": "user", "content": [
+                    {"type": "tool_result", "tool_use_id": "toolu_1", "content": "a.txt"}
+                ]}}),
+                json!({"type": "result", "subtype": "success", "is_error": false, "result": "Done", "duration_ms": 5637, "usage": usage()}),
+            ],
+        );
+
+        let result = super::parse_claude_stream_json(&path).unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].name, "Bash");
+        assert_eq!(result[0].ordinal, 0);
+        assert_eq!(result[0].args, Some(json!({"command": "ls"})));
+        assert_eq!(result[0].result, Some(Value::String("a.txt".into())));
+    }
+
+    #[test]
+    fn result_event_supplies_final_text_duration_and_tokens() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("events.jsonl");
+        write_jsonl(
+            &path,
+            &[
+                json!({"type": "system", "subtype": "init"}),
+                json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [{"type": "text", "text": "working"}]}}),
+                json!({"type": "result", "subtype": "success", "is_error": false, "result": "Done", "duration_ms": 5637, "usage": usage()}),
+            ],
+        );
+
+        let summary = super::parse_claude_stream_json_full(&path).unwrap();
+        assert_eq!(summary.final_text, Some("Done".into()));
+        assert_eq!(summary.duration_ms, Some(5637));
+        // 4932 + 139 + 8287 + 33490
+        assert_eq!(summary.total_tokens, Some(46848));
+    }
+
+    #[test]
+    fn skill_tool_use_is_preserved_for_meta_check() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("events.jsonl");
+        write_jsonl(
+            &path,
+            &[
+                json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [
+                    {"type": "tool_use", "id": "toolu_s", "name": "Skill", "input": {"command": "slow-powers-eval-mr-review"}}
+                ]}}),
+                json!({"type": "result", "subtype": "success", "is_error": false, "result": "ok", "duration_ms": 10, "usage": usage()}),
+            ],
+        );
+        let result = super::parse_claude_stream_json(&path).unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].name, "Skill");
+        assert_eq!(
+            result[0].args,
+            Some(json!({"command": "slow-powers-eval-mr-review"}))
+        );
+    }
+
+    #[test]
+    fn final_text_falls_back_to_last_assistant_text_when_result_is_error() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("events.jsonl");
+        write_jsonl(
+            &path,
+            &[
+                json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [{"type": "text", "text": "partial work"}]}}),
+                json!({"type": "result", "subtype": "error_during_execution", "is_error": true, "result": "Execution error", "duration_ms": 12, "usage": usage()}),
+            ],
+        );
+        let summary = super::parse_claude_stream_json_full(&path).unwrap();
+        assert_eq!(summary.final_text, Some("partial work".into()));
+    }
+
+    #[test]
+    fn skips_malformed_jsonl_lines() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("events.jsonl");
+        let good = json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [
+            {"type": "tool_use", "id": "toolu_1", "name": "Read", "input": {"file_path": "/tmp/x"}}
+        ]}});
+        let result_line = json!({"type": "result", "is_error": false, "result": "Done", "duration_ms": 1, "usage": usage()});
+        let body = format!("{good}\nnot valid json\n{result_line}\n");
+        fs::write(&path, body).unwrap();
+
+        let result = super::parse_claude_stream_json(&path).unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].name, "Read");
+    }
+
+    #[test]
+    fn null_duration_and_tokens_when_no_result_event() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("events.jsonl");
+        write_jsonl(
+            &path,
+            &[
+                json!({"type": "system", "subtype": "init"}),
+                json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [{"type": "text", "text": "incomplete"}]}}),
+            ],
+        );
+        let summary = super::parse_claude_stream_json_full(&path).unwrap();
+        assert_eq!(summary.duration_ms, None);
+        assert_eq!(summary.total_tokens, None);
+        assert_eq!(summary.final_text, Some("incomplete".into()));
+    }
+}
diff --git a/src/adapters/cli_command.rs b/src/adapters/cli_command.rs
new file mode 100644
index 0000000..ab841cd
--- /dev/null
+++ b/src/adapters/cli_command.rs
@@ -0,0 +1,62 @@
+//! Shared rendering helpers for `DispatchMechanism::Cli` command templates
+//! (Codex's `codex exec`, Claude Code's `claude -p`).
+
+/// Quote a value for a POSIX shell unless it is non-empty and made only of a
+/// conservative safe set; otherwise single-quote it, escaping embedded quotes.
+pub(crate) fn shell_quote_arg(value: &str) -> String {
+    let safe = |b: u8| b.is_ascii_alphanumeric() || b"-_./:@+".contains(&b);
+    // An empty value must render as `''`, or the argument silently disappears.
+    if !value.is_empty() && value.bytes().all(safe) {
+        return value.to_string();
+    }
+    format!("'{}'", value.replace('\'', "'\"'\"'"))
+}
+
+/// Build the ` <flag> <model>` suffix appended to a CLI dispatch command. Yields
+/// an empty string if the model is absent/blank or the adapter has no model flag.
+pub(crate) fn render_cli_model_arg(flag: Option<&str>, model: Option<&str>) -> String {
+    match (flag, model) {
+        // Whitespace-only models count as undeclared.
+        (Some(flag_name), Some(model_name)) if !model_name.trim().is_empty() => {
+            format!(" {flag_name} {}", shell_quote_arg(model_name))
+        }
+        _ => String::new(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{render_cli_model_arg, shell_quote_arg};
+
+    #[test]
+    fn shell_quote_leaves_safe_values_unquoted() {
+        assert_eq!(shell_quote_arg("gpt-5-mini"), "gpt-5-mini");
+        assert_eq!(shell_quote_arg("claude-opus-4-8"), "claude-opus-4-8");
+        assert_eq!(shell_quote_arg("a/b:c@d+e_f.g"), "a/b:c@d+e_f.g");
+    }
+
+    #[test]
+    fn shell_quote_wraps_values_with_specials() {
+        assert_eq!(shell_quote_arg("a b"), "'a b'");
+        assert_eq!(shell_quote_arg("a'b"), "'a'\"'\"'b'");
+    }
+
+    #[test]
+    fn render_model_arg_empty_when_unset() {
+        assert_eq!(render_cli_model_arg(Some("--model"), None), "");
+        assert_eq!(render_cli_model_arg(Some("--model"), Some("   ")), "");
+        assert_eq!(render_cli_model_arg(None, Some("opus")), "");
+    }
+
+    #[test]
+    fn render_model_arg_renders_flag_and_quoted_model() {
+        assert_eq!(
+            render_cli_model_arg(Some("--model"), Some("opus")),
+            " --model opus"
+        );
+        assert_eq!(
+            render_cli_model_arg(Some("-m"), Some("gpt 5")),
+            " -m 'gpt 5'"
+        );
+    }
+}
diff --git a/src/adapters/codex_cli.rs b/src/adapters/codex_cli.rs
index 6dd35c3..8b6935a 100644
--- a/src/adapters/codex_cli.rs
+++ b/src/adapters/codex_cli.rs
@@ -1,23 +1,6 @@
 //! Codex CLI command rendering for `DispatchMechanism::Cli` guidance.
 
-fn shell_quote_arg(value: &str) -> String {
-    if value.bytes().all(|b| {
-        b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.' | b'/' | b':' | b'@' | b'+')
-    }) {
-        return value.to_string();
-    }
-    format!("'{}'", value.replace('\'', "'\"'\"'"))
-}
-
-fn render_cli_model_arg(flag: Option<&str>, model: Option<&str>) -> String {
-    let Some(model) = model.filter(|m| !m.trim().is_empty()) else {
-        return String::new();
-    };
-    let Some(flag) = flag else {
-        return String::new();
-    };
-    format!(" {flag} {}", shell_quote_arg(model))
-}
+use super::cli_command::render_cli_model_arg;
 
 /// Copy/pasteable Codex dispatch command template. Stdin is detached so a
 /// surrounding `xargs`/pipe cannot be treated as extra prompt context.
@@ -34,7 +17,7 @@ pub(crate) fn codex_exec_command_template(
     let model_arg = render_cli_model_arg(model_flag, agent_model);
     [
         format!(
-            "codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never{hook_trust}{model_arg} --json \\"
+            "codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust}{model_arg} --json \\"
         ),
         "  --output-last-message <outputs_dir>/final-message.md \\".to_string(),
         "  \"Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response exactly the same text you wrote to <outputs_dir>/final-message.md.\" \\".to_string(),
@@ -58,13 +41,14 @@ pub(crate) fn codex_parallel_dispatch_recipe(
     let model_arg = render_cli_model_arg(model_flag, agent_model);
     [
         "JOBS=${JOBS:-4}".to_string(),
-        "jq -j '.tasks[] | [.dispatch_prompt_path, .outputs_dir] | @tsv + \"\\u0000\"' dispatch.json | \\".to_string(),
+        "jq -j '.tasks[] | [.eval_root, .dispatch_prompt_path, .outputs_dir] | @tsv + \"\\u0000\"' dispatch.json | \\".to_string(),
         "  xargs -0 -P \"$JOBS\" -I{} sh -c '".to_string(),
-        "    prompt_path=\"$(printf \"%s\" \"$1\" | cut -f1)\"".to_string(),
-        "    outputs_dir=\"$(printf \"%s\" \"$1\" | cut -f2)\"".to_string(),
+        "    eval_root=\"$(printf \"%s\" \"$1\" | cut -f1)\"".to_string(),
+        "    prompt_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"".to_string(),
+        "    outputs_dir=\"$(printf \"%s\" \"$1\" | cut -f3)\"".to_string(),
         "    mkdir -p \"$outputs_dir\"".to_string(),
         format!(
-            "    codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never{hook_trust}{model_arg} --json \\"
+            "    codex --ask-for-approval never exec --cd \"$eval_root\" --sandbox workspace-write{hook_trust}{model_arg} --json \\"
         ),
         "      --output-last-message \"$outputs_dir/final-message.md\" \\".to_string(),
         "      \"Read the file at $prompt_path and follow its instructions exactly. When you finish, make your final response exactly the same text you wrote to $outputs_dir/final-message.md.\" \\".to_string(),
@@ -97,7 +81,7 @@ pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool)
         "    mkdir -p \"$(dirname \"$response_path\")\"".to_string(),
         "    if [ -n \"$model\" ]; then".to_string(),
         format!(
-            "      codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never{hook_trust} {model_flag} \"$model\" --json \\"
+            "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\"
         ),
         "        \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
         "        </dev/null \\".to_string(),
@@ -105,7 +89,7 @@ pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool)
         "        2> \"$response_base.codex-stderr.log\"".to_string(),
         "    else".to_string(),
         format!(
-            "      codex exec --cd <eval-root> --sandbox workspace-write --ask-for-approval never{hook_trust} --json \\"
+            "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust} --json \\"
         ),
         "        \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
         "        </dev/null \\".to_string(),
@@ -117,3 +101,51 @@ pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool)
     ]
     .join("\n")
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{
+        codex_exec_command_template, codex_judge_dispatch_recipe, codex_parallel_dispatch_recipe,
+    };
+
+    #[test]
+    fn exec_template_places_approval_policy_before_exec() {
+        let cmd = codex_exec_command_template(Some("-m"), true, Some("gpt-5-mini"));
+        let first_line = cmd.lines().next().unwrap();
+
+        assert_eq!(
+            first_line,
+            "codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust -m gpt-5-mini --json \\"
+        );
+    }
+
+    #[test]
+    fn parallel_recipe_places_approval_policy_before_exec() {
+        let recipe = codex_parallel_dispatch_recipe(Some("-m"), true, Some("gpt-5-mini"));
+
+        assert!(
+            recipe.contains(
+                "    codex --ask-for-approval never exec --cd \"$eval_root\" --sandbox workspace-write --dangerously-bypass-hook-trust -m gpt-5-mini --json \\"
+            ),
+            "{recipe}"
+        );
+    }
+
+    #[test]
+    fn judge_recipe_places_approval_policy_before_exec() {
+        let recipe = codex_judge_dispatch_recipe(Some("-m"), true);
+
+        assert!(
+            recipe.contains(
+                "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\"
+            ),
+            "{recipe}"
+        );
+        assert!(
+            recipe.contains(
+                "      codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust --json \\"
+            ),
+            "{recipe}"
+        );
+    }
+}
diff --git a/src/adapters/harness.rs b/src/adapters/harness.rs
index 410ef4e..725a737 100644
--- a/src/adapters/harness.rs
+++ b/src/adapters/harness.rs
@@ -15,11 +15,15 @@ use std::time::Duration;
 use crate::core::{AvailableSkill, Harness, ToolInvocation};
 
 use super::TranscriptSummary;
+use super::claude_cli::{
+    claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe,
+};
 use super::codex_cli::{
     codex_exec_command_template, codex_judge_dispatch_recipe, codex_parallel_dispatch_recipe,
 };
 use super::{
-    parse_codex_events, parse_codex_events_full, parse_transcript, parse_transcript_full,
+    parse_claude_stream_json, parse_claude_stream_json_full, parse_codex_events,
+    parse_codex_events_full, parse_transcript, parse_transcript_full,
     render_available_skills_block, render_codex_available_skills_block,
     render_opencode_available_skills_block,
 };
@@ -71,6 +75,17 @@ pub trait HarnessAdapter {
         format!("<system-reminder>\n{trimmed}\n</system-reminder>")
     }
 
+    /// The **interactive** (agent-followed) `RUNBOOK.md` template a harness uses
+    /// under [`InSession`](crate::core::DispatchMechanism::InSession) dispatch,
+    /// carrying `{{TOKEN}}` placeholders the run fills. The default is the shared
+    /// headless template (harmless for the Cli-only harnesses that never read it
+    /// via this path); [`InSession`](crate::core::DispatchMechanism::InSession)
+    /// harnesses override it. The Cli-dispatch runbook always uses
+    /// [`HEADLESS_RUNBOOK_TEMPLATE`], selected by mechanism in `build_runbook`.
+    fn runbook_template(&self) -> &'static str {
+        HEADLESS_RUNBOOK_TEMPLATE
+    }
+
     /// For a [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness, the
     /// filename (under a task's `outputs/` dir) its one-shot CLI writes the
     /// transcript to. `None` when the harness dispatches in-session (no local
@@ -116,17 +131,48 @@ pub trait HarnessAdapter {
     /// deduped token usage, duration, and final message text.
     fn parse_transcript_full(&self, path: &Path) -> io::Result<TranscriptSummary>;
 
+    /// Parse a [`Cli`](crate::core::DispatchMechanism::Cli)-mechanism events file
+    /// (the harness CLI's captured output) into ordered tool invocations. Defaults
+    /// to [`parse_transcript`](Self::parse_transcript): for Codex/OpenCode the
+    /// on-disk parser already *is* the events parser, so the default is correct;
+    /// Claude Code overrides it, because its `parse_transcript` is the in-session
+    /// subagent parser while its Cli events are `claude -p` stream-json.
+    fn parse_cli_events(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {
+        self.parse_transcript(path)
+    }
+
+    /// The full-summary counterpart of [`parse_cli_events`](Self::parse_cli_events).
+    fn parse_cli_events_full(&self, path: &Path) -> io::Result<TranscriptSummary> {
+        self.parse_transcript_full(path)
+    }
+
     /// Arm the write guard using this harness's native pre-tool hook surface,
-    /// returning the staged marker path.
+    /// returning the staged marker path. The guard's allowed roots are derived
+    /// from `stage_root` (the isolated env / agent cwd), so it bounds the agent to
+    /// the same env boundary that isolates its reads.
     fn install_guard(
         &self,
         stage_root: &Path,
-        workspace_root: &Path,
         guard_exe: &Path,
         ttl: Option<Duration>,
     ) -> io::Result<PathBuf>;
+
+    /// The banner printed after `--guard` successfully arms, describing the
+    /// harness's native hook surface and how to remove it. Harness-specific text,
+    /// so it lives here rather than in generic run code. `None` for a harness with
+    /// no write guard (its [`install_guard`](Self::install_guard) errors), in which
+    /// case no banner is printed.
+    fn guard_armed_message(&self) -> Option<&'static str> {
+        None
+    }
 }
 
+/// The shared **headless** (human-followed) `RUNBOOK.md` template used by every
+/// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch run, regardless of
+/// harness (Codex, OpenCode, and Claude Code in hybrid/headless).
+pub const HEADLESS_RUNBOOK_TEMPLATE: &str =
+    include_str!("../../profiles/shared/runbook-headless.md");
+
 pub struct ClaudeCodeAdapter;
 pub struct CodexAdapter;
 pub struct OpenCodeAdapter;
@@ -178,20 +224,70 @@ impl HarnessAdapter for ClaudeCodeAdapter {
     fn plan_mode_profile(&self) -> &'static str {
         include_str!("../../profiles/claude-code/plan-mode.md")
     }
+    fn runbook_template(&self) -> &'static str {
+        include_str!("../../profiles/claude-code/runbook.md")
+    }
+    fn cli_events_filename(&self) -> Option<&'static str> {
+        Some("claude-events.jsonl")
+    }
+    fn cli_model_flag(&self) -> Option<&'static str> {
+        Some("--model")
+    }
+    fn cli_next_steps(&self, ctx: CliDispatchContext<'_>) -> String {
+        format!(
+            "\nNext: iterate the tasks[] array in dispatch.json and dispatch each task (from the env dir — `claude` has no --cd flag) with:\n{}\nThen run `ingest{target_args} --iteration {iteration} --harness claude-code`.",
+            claude_exec_command_template(self.cli_model_flag(), ctx.agent_model),
+            target_args = ctx.target_args,
+            iteration = ctx.iteration
+        )
+    }
+    fn cli_manifest_section(&self, ctx: CliManifestContext<'_>) -> Option<Vec<String>> {
+        Some(vec![
+            "After all dispatches (Claude Code hybrid):".to_string(),
+            String::new(),
+            "Run one fresh `claude -p` per task from the env dir (`cd <eval-root>` — `claude` has no --cd flag). `--output-format stream-json` requires `--verbose`; detach stdin with `</dev/null` so a permission prompt cannot block and piped task data cannot become extra prompt context; capture stdout as `outputs/claude-events.jsonl` and stderr as `outputs/claude-stderr.log`.".to_string(),
+            String::new(),
+            "```bash".to_string(),
+            claude_exec_command_template(self.cli_model_flag(), ctx.agent_model),
+            "```".to_string(),
+            String::new(),
+            "Parallel dispatch from this iteration directory:".to_string(),
+            String::new(),
+            "```bash".to_string(),
+            claude_parallel_dispatch_recipe(self.cli_model_flag(), ctx.agent_model),
+            "```".to_string(),
+            String::new(),
+            "Then run `eval-magic ingest --harness claude-code --run-mode hybrid`; Claude hybrid ingest reads each task's `outputs/claude-events.jsonl`.".to_string(),
+            String::new(),
+        ])
+    }
+    fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option<String> {
+        Some(claude_judge_dispatch_recipe(self.cli_model_flag()))
+    }
     fn parse_transcript(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {
         parse_transcript(path)
     }
     fn parse_transcript_full(&self, path: &Path) -> io::Result<TranscriptSummary> {
         parse_transcript_full(path)
     }
+    fn parse_cli_events(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {
+        parse_claude_stream_json(path)
+    }
+    fn parse_cli_events_full(&self, path: &Path) -> io::Result<TranscriptSummary> {
+        parse_claude_stream_json_full(path)
+    }
     fn install_guard(
         &self,
         stage_root: &Path,
-        workspace_root: &Path,
         guard_exe: &Path,
         ttl: Option<Duration>,
     ) -> io::Result<PathBuf> {
-        crate::sandbox::install::install_claude_guard(stage_root, workspace_root, guard_exe, ttl)
+        crate::sandbox::install::install_claude_guard(stage_root, guard_exe, ttl)
+    }
+    fn guard_armed_message(&self) -> Option<&'static str> {
+        Some(
+            "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n   and will block writes/installs outside the eval sandbox during dispatches —\n   both in-session subagents and `claude -p` (hybrid/headless), which loads the\n   hook from the env cwd each dispatch runs in.\n   It auto-expires in 6h and is removed on the next run; to remove it now:\n     eval-magic teardown-guard",
+        )
     }
 }
 
@@ -238,7 +334,7 @@ impl HarnessAdapter for CodexAdapter {
         Some(vec![
             "After all dispatches (Codex):".to_string(),
             String::new(),
-            "Run one fresh `codex exec --json` per task. Detach stdin with `</dev/null` so piped task data cannot become extra prompt context; capture stdout as `outputs/codex-events.jsonl` and stderr as `outputs/codex-stderr.log`.".to_string(),
+            "Run one fresh `codex --ask-for-approval never exec --json` per task. Detach stdin with `</dev/null` so piped task data cannot become extra prompt context; capture stdout as `outputs/codex-events.jsonl` and stderr as `outputs/codex-stderr.log`.".to_string(),
             String::new(),
             "```bash".to_string(),
             codex_exec_command_template(self.cli_model_flag(), ctx.guard, ctx.agent_model),
@@ -269,11 +365,15 @@ impl HarnessAdapter for CodexAdapter {
     fn install_guard(
         &self,
         stage_root: &Path,
-        workspace_root: &Path,
         guard_exe: &Path,
         ttl: Option<Duration>,
     ) -> io::Result<PathBuf> {
-        crate::sandbox::install::install_codex_guard(stage_root, workspace_root, guard_exe, ttl)
+        crate::sandbox::install::install_codex_guard(stage_root, guard_exe, ttl)
+    }
+    fn guard_armed_message(&self) -> Option<&'static str> {
+        Some(
+            "\n🛡 Write guard armed: a PreToolUse hook is staged in .codex/hooks.json\n   and will block writes/installs outside the eval sandbox during Codex dispatches.\n   Dispatch with codex --ask-for-approval never exec --dangerously-bypass-hook-trust so the vetted eval hook runs.\n   It auto-expires in 6h and is removed on the next run; to remove it now:\n     eval-magic teardown-guard",
+        )
     }
 }
 
@@ -327,7 +427,6 @@ impl HarnessAdapter for OpenCodeAdapter {
     fn install_guard(
         &self,
         _stage_root: &Path,
-        _workspace_root: &Path,
         _guard_exe: &Path,
         _ttl: Option<Duration>,
     ) -> io::Result<PathBuf> {
@@ -392,4 +491,80 @@ mod tests {
             assert_eq!(adapter_for(h).render_plan_mode_context("   "), "");
         }
     }
+
+    #[test]
+    fn claude_adapter_advertises_cli_events_file_and_model_flag() {
+        let a = adapter_for(Harness::ClaudeCode);
+        assert_eq!(a.cli_events_filename(), Some("claude-events.jsonl"));
+        assert_eq!(a.cli_model_flag(), Some("--model"));
+    }
+
+    #[test]
+    fn guard_armed_message_is_harness_specific_and_absent_for_opencode() {
+        // The post-arm `--guard` banner names the harness's native hook surface,
+        // so it lives behind the adapter rather than in generic run code.
+        let claude = adapter_for(Harness::ClaudeCode)
+            .guard_armed_message()
+            .expect("claude code has a write guard");
+        assert!(
+            claude.contains(".claude/settings.local.json"),
+            "claude banner names its hook file: {claude}"
+        );
+
+        let codex = adapter_for(Harness::Codex)
+            .guard_armed_message()
+            .expect("codex has a write guard");
+        assert!(
+            codex.contains(".codex/hooks.json"),
+            "codex banner names its hook file: {codex}"
+        );
+
+        // OpenCode has no write guard (its install_guard errors), so there is no
+        // banner to print.
+        assert_eq!(adapter_for(Harness::OpenCode).guard_armed_message(), None);
+    }
+
+    #[test]
+    fn claude_parse_cli_events_full_reads_stream_json_result_event() {
+        use serde_json::json;
+        let dir = tempfile::TempDir::new().unwrap();
+        let path = dir.path().join("claude-events.jsonl");
+        // No per-line timestamps; the result event is the only source of duration.
+        let lines = [
+            json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [
+                {"type": "tool_use", "id": "toolu_1", "name": "Bash", "input": {"command": "ls"}}
+            ]}}),
+            json!({"type": "result", "subtype": "success", "is_error": false, "result": "Done", "duration_ms": 5637, "usage": {"input_tokens": 1, "output_tokens": 2, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}),
+        ];
+        let body = lines
+            .iter()
+            .map(|l| l.to_string())
+            .collect::<Vec<_>>()
+            .join("\n");
+        std::fs::write(&path, format!("{body}\n")).unwrap();
+
+        let a = adapter_for(Harness::ClaudeCode);
+        let summary = a.parse_cli_events_full(&path).unwrap();
+        assert_eq!(summary.final_text, Some("Done".into()));
+        assert_eq!(summary.duration_ms, Some(5637));
+        assert_eq!(summary.tool_invocations.len(), 1);
+        assert_eq!(summary.tool_invocations[0].name, "Bash");
+
+        // The on-disk parser would find no duration here (no line timestamps),
+        // proving parse_cli_events_full routes to the stream-json parser.
+        assert_eq!(a.parse_transcript_full(&path).unwrap().duration_ms, None);
+    }
+
+    #[test]
+    fn codex_parse_cli_events_delegates_to_events_parser() {
+        use serde_json::json;
+        let dir = tempfile::TempDir::new().unwrap();
+        let path = dir.path().join("codex-events.jsonl");
+        let line = json!({"type": "item.completed", "item": {"id": "i1", "type": "command_execution", "command": "bun test", "output": "ok"}});
+        std::fs::write(&path, format!("{line}\n")).unwrap();
+
+        let inv = adapter_for(Harness::Codex).parse_cli_events(&path).unwrap();
+        assert_eq!(inv.len(), 1);
+        assert_eq!(inv[0].name, "command_execution");
+    }
 }
diff --git a/src/adapters/mod.rs b/src/adapters/mod.rs
index 71a36e9..f48abb1 100644
--- a/src/adapters/mod.rs
+++ b/src/adapters/mod.rs
@@ -6,8 +6,11 @@
 //! submodules, plus plugin-shadow detection. The submodules are re-exported
 //! flat so downstream code writes `crate::adapters::<fn>`.
 
+mod claude_cli;
 pub mod claude_code_session;
 pub mod claude_code_transcript;
+pub mod claude_stream_json;
+mod cli_command;
 mod codex_cli;
 pub mod codex_session;
 pub mod codex_transcript;
@@ -17,7 +20,7 @@ pub mod plugin_shadow;
 
 pub use harness::{
     ClaudeCodeAdapter, CliDispatchContext, CliJudgeContext, CliManifestContext, CodexAdapter,
-    HarnessAdapter, OpenCodeAdapter, adapter_for,
+    HEADLESS_RUNBOOK_TEMPLATE, HarnessAdapter, OpenCodeAdapter, adapter_for,
 };
 
 pub use claude_code_session::{
@@ -28,6 +31,7 @@ pub use claude_code_transcript::{
     SubagentEntry, SubagentMeta, TranscriptSummary, find_by_description, list_subagents,
     parse_transcript, parse_transcript_full,
 };
+pub use claude_stream_json::{parse_claude_stream_json, parse_claude_stream_json_full};
 pub use codex_session::{render_codex_available_skills_block, render_codex_plan_mode_context};
 pub use codex_transcript::{parse_codex_events, parse_codex_events_full};
 pub use opencode_session::{
diff --git a/src/cli/args.rs b/src/cli/args.rs
index 57c7148..0bdb0e7 100644
--- a/src/cli/args.rs
+++ b/src/cli/args.rs
@@ -6,7 +6,7 @@
 
 use clap::{Args, Parser, Subcommand};
 
-use crate::core::Harness;
+use crate::core::{Harness, RunMode};
 
 /// Run skill evals — measure whether an agent skill actually shifts behavior.
 ///
@@ -81,7 +81,21 @@ pub struct CommonArgs {
     /// `--guard` are not yet wired for OpenCode.
     #[arg(long)]
     pub harness: Option<Harness>,
-    /// Workspace directory (defaults to `<cwd>/skills-workspace`).
+    /// Run mode: `interactive` (in-session subagents), `hybrid` (an agent
+    /// orchestrates while each dispatch shells out to the harness CLI), or
+    /// `headless` (CLI-only, no session).
+    ///
+    /// Defaults per harness — Claude Code → `interactive`, Codex/OpenCode →
+    /// `hybrid`. `hybrid`/`headless` dispatch through the harness CLI (`claude -p`,
+    /// `codex exec`) and read each task's `outputs/<harness>-events.jsonl`;
+    /// `interactive` dispatches in-session subagents. Claude Code wires all three
+    /// (`hybrid`/`headless` ride `claude -p` stream-json); Codex wires `hybrid` +
+    /// `headless`; OpenCode wires `hybrid` only. Pass the same value to every command
+    /// of a run (it selects the transcript source at `ingest`); the printed next-step
+    /// commands already carry it.
+    #[arg(long)]
+    pub run_mode: Option<RunMode>,
+    /// Workspace directory (defaults to `<cwd>/.eval-magic`).
     ///
     /// The artifact root. Pass the same value to every command of a run, including
     /// `teardown`.
@@ -192,6 +206,31 @@ pub struct GradeArgs {
     pub finalize: bool,
 }
 
+/// `switch-condition` names the condition about to be dispatched (the one to keep)
+/// on top of the common set.
+#[derive(Debug, Args)]
+pub struct SwitchConditionArgs {
+    #[command(flatten)]
+    pub common: CommonArgs,
+    /// The condition you are about to dispatch next (the one to KEEP). Its
+    /// counterpart's staged skill is removed from `env/.claude/skills/`.
+    #[arg(long)]
+    pub condition: String,
+}
+
+/// `reset-batch` names the isolation group about to be dispatched, on top of the
+/// common set.
+#[derive(Debug, Args)]
+pub struct ResetBatchArgs {
+    #[command(flatten)]
+    pub common: CommonArgs,
+    /// The isolation group you are about to dispatch next. The shared `env/`'s
+    /// working tree is wiped (keeping the staged skills + the outputs tree) and
+    /// re-seeded with this group's fixtures.
+    #[arg(long)]
+    pub group: String,
+}
+
 /// `snapshot` adds a label and an optional git ref on top of the common set.
 #[derive(Debug, Args)]
 pub struct SnapshotArgs {
@@ -268,24 +307,36 @@ pub struct RunArgs {
     /// For harnesses without project-local skill discovery. Forces the LLM-judge
     /// meta-check tier and inlines only SKILL.md (not sibling skills or sibling
     /// asset files); use the staged (default) path when the measured behavior
-    /// depends on sibling files. Also disables `--guard` — the write guard
-    /// requires staging — so no-stage runs are unguarded and rely on
-    /// `detect-stray-writes` after the fact.
+    /// depends on sibling files. The isolated env (`env/`) is still built either
+    /// way — `--no-stage` only skips populating the harness skills dir. Also
+    /// disables `--guard` — the write guard requires staging — so no-stage runs
+    /// are unguarded and rely on `detect-stray-writes` after the fact.
     #[arg(long)]
     pub no_stage: bool,
     /// Arm the write guard (PreToolUse hook) for the dispatch window.
     ///
     /// Stages a harness-native `PreToolUse` hook that *blocks* subagent
-    /// writes/installs outside the eval sandbox while dispatches run. Arm it
-    /// unless the user opts out. The marker auto-expires after 6h and is torn down
-    /// at the next run; while armed the hook fires on your own tool calls too.
-    /// If it remains armed after `finalize`, `finalize` reminds you to run
-    /// `teardown-guard` before editing source. Requires staging — incompatible
-    /// with `--no-stage`, under which guard install is skipped and the run is
-    /// unguarded.
+    /// writes/installs outside the isolated run env (the agent-under-test's cwd)
+    /// while dispatches run. Its allowed roots are the env plus the OS temp dir, so
+    /// the guard boundary matches the same env that isolates the agent's reads.
+    /// Because the harness already cwd-bounds the agent's direct file tools to the
+    /// env, the guard's main remaining value is blocking Bash-subprocess escapes the
+    /// cwd boundary doesn't cover — `npm install`, `git worktree add`, `sed -i`,
+    /// redirects to absolute paths — and acting as a backstop when the isolated
+    /// session runs with relaxed permissions. Arm it unless the user opts out. The
+    /// marker auto-expires after 6h and is torn down at the next run; while armed the
+    /// hook fires on your own tool calls too. If it remains armed after `finalize`,
+    /// `finalize` reminds you to run `teardown` before editing source (which disarms
+    /// the cwd guard and every per-`(group, condition)` Cli env's guard). Requires
+    /// staging — incompatible with `--no-stage`, under which guard install is skipped
+    /// and the run is unguarded.
     /// Codex dispatches must include `--dangerously-bypass-hook-trust` so the
     /// vetted project-local eval hook runs. Unguarded, stray writes are only
     /// *detected* after the fact by `detect-stray-writes`, never blocked.
+    /// Works under Claude Code's CLI run modes (`hybrid`/`headless`) too: the
+    /// `PreToolUse` hook is staged in `env/.claude/settings.local.json`, and each
+    /// `claude -p` dispatch loads it from that cwd (`cd <eval-root>`), enforcing the
+    /// same boundary as an in-session run (the recipe never passes `--bare`).
     /// When invoking this from inside Codex, staging writes `.agents/skills` and
     /// guarded runs also write `.codex/hooks.json`; Codex protects those paths in
     /// its default workspace-write sandbox, so approval/escalation may be needed.
@@ -352,6 +403,9 @@ pub(crate) enum Commands {
     /// Builds the iteration workspace, snapshots the `SKILL.md`, stages skills, and
     /// emits `dispatch.json` (machine-readable) alongside `dispatch-manifest.md`
     /// (human-readable). Your agent then dispatches each task as a fresh subagent.
+    /// Also writes `RUNBOOK.md`, a followable handoff for an isolated run session
+    /// ("Read and follow RUNBOOK.md") — interactive (agent-followed) for Claude
+    /// Code, human-followed for Codex/OpenCode.
     Run(RunArgs),
     /// Snapshot a workspace baseline.
     ///
@@ -384,9 +438,31 @@ pub(crate) enum Commands {
     /// Finalize grading after judge responses are in.
     ///
     /// Fixed-order chain: grade `--finalize` → aggregate. Merges the judge verdicts
-    /// and writes `benchmark.json`. If a live guard remains armed, prints a
-    /// `teardown-guard` reminder before source edits. Requires `--iteration`.
+    /// and writes `benchmark.json`. If a live guard remains armed — the cwd guard, or
+    /// any per-`(group, condition)` Cli env guard — prints a `teardown` reminder before
+    /// source edits. Requires `--iteration`.
     Finalize(CommonArgs),
+    /// Switch the active condition batch in a single-session isolated run.
+    ///
+    /// Removes the *off-condition*'s staged skill from `env/.claude/skills/` so the
+    /// next batch you dispatch cannot read it — the per-condition read-isolation
+    /// barrier for an interactive isolated run (see `RUNBOOK.md`).
+    /// `--condition` names the condition you are about to
+    /// dispatch next (the one to keep); its counterpart's staged skill is removed.
+    /// Run it only after every Task subagent of the prior batch has returned — it is
+    /// a hard barrier. Idempotent; resolves the iteration from `--workspace-dir` so
+    /// it works invoked from `env/`. Requires `--iteration`.
+    SwitchCondition(SwitchConditionArgs),
+    /// Swap the active isolation batch in a single-session isolated run.
+    ///
+    /// Wipes the shared `env/` working tree (keeping `.claude/skills/` and the
+    /// `.eval-magic-outputs/` tree) and re-seeds it with `--group`'s fixtures — the
+    /// per-batch isolation barrier between eval groups in an interactive isolated run
+    /// (see `RUNBOOK.md`). `--group` names the group you are
+    /// about to dispatch next. Run it only after every Task subagent of the prior
+    /// batch has returned — it is a hard barrier. Resolves the iteration from
+    /// `--workspace-dir` so it works invoked from `env/`. Requires `--iteration`.
+    ResetBatch(ResetBatchArgs),
     /// Assemble run records from a dispatch and its transcripts.
     ///
     /// Assembles a schema-valid `run.json` and backfills `timing.json` for every
diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs
index 1f04e62..cbb697f 100644
--- a/src/cli/commands/mod.rs
+++ b/src/cli/commands/mod.rs
@@ -15,7 +15,7 @@ pub(crate) use guard::{run_guard, run_guard_codex, run_teardown_guard};
 pub(crate) use init::run_init;
 pub(crate) use pipeline::{
     run_aggregate, run_detect_stray_writes, run_fill_transcripts, run_finalize, run_grade,
-    run_ingest, run_record_runs,
+    run_ingest, run_record_runs, run_reset_batch, run_switch_condition,
 };
 pub(crate) use run::run_run;
 pub(crate) use validate::run_validate;
diff --git a/src/cli/commands/pipeline.rs b/src/cli/commands/pipeline.rs
index fc49bd1..501ae96 100644
--- a/src/cli/commands/pipeline.rs
+++ b/src/cli/commands/pipeline.rs
@@ -5,11 +5,13 @@
 use anyhow::bail;
 
 use crate::adapters::{CliJudgeContext, adapter_for};
-use crate::cli::args::{CommonArgs, GradeArgs};
+use crate::cli::args::{CommonArgs, GradeArgs, ResetBatchArgs, SwitchConditionArgs};
 use crate::cli::command_target_args;
 use crate::cli::run;
-use crate::cli::{iteration_dir, resolve_iteration, resolve_subagents_dir, run_context_from};
-use crate::core::{DispatchMechanism, RunContext, mechanism_for};
+use crate::cli::{
+    iteration_dir, resolve_iteration, resolve_subagents_dir, run_context_from, staged_env_roots,
+};
+use crate::core::{DispatchMechanism, RunContext};
 use crate::pipeline;
 use crate::sandbox;
 use crate::validation;
@@ -17,7 +19,7 @@ use crate::validation;
 const JUDGE_WORKER_PROMPT: &str = "Read the file at <dispatch_prompt_path> and follow it exactly. You are a judge worker only: write the JSON verdict to <response_path>, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.";
 
 fn judge_dispatch_guidance(ctx: &RunContext) -> String {
-    match mechanism_for(ctx.harness) {
+    match ctx.run_mode.mechanism() {
         DispatchMechanism::InSession => {
             format!("Dispatch each task as a judge subagent with:\n  {JUDGE_WORKER_PROMPT}")
         }
@@ -45,6 +47,7 @@ fn run_step(step: &run::steps::StepCommand) -> anyhow::Result<()> {
         iteration: Some(step.iteration),
         mode: None,
         harness: Some(step.harness),
+        run_mode: Some(step.run_mode),
         workspace_dir: step.workspace_dir.clone(),
         // The chain carries the already-resolved absolute subagents dir, so the
         // session id is no longer needed downstream.
@@ -73,7 +76,7 @@ pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> {
     let ctx = run_context_from(&args)?;
     let iteration = resolve_iteration(&ctx, args.iteration)?;
     let resolved = resolve_subagents_dir(
-        ctx.harness,
+        ctx.run_mode.mechanism(),
         args.subagents_dir.as_deref(),
         args.session_id.as_deref(),
     )?;
@@ -84,6 +87,7 @@ pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> {
         skill: args.skill.as_deref(),
         iteration,
         harness: ctx.harness,
+        run_mode: ctx.run_mode,
         subagents_dir: resolved.as_deref(),
         workspace_dir: args.workspace_dir.as_deref(),
     });
@@ -128,6 +132,7 @@ pub(crate) fn run_finalize(args: CommonArgs) -> anyhow::Result<()> {
         skill: args.skill.as_deref(),
         iteration,
         harness: ctx.harness,
+        run_mode: ctx.run_mode,
         subagents_dir: None,
         workspace_dir: args.workspace_dir.as_deref(),
     });
@@ -138,9 +143,194 @@ pub(crate) fn run_finalize(args: CommonArgs) -> anyhow::Result<()> {
     println!(
         "\n✅ Finalize complete. Read the benchmark above, then tear down: eval-magic teardown{target_args}"
     );
-    if sandbox::guard_is_armed(&ctx.stage_root) {
-        println!("⚠ Guard still armed — run `eval-magic teardown-guard` before editing source.");
+    // Warn if a guard is still armed. The cwd check covers the in-session flow (run
+    // from inside `env/`); under Cli there is one env per (group, condition), so also
+    // walk each per-env marker. `teardown` (not the cwd-only `teardown-guard`) is what
+    // disarms them all.
+    let mut armed = sandbox::guard_is_armed(&ctx.stage_root);
+    if !armed
+        && ctx.run_mode.mechanism() == DispatchMechanism::Cli
+        && let Ok(dir) = iteration_dir(&ctx, Some(iteration))
+    {
+        armed = staged_env_roots(&dir)
+            .iter()
+            .any(|env| sandbox::guard_is_armed(env));
     }
+    if armed {
+        println!(
+            "⚠ Guard still armed — run `eval-magic teardown` to disarm before editing source."
+        );
+    }
+    Ok(())
+}
+
+/// Switch the active condition batch in a single-session isolated run: remove the
+/// *off-condition*'s staged skill from `env/.claude/skills/` so the next batch the
+/// session dispatches cannot read it. `--condition` names the condition about to be
+/// dispatched (the one to keep); its counterpart is removed. Idempotent, and a hard
+/// barrier — the runbook instructs the operator to join every Task subagent of the
+/// prior batch first. Resolves the iteration from `--workspace-dir`, so it runs from
+/// `cwd = env/`. The guard marker is a sibling file of the slug subtree, so removing
+/// the slug dir leaves it (and an armed guard) intact.
+pub(crate) fn run_switch_condition(args: SwitchConditionArgs) -> anyhow::Result<()> {
+    let ctx = run_context_from(&args.common)?;
+    let dir = iteration_dir(&ctx, args.common.iteration)?;
+
+    let conditions_path = dir.join("conditions.json");
+    if !conditions_path.exists() {
+        bail!("missing: {}", conditions_path.display());
+    }
+    let conditions: crate::core::ConditionsRecord =
+        serde_json::from_str(&std::fs::read_to_string(&conditions_path)?)?;
+
+    // `--condition` names the arm to KEEP; its counterpart is the off-condition to
+    // remove. Validate against the recorded conditions so a typo fails loudly
+    // instead of silently no-opping.
+    let names: Vec<&str> = conditions
+        .conditions
+        .iter()
+        .map(|c| c.name.as_str())
+        .collect();
+    if !names.contains(&args.condition.as_str()) {
+        bail!(
+            "unknown --condition '{}'; this iteration's conditions are: {}",
+            args.condition,
+            names.join(", ")
+        );
+    }
+    let off = conditions
+        .conditions
+        .iter()
+        .find(|c| c.name != args.condition)
+        .ok_or_else(|| anyhow::anyhow!("no off-condition to switch away from"))?;
+
+    let skills_dir = run::staging::skills_dir_for_harness(&dir.join("env"), ctx.harness);
+    match off.staged_skill_slug.as_ref() {
+        // The off-condition staged a skill: remove exactly its slug subtree. We do
+        // NOT use `cleanup_staged_skills` (it prefix-scans and would remove both
+        // arms' slugs and prune the dir) — only this one slug must go.
+        Some(Some(slug)) => {
+            let slug_dir = skills_dir.join(slug);
+            if slug_dir.exists() {
+                std::fs::remove_dir_all(&slug_dir)?;
+                println!(
+                    "Switched to '{}': removed off-condition '{}' staged skill ({}).",
+                    args.condition,
+                    off.name,
+                    slug_dir.display()
+                );
+            } else {
+                println!(
+                    "Switched to '{}': off-condition '{}' staged skill already absent — nothing to do.",
+                    args.condition, off.name
+                );
+            }
+        }
+        // The off-condition never staged a skill (e.g. the new-skill control arm),
+        // so there is nothing to hide.
+        _ => println!(
+            "Switched to '{}': off-condition '{}' has no staged skill — nothing to remove.",
+            args.condition, off.name
+        ),
+    }
+    Ok(())
+}
+
+/// Swap the active isolation batch in a single-session (in-session) isolated run:
+/// wipe the shared `env/` working tree — keeping the staged skills, `RUNBOOK.md`, and
+/// the `.eval-magic-outputs/` tree — and re-seed it with `--group`'s fixtures, so the
+/// next batch starts from a clean tree the prior batch's fixtures and stray writes
+/// can't taint. A hard barrier: the runbook joins every Task subagent of the prior
+/// batch first. Resolves the iteration from `--workspace-dir`, so it runs from
+/// `cwd = env/`.
+pub(crate) fn run_reset_batch(args: ResetBatchArgs) -> anyhow::Result<()> {
+    let ctx = run_context_from(&args.common)?;
+    let dir = iteration_dir(&ctx, args.common.iteration)?;
+    let env_dir = dir.join("env");
+    if !env_dir.exists() {
+        bail!("missing env dir: {}", env_dir.display());
+    }
+
+    let dispatch_path = dir.join("dispatch.json");
+    if !dispatch_path.exists() {
+        bail!("missing: {}", dispatch_path.display());
+    }
+    let dispatch: serde_json::Value =
+        serde_json::from_str(&std::fs::read_to_string(&dispatch_path)?)?;
+    let tasks = dispatch["tasks"].as_array().cloned().unwrap_or_default();
+
+    // Groups are tagged on tasks only when there is more than one. Validate against
+    // them so a typo (or a needless reset on a single-group run) fails loudly.
+    let group_ids: std::collections::BTreeSet<&str> =
+        tasks.iter().filter_map(|t| t["group"].as_str()).collect();
+    if !group_ids.contains(args.group.as_str()) {
+        if group_ids.is_empty() {
+            bail!(
+                "unknown --group '{}'; this iteration has a single group, so reset-batch is not needed.",
+                args.group
+            );
+        }
+        bail!(
+            "unknown --group '{}'; this iteration's groups are: {}",
+            args.group,
+            group_ids.into_iter().collect::<Vec<_>>().join(", ")
+        );
+    }
+
+    // The group's declared, env-relative fixture dests (deduped across its tasks).
+    let mut dests: Vec<String> = Vec::new();
+    for t in &tasks {
+        if t["group"].as_str() != Some(args.group.as_str()) {
+            continue;
+        }
+        if let Some(fixtures) = t["fixtures"].as_array() {
+            for f in fixtures.iter().filter_map(|f| f.as_str()) {
+                if !dests.iter().any(|d| d == f) {
+                    dests.push(f.to_string());
+                }
+            }
+        }
+    }
+
+    // Full wipe: drop every entry in env/ except the harness dot-dirs (which hold
+    // the staged skills), the outputs tree, and the runbook — so a prior batch's
+    // fixtures and any stray writes can't leak into this one.
+    const KEEP: &[&str] = &[
+        ".claude",
+        ".agents",
+        ".codex",
+        ".opencode",
+        ".eval-magic-outputs",
+        "RUNBOOK.md",
+    ];
+    for entry in std::fs::read_dir(&env_dir)? {
+        let entry = entry?;
+        if KEEP.iter().any(|k| entry.file_name() == **k) {
+            continue;
+        }
+        let path = entry.path();
+        if path.is_dir() {
+            std::fs::remove_dir_all(&path)?;
+        } else {
+            std::fs::remove_file(&path)?;
+        }
+    }
+
+    // Re-seed this group's fixtures from the skill's evals/ dir.
+    for dest in &dests {
+        let src = ctx.skill_subdir.join("evals").join(dest);
+        let dst = env_dir.join(dest);
+        if let Some(parent) = dst.parent() {
+            std::fs::create_dir_all(parent)?;
+        }
+        run::copy_entry(&src, &dst)?;
+    }
+
+    println!(
+        "Reset to group '{}': wiped the env working tree and re-seeded {} fixture(s).",
+        args.group,
+        dests.len()
+    );
     Ok(())
 }
 
@@ -148,15 +338,17 @@ pub(crate) fn run_finalize(args: CommonArgs) -> anyhow::Result<()> {
 /// `dispatch.json`.
 pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> {
     let ctx = run_context_from(&args)?;
+    let mechanism = ctx.run_mode.mechanism();
     let resolved = resolve_subagents_dir(
-        ctx.harness,
+        mechanism,
         args.subagents_dir.as_deref(),
         args.session_id.as_deref(),
     )?;
     let subagents_dir = resolved.as_deref();
 
     let dir = iteration_dir(&ctx, args.iteration)?;
-    let result = pipeline::record_runs(&dir, ctx.harness, subagents_dir, args.overwrite)?;
+    let result =
+        pipeline::record_runs(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?;
 
     println!(
         "\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, missing transcript: {}",
@@ -165,7 +357,7 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> {
         result.skipped_no_final_message,
         result.missing_transcript
     );
-    if let Some(warning) = result.transcript_warning(ctx.harness) {
+    if let Some(warning) = result.transcript_warning(ctx.harness, mechanism) {
         eprintln!("{warning}");
     }
     Ok(())
@@ -175,15 +367,17 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> {
 /// the iteration.
 pub(crate) fn run_fill_transcripts(args: CommonArgs) -> anyhow::Result<()> {
     let ctx = run_context_from(&args)?;
+    let mechanism = ctx.run_mode.mechanism();
     let resolved = resolve_subagents_dir(
-        ctx.harness,
+        mechanism,
         args.subagents_dir.as_deref(),
         args.session_id.as_deref(),
     )?;
     let subagents_dir = resolved.as_deref();
 
     let dir = iteration_dir(&ctx, args.iteration)?;
-    let result = pipeline::fill_transcripts(&dir, ctx.harness, subagents_dir, args.overwrite)?;
+    let result =
+        pipeline::fill_transcripts(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?;
 
     println!(
         "\nFilled: {}, skipped (already populated): {}, missing transcript: {}",
diff --git a/src/cli/commands/workspace.rs b/src/cli/commands/workspace.rs
index 4ceb853..2905485 100644
--- a/src/cli/commands/workspace.rs
+++ b/src/cli/commands/workspace.rs
@@ -5,7 +5,10 @@ use std::path::Path;
 
 use crate::cli::args::{CommonArgs, PromoteBaselineArgs, SnapshotArgs};
 use crate::cli::run;
-use crate::cli::{command_target_args, resolve_iteration, run_context_from};
+use crate::cli::{
+    command_target_args, iteration_dir, resolve_iteration, run_context_from, staged_env_roots,
+};
+use crate::core::DispatchMechanism;
 use crate::sandbox;
 use crate::workspace;
 
@@ -88,8 +91,19 @@ pub(crate) fn run_promote_baseline(args: PromoteBaselineArgs) -> anyhow::Result<
 /// any iteration with uncommitted results.
 pub(crate) fn run_teardown(args: CommonArgs) -> anyhow::Result<()> {
     let ctx = run_context_from(&args)?;
-    // The guard lives at `<cwd>/.claude` (cwd-only, matching `teardown-guard`).
-    let torn = sandbox::teardown_guard(&std::env::current_dir()?);
+    // Disarm the guard at the invocation cwd — the in-session flow runs teardown from
+    // inside `env/`. Under Cli there is one env per (group, condition) and the human
+    // runs teardown from the iteration dir, so additionally walk each per-env marker
+    // (before `cleanup_workspace` reclaims the tree). Best-effort: a missing iteration
+    // just skips the walk; `teardown_guard` is a no-op without a marker.
+    let mut torn = sandbox::teardown_guard(&std::env::current_dir()?);
+    if ctx.run_mode.mechanism() == DispatchMechanism::Cli
+        && let Ok(dir) = iteration_dir(&ctx, args.iteration)
+    {
+        for env in staged_env_roots(&dir) {
+            torn |= sandbox::teardown_guard(&env);
+        }
+    }
     run::staging::cleanup_staged_skills(&ctx.stage_root, ctx.harness)?;
     let ws = workspace::cleanup_workspace(&ctx.workspace_root, &ctx.skill_name);
 
@@ -120,9 +134,7 @@ pub(crate) fn run_teardown(args: CommonArgs) -> anyhow::Result<()> {
         eprintln!(
             "⚠ Kept {} workspace iteration(s) with results not yet committed:\n{lines}\n   Commit them, e.g.:\n     eval-magic promote-baseline{target_args} --iteration <N>\n   or delete {}/ manually to discard.",
             ws.kept_iterations.len(),
-            Path::new("skills-workspace")
-                .join(&ctx.skill_name)
-                .display()
+            Path::new(".eval-magic").join(&ctx.skill_name).display()
         );
     }
     Ok(())
diff --git a/src/cli/help.rs b/src/cli/help.rs
index 0e69bff..c97afab 100644
--- a/src/cli/help.rs
+++ b/src/cli/help.rs
@@ -13,13 +13,16 @@ EXAMPLES:
 
   # Mode A — evaluate a new skill (with vs. without)
   eval-magic run --guard
-  # …dispatch each task in dispatch.json as a fresh subagent…
-  eval-magic ingest   # auto-resolves --subagents-dir from CLAUDE_CODE_SESSION_ID
-                      # (override: --session-id <id> or --subagents-dir <path>)
-  # …dispatch each judge task ingest listed…
-  eval-magic finalize
-  eval-magic promote-baseline   # optional
-  eval-magic teardown
+  # run builds the isolated env/ + RUNBOOK.md, then prints a handoff:
+  #   cd into env/, start a fresh session, say \"Read and follow RUNBOOK.md\".
+  # The fresh session walks the whole loop below from inside env/:
+  #   …dispatch each task in dispatch.json as a fresh subagent…
+  #   eval-magic ingest      # auto-resolves --subagents-dir from CLAUDE_CODE_SESSION_ID
+  #                          # (override: --session-id <id> or --subagents-dir <path>)
+  #   …dispatch each judge task ingest listed…
+  #   eval-magic finalize
+  #   eval-magic teardown
+  eval-magic promote-baseline   # optional, from the prep session once benchmark.json lands
 
   # Mode B — evaluate a language change (edit-first)
   eval-magic snapshot --ref HEAD
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index daa9d4d..a83aca7 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -6,13 +6,13 @@
 //! [`commands`], grouped by concern. This module is the thin coordinator: parse,
 //! dispatch, and the shared context/iteration helpers the handlers reuse.
 
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use anyhow::{anyhow, bail};
 use clap::Parser;
 
 use crate::adapters::{config_dir_from_env, resolve_subagents_dir_for_session};
-use crate::core::{DetectInput, Harness, RunContext, detect_run_context};
+use crate::core::{DetectInput, DispatchMechanism, RunContext, detect_run_context};
 
 mod args;
 mod commands;
@@ -37,6 +37,7 @@ fn dispatch(command: Option<Commands>) -> anyhow::Result<()> {
             iteration: None,
             mode: None,
             harness: None,
+            run_mode: None,
             workspace_dir: None,
             subagents_dir: None,
             session_id: None,
@@ -61,6 +62,8 @@ fn dispatch(command: Option<Commands>) -> anyhow::Result<()> {
         Commands::Run(args) => run_run(args),
         Commands::Ingest(args) => run_ingest(args),
         Commands::Finalize(args) => run_finalize(args),
+        Commands::SwitchCondition(args) => run_switch_condition(args),
+        Commands::ResetBatch(args) => run_reset_batch(args),
         Commands::Init(args) => run_init(args),
         Commands::Validate(args) => run_validate(args),
         Commands::TeardownGuard(_) => run_teardown_guard(),
@@ -95,6 +98,7 @@ pub(crate) fn run_context_with_bootstrap(
         bootstrap,
         workspace_dir: args.workspace_dir.clone(),
         harness: args.harness,
+        run_mode: args.run_mode,
         cwd: None,
     })?)
 }
@@ -111,14 +115,20 @@ pub(crate) fn parse_id_list(v: Option<&str>) -> Option<Vec<String>> {
 
 /// Render a fully self-sufficient target selector for the current run context.
 ///
-/// Always names both `--skill-dir` and `--skill` (both are always populated in
-/// [`RunContext`] and always re-resolve), so the printed "Next:" commands are
-/// copy-pasteable from any cwd — not just the one `run` happened to start in.
+/// Always names `--skill-dir`, `--skill`, `--workspace-dir`, and `--run-mode` (all
+/// four are always populated in [`RunContext`] and always re-resolve), so the printed
+/// "Next:" commands are copy-pasteable from any cwd — not just the one `run`
+/// happened to start in. The absolute `--workspace-dir` is what lets the isolated
+/// session run `ingest`/`finalize`/`switch-condition` from `cwd = iteration-N/env/`:
+/// without it, `workspace_root` would default to `<cwd>/.eval-magic`
+/// (`detect_run_context`) and the iteration tree above the env would not resolve.
 pub(crate) fn command_target_args(ctx: &RunContext) -> String {
     format!(
-        " --skill-dir {} --skill {}",
+        " --skill-dir {} --skill {} --workspace-dir {} --run-mode {}",
         ctx.skill_dir.display(),
-        ctx.skill_name
+        ctx.skill_name,
+        ctx.workspace_root.display(),
+        ctx.run_mode.as_str()
     )
 }
 
@@ -168,20 +178,47 @@ pub(crate) fn iteration_dir(ctx: &RunContext, iteration: Option<u32>) -> anyhow:
     Ok(dir)
 }
 
-/// Resolve the subagents transcript dir for a Claude Code stage that reads
-/// transcripts. Precedence: an explicit `--subagents-dir` (validated to exist)
-/// wins; otherwise resolve from a session id — the `--session-id` flag if given,
-/// else the `CLAUDE_CODE_SESSION_ID` env var Claude Code sets in the
+/// The env directories a run staged under `iteration_dir`: the single `env/` for
+/// the InSession mechanism, or one `env-<group>-<condition>/` per `(group, condition)`
+/// for Cli. A best-effort directory scan (returns empty when the dir can't be read),
+/// used by `teardown`/`finalize` to walk every env's write guard. Preferred over
+/// reading `dispatch.json` because it has no parse-failure mode, needs no path
+/// re-basing (recorded env dirs can be relative), and the only `env`/`env-*` children
+/// of an iteration dir are the staged envs.
+pub(crate) fn staged_env_roots(iteration_dir: &Path) -> Vec<PathBuf> {
+    let Ok(entries) = std::fs::read_dir(iteration_dir) else {
+        return Vec::new();
+    };
+    entries
+        .flatten()
+        .filter(|e| e.path().is_dir())
+        .filter(|e| {
+            let name = e.file_name();
+            let name = name.to_string_lossy();
+            name == "env" || name.starts_with("env-")
+        })
+        .map(|e| e.path())
+        .collect()
+}
+
+/// Resolve the subagents transcript dir for an in-session stage that reads
+/// transcripts. The subagents dir is the `InSession` transcript source, so this
+/// is keyed on the dispatch *mechanism*, not the harness: `Cli`-mechanism runs
+/// (Codex; Claude Code hybrid/headless) read each task's `outputs/<events>.jsonl`
+/// and resolve to `None` — they must never bail on a missing
+/// `CLAUDE_CODE_SESSION_ID`. For the `InSession` mechanism (Claude Code
+/// interactive), precedence is: an explicit `--subagents-dir` (validated to
+/// exist) wins; otherwise resolve from a session id — the `--session-id` flag if
+/// given, else the `CLAUDE_CODE_SESSION_ID` env var Claude Code sets in the
 /// orchestrating agent's shell — locating
 /// `<config>/projects/<cwd-slug>/<session-id>/subagents/` (scanning `projects/*`
-/// if the cwd slug differs). Codex/OpenCode read `outputs/codex-events.jsonl`,
-/// so they resolve to `None`.
+/// if the cwd slug differs).
 pub(crate) fn resolve_subagents_dir(
-    harness: Harness,
+    mechanism: DispatchMechanism,
     subagents_dir: Option<&str>,
     session_id: Option<&str>,
 ) -> anyhow::Result<Option<PathBuf>> {
-    if harness != Harness::ClaudeCode {
+    if mechanism != DispatchMechanism::InSession {
         return Ok(None);
     }
     if let Some(dir) = subagents_dir {
@@ -275,25 +312,82 @@ mod tests {
         assert_eq!(resolved.skill_subdir, ctx.skill_subdir);
     }
 
+    /// The isolated session runs `ingest`/`finalize`/`switch-condition` from
+    /// `cwd = iteration-N/env/`. Without an explicit workspace root those commands
+    /// default `workspace_root` to `<cwd>/.eval-magic` and bail "not found",
+    /// so the selector must carry an absolute `--workspace-dir` pointing at the
+    /// real workspace above the env.
+    #[test]
+    fn target_args_carry_absolute_workspace_dir() {
+        let tmp = TempDir::new().unwrap();
+        let root = fs::canonicalize(tmp.path()).unwrap();
+        let skill_subdir = make_skill(&root, "skills", "mr-review");
+
+        let ctx = detect_run_context(DetectInput {
+            cwd: Some(skill_subdir),
+            ..Default::default()
+        })
+        .unwrap();
+
+        let args = command_target_args(&ctx);
+        assert!(
+            args.contains(&format!("--workspace-dir {}", ctx.workspace_root.display())),
+            "selector names absolute --workspace-dir: {args}"
+        );
+        assert!(
+            ctx.workspace_root.is_absolute(),
+            "workspace_root is absolute: {}",
+            ctx.workspace_root.display()
+        );
+
+        // Round-trip from an env-like cwd below the workspace: feeding the
+        // selector's roots back resolves the SAME workspace, not
+        // `<cwd>/.eval-magic`.
+        let env_like = ctx
+            .workspace_root
+            .join("mr-review")
+            .join("iteration-1")
+            .join("env");
+        fs::create_dir_all(&env_like).unwrap();
+        let resolved = detect_run_context(DetectInput {
+            skill_dir: Some(ctx.skill_dir.display().to_string()),
+            skill: Some(ctx.skill_name.clone()),
+            workspace_dir: Some(ctx.workspace_root.display().to_string()),
+            cwd: Some(env_like),
+            ..Default::default()
+        })
+        .unwrap();
+        assert_eq!(resolved.workspace_root, ctx.workspace_root);
+    }
+
     #[test]
-    fn resolve_subagents_dir_is_none_for_non_claude_harness() {
-        // Codex/OpenCode never read a subagents dir, so resolution is a no-op
-        // even when a dir is passed.
+    fn resolve_subagents_dir_is_none_for_cli_mechanism() {
+        // The subagents dir is the InSession transcript source. Cli-mechanism
+        // runs (Codex; Claude Code hybrid/headless) read each task's events file,
+        // so resolution is a no-op — and must NOT bail on a missing
+        // CLAUDE_CODE_SESSION_ID. This is the regression: the old harness-keyed
+        // gate forced session resolution for Claude Code and aborted under
+        // hybrid/headless. The Cli arm returns before reading any env var, so this
+        // is deterministic regardless of the test runner's environment.
         assert_eq!(
-            resolve_subagents_dir(Harness::Codex, Some("/whatever"), None).unwrap(),
+            resolve_subagents_dir(DispatchMechanism::Cli, None, None).unwrap(),
             None
         );
+        // A passed --subagents-dir is ignored in Cli mode (the events file is the
+        // source), so it resolves to None without touching the filesystem.
         assert_eq!(
-            resolve_subagents_dir(Harness::OpenCode, None, None).unwrap(),
+            resolve_subagents_dir(DispatchMechanism::Cli, Some("/whatever"), None).unwrap(),
             None
         );
     }
 
     #[test]
     fn resolve_subagents_dir_uses_existing_explicit_dir() {
+        // InSession (Claude Code interactive): an explicit, existing
+        // --subagents-dir wins over any session-id resolution.
         let tmp = TempDir::new().unwrap();
         let resolved = resolve_subagents_dir(
-            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
             Some(&tmp.path().display().to_string()),
             None,
         )
@@ -303,8 +397,10 @@ mod tests {
 
     #[test]
     fn resolve_subagents_dir_errors_when_explicit_dir_missing() {
+        // InSession with an explicit --subagents-dir that doesn't exist is a hard
+        // error (not a silent fallback to session-id resolution).
         let err = resolve_subagents_dir(
-            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
             Some("/no/such/subagents/dir/xyz"),
             None,
         )
diff --git a/src/cli/run/dispatch.rs b/src/cli/run/dispatch.rs
index 43b70b3..87ddf94 100644
--- a/src/cli/run/dispatch.rs
+++ b/src/cli/run/dispatch.rs
@@ -7,7 +7,6 @@
 //! available-skills block, an optional plan-mode `<system-reminder>`, then the
 //! eval task framing.
 
-use std::ffi::OsStr;
 use std::fs;
 use std::path::Path;
 
@@ -15,9 +14,9 @@ use regex::Regex;
 use serde::{Deserialize, Serialize};
 
 use crate::adapters::{CliManifestContext, adapter_for};
-use crate::core::{AvailableSkill, Eval, Harness};
+use crate::core::{AvailableSkill, DispatchMechanism, Eval, Harness};
 
-use super::{RunError, copy_dir_recursive};
+use super::RunError;
 
 /// One dispatchable task: the metadata the orchestrator persists per
 /// `(eval, condition)`. `dispatch_prompt` is held in memory (for manifest
@@ -39,6 +38,16 @@ pub struct DispatchTask {
     pub timing_path: String,
     pub agent_description: String,
     pub dispatch_prompt_path: String,
+    /// Group id this task belongs to; absent when there is exactly one group
+    /// (the common no-conflict case), keeping single-group `dispatch.json`
+    /// byte-identical to the pre-grouping shape.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub group: Option<String>,
+    /// The agent-under-test's cwd for this task (its env dir). Absent in the
+    /// single-group case, where the Cli recipe's `<eval-root>` placeholder still
+    /// resolves to `env/`; present (per `(group, condition)`) for multi-group Cli.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub eval_root: Option<String>,
     #[serde(default, skip_serializing)]
     pub dispatch_prompt: String,
 }
@@ -69,6 +78,12 @@ pub struct DispatchTaskOpts<'a> {
     /// 1-based run index within a multi-run cell (adds an `r<k>` segment to the
     /// dispatch description); `None` for single-run cells.
     pub run_index: Option<u32>,
+    /// Isolation-group id this task belongs to; `None` in the single-group case
+    /// (keeps the serialized task byte-identical to the pre-grouping shape).
+    pub group: Option<&'a str>,
+    /// The task's env dir (the agent-under-test's cwd); `None` in the single-group
+    /// case (the shared `env/`).
+    pub eval_root: Option<&'a str>,
 }
 
 impl Default for DispatchTaskOpts<'_> {
@@ -90,6 +105,8 @@ impl Default for DispatchTaskOpts<'_> {
             harness: Harness::ClaudeCode,
             run_tag: None,
             run_index: None,
+            group: None,
+            eval_root: None,
         }
     }
 }
@@ -257,6 +274,8 @@ pub fn build_dispatch_task(opts: &DispatchTaskOpts) -> Result<DispatchTask, RunE
             .join("dispatch-prompt.txt")
             .to_string_lossy()
             .into_owned(),
+        group: opts.group.map(str::to_string),
+        eval_root: opts.eval_root.map(str::to_string),
         dispatch_prompt: sections.join(""),
     })
 }
@@ -334,39 +353,6 @@ pub fn redact_skill_from_bootstrap(content: &str, skill_name: &str) -> String {
     out.join("\n")
 }
 
-/// Copy an eval's fixture files into `<cond_dir>/inputs/`, returning the copied
-/// paths.
-pub fn copy_fixtures(
-    ev: &Eval,
-    skill_dir: &Path,
-    cond_dir: &Path,
-) -> Result<Vec<String>, RunError> {
-    let Some(files) = ev.files.as_ref().filter(|f| !f.is_empty()) else {
-        return Ok(Vec::new());
-    };
-    let inputs_dir = cond_dir.join("inputs");
-    fs::create_dir_all(&inputs_dir)?;
-    let mut copied = Vec::new();
-    for f in files {
-        let src = skill_dir.join("evals").join(f);
-        if !src.exists() {
-            return Err(RunError::msg(format!(
-                "fixture not found: {}",
-                src.display()
-            )));
-        }
-        let base = Path::new(f).file_name().unwrap_or(OsStr::new(f));
-        let dst = inputs_dir.join(base);
-        if src.is_dir() {
-            copy_dir_recursive(&src, &dst)?;
-        } else {
-            fs::copy(&src, &dst)?;
-        }
-        copied.push(dst.to_string_lossy().into_owned());
-    }
-    Ok(copied)
-}
-
 /// Read the `description:` frontmatter value (unquoted) from a skill's
 /// `SKILL.md`, falling back to a placeholder.
 pub fn get_skill_description(skill_path: &Path) -> String {
@@ -395,6 +381,7 @@ pub use crate::core::Mode;
 #[derive(Debug, Clone, Copy)]
 pub struct ManifestContext<'a> {
     pub harness: Harness,
+    pub mechanism: DispatchMechanism,
     pub guard: bool,
     pub agent_model: Option<&'a str>,
 }
@@ -431,10 +418,14 @@ pub fn build_manifest(
         "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>[:r<k>]:i<N>-<nonce>` (the `r<k>` segment appears only in multi-run cells, naming the 1-based run index). When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition, run)` slot without collisions.".to_string(),
         String::new(),
     ];
-    if let Some(lines) = adapter_for(context.harness).cli_manifest_section(CliManifestContext {
-        guard: context.guard,
-        agent_model: context.agent_model,
-    }) {
+    // Only a Cli-dispatch run emits a CLI recipe section; an in-session run
+    // (e.g. interactive Claude Code) gets the generic ingest guidance below.
+    if context.mechanism == DispatchMechanism::Cli
+        && let Some(lines) = adapter_for(context.harness).cli_manifest_section(CliManifestContext {
+            guard: context.guard,
+            agent_model: context.agent_model,
+        })
+    {
         header.extend(lines);
     }
     header.extend([
@@ -490,6 +481,7 @@ mod tests {
                 assertions: None,
                 skill_should_trigger: None,
                 runs: None,
+                isolation: None,
             })
             .collect()
     }
@@ -659,6 +651,35 @@ mod tests {
         assert!(list_idx > ssc_end);
     }
 
+    #[test]
+    fn task_carries_group_and_eval_root_when_set_and_omits_when_absent() {
+        let with = build_dispatch_task(&DispatchTaskOpts {
+            group: Some("g2"),
+            eval_root: Some("/work/env-g2-with_skill"),
+            ..base_opts()
+        })
+        .unwrap();
+        assert_eq!(with.group.as_deref(), Some("g2"));
+        assert_eq!(with.eval_root.as_deref(), Some("/work/env-g2-with_skill"));
+        let out = serde_json::to_value(&with).unwrap();
+        assert_eq!(
+            out.get("group"),
+            Some(&serde_json::Value::String("g2".into()))
+        );
+        assert_eq!(
+            out.get("eval_root"),
+            Some(&serde_json::Value::String("/work/env-g2-with_skill".into()))
+        );
+
+        // Single-group default: both omitted, keeping dispatch.json byte-identical.
+        let without = build_dispatch_task(&base_opts()).unwrap();
+        assert_eq!(without.group, None);
+        assert_eq!(without.eval_root, None);
+        let out = serde_json::to_value(&without).unwrap();
+        assert!(out.get("group").is_none());
+        assert!(out.get("eval_root").is_none());
+    }
+
     #[test]
     fn dispatch_prompt_path_under_cond_dir() {
         let task = build_dispatch_task(&base_opts()).unwrap();
diff --git a/src/cli/run/fixtures.rs b/src/cli/run/fixtures.rs
new file mode 100644
index 0000000..e26de63
--- /dev/null
+++ b/src/cli/run/fixtures.rs
@@ -0,0 +1,255 @@
+//! Copy an eval's fixtures into the isolated env (`iteration-N/env/`), laid out
+//! like a real repo so the agent-under-test reads them at natural project-relative
+//! paths. One shared env hosts every eval's fixtures, so [`FixtureClaims`] dedups
+//! idempotent re-declarations and rejects cross-eval clobbers.
+
+use std::fs;
+use std::path::Path;
+
+use crate::core::Eval;
+
+use super::{RunError, copy_entry};
+
+/// Cross-eval claims on env-relative fixture destinations: `dest → (eval_id, source)`.
+/// One shared `env/` hosts every eval's fixtures, so two evals targeting the same path
+/// from *different* sources is an ambiguous, order-dependent clobber — [`claim_fixture_dest`]
+/// rejects it. Same source is an idempotent re-declaration (the common shared-fixture case).
+pub type FixtureClaims = std::collections::HashMap<String, (String, String)>;
+
+/// Record that `eval_id` provides the fixture at env-relative `dest` from `source`.
+/// Returns `Ok(true)` when the dest was already claimed from the same source (idempotent
+/// share — skip the re-copy), `Ok(false)` on the first claim, and `Err` when a *different*
+/// source already claimed the same dest (an order-dependent cross-eval clobber).
+fn claim_fixture_dest(
+    claims: &mut FixtureClaims,
+    eval_id: &str,
+    dest: &str,
+    source: &str,
+) -> Result<bool, RunError> {
+    if let Some((prev_eval, prev_source)) = claims.get(dest) {
+        if prev_source != source {
+            return Err(RunError::msg(format!(
+                "fixture conflict: evals '{prev_eval}' and '{eval_id}' both place a fixture at env path '{dest}' from different sources ('{prev_source}' vs '{source}'). Give them distinct paths."
+            )));
+        }
+        return Ok(true);
+    }
+    claims.insert(dest.to_string(), (eval_id.to_string(), source.to_string()));
+    Ok(false)
+}
+
+/// Reject a fixture path that is absolute or contains any `..` component (even one that
+/// would resolve back inside `env/`), so a fixture always lands inside the isolated env.
+fn validate_fixture_rel(f: &str) -> Result<(), RunError> {
+    let p = Path::new(f);
+    let escapes = p.is_absolute()
+        || p.components()
+            .any(|c| matches!(c, std::path::Component::ParentDir));
+    if escapes {
+        return Err(RunError::msg(format!(
+            "fixture path must be relative and stay within env: {f}"
+        )));
+    }
+    Ok(())
+}
+
+/// Resolve an eval's declared fixtures to `(env-relative dest, source path)` pairs,
+/// validating each path stays within the env and that the source exists — without
+/// copying anything. [`super::grouping`] consumes these pairs to detect cross-eval
+/// clobbers before any env is built, and [`copy_fixtures`] reuses them, so fixture
+/// path resolution lives in exactly one place.
+pub fn fixture_pairs(ev: &Eval, skill_dir: &Path) -> Result<Vec<(String, String)>, RunError> {
+    let Some(files) = ev.files.as_ref().filter(|f| !f.is_empty()) else {
+        return Ok(Vec::new());
+    };
+    let mut pairs = Vec::with_capacity(files.len());
+    for f in files {
+        validate_fixture_rel(f)?;
+        let src = skill_dir.join("evals").join(f);
+        if !src.exists() {
+            return Err(RunError::msg(format!(
+                "fixture not found: {}",
+                src.display()
+            )));
+        }
+        pairs.push((f.clone(), src.to_string_lossy().into_owned()));
+    }
+    Ok(pairs)
+}
+
+/// Copy an eval's fixture files into `env_root`, preserving each declared relative path
+/// so the env reads like a real repo (`files: ["src/main.rs"]` → `env/src/main.rs`), and
+/// returning the env-relative paths (the agent-under-test's cwd is `env/`). Fixtures are
+/// shared across conditions and runs within one env; `claims` dedups idempotent
+/// re-declarations and rejects cross-eval clobbers. Cross-eval clobbers are routed into
+/// separate isolation groups by [`super::grouping`] before this is called per group, so
+/// within a single group's env a clobber should never reach the `claims` rejection.
+pub fn copy_fixtures(
+    ev: &Eval,
+    skill_dir: &Path,
+    env_root: &Path,
+    claims: &mut FixtureClaims,
+) -> Result<Vec<String>, RunError> {
+    let pairs = fixture_pairs(ev, skill_dir)?;
+    let mut copied = Vec::with_capacity(pairs.len());
+    for (dest, source) in &pairs {
+        let already = claim_fixture_dest(claims, &ev.id, dest, source)?;
+        if !already {
+            let dst = env_root.join(dest);
+            if let Some(parent) = dst.parent() {
+                fs::create_dir_all(parent)?;
+            }
+            copy_entry(Path::new(source), &dst)?;
+        }
+        copied.push(dest.clone());
+    }
+    Ok(copied)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn eval_with_files(id: &str, files: &[&str]) -> Eval {
+        Eval {
+            id: id.to_string(),
+            prompt: "p".to_string(),
+            expected_output: "o".to_string(),
+            files: Some(files.iter().map(|f| (*f).to_string()).collect()),
+            assertions: None,
+            skill_should_trigger: None,
+            runs: None,
+            isolation: None,
+        }
+    }
+
+    #[test]
+    fn fixture_pairs_resolves_dest_and_source_without_copying() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let skill_dir = tmp.path().join("skill");
+        let evals = skill_dir.join("evals");
+        fs::create_dir_all(evals.join("data")).unwrap();
+        fs::write(evals.join("config.json"), "cfg").unwrap();
+        fs::write(evals.join("data/x.json"), "xx").unwrap();
+
+        let ev = eval_with_files("e1", &["config.json", "data/x.json"]);
+        let pairs = fixture_pairs(&ev, &skill_dir).unwrap();
+
+        assert_eq!(
+            pairs,
+            vec![
+                (
+                    "config.json".to_string(),
+                    evals.join("config.json").to_string_lossy().into_owned()
+                ),
+                (
+                    "data/x.json".to_string(),
+                    evals.join("data/x.json").to_string_lossy().into_owned()
+                ),
+            ]
+        );
+        // Pure: it resolves paths but copies nothing.
+        assert!(!tmp.path().join("env").exists());
+    }
+
+    #[test]
+    fn fixture_pairs_empty_when_no_files() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let skill_dir = tmp.path().join("skill");
+        fs::create_dir_all(skill_dir.join("evals")).unwrap();
+        let ev = eval_with_files("e1", &[]);
+        assert!(fixture_pairs(&ev, &skill_dir).unwrap().is_empty());
+    }
+
+    #[test]
+    fn fixture_pairs_rejects_escapes_and_missing_sources() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let skill_dir = tmp.path().join("skill");
+        fs::create_dir_all(skill_dir.join("evals")).unwrap();
+
+        let escaping = eval_with_files("e1", &["../escape.txt"]);
+        assert!(
+            fixture_pairs(&escaping, &skill_dir)
+                .unwrap_err()
+                .to_string()
+                .contains("relative")
+        );
+
+        let missing = eval_with_files("e1", &["nope.json"]);
+        assert!(
+            fixture_pairs(&missing, &skill_dir)
+                .unwrap_err()
+                .to_string()
+                .contains("fixture not found")
+        );
+    }
+
+    #[test]
+    fn copy_fixtures_preserves_declared_relative_paths_in_env() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let skill_dir = tmp.path().join("skill");
+        let evals = skill_dir.join("evals");
+        fs::create_dir_all(evals.join("data")).unwrap();
+        fs::write(evals.join("config.json"), "cfg").unwrap();
+        fs::write(evals.join("data/x.json"), "xx").unwrap();
+        let env_root = tmp.path().join("env");
+
+        let ev = eval_with_files("e1", &["config.json", "data/x.json"]);
+        let mut claims = FixtureClaims::new();
+        let copied = copy_fixtures(&ev, &skill_dir, &env_root, &mut claims).unwrap();
+
+        // Structure preserved under env/, not flattened into an inputs/ bucket.
+        assert_eq!(
+            fs::read_to_string(env_root.join("config.json")).unwrap(),
+            "cfg"
+        );
+        assert_eq!(
+            fs::read_to_string(env_root.join("data/x.json")).unwrap(),
+            "xx"
+        );
+        assert!(!env_root.join("inputs").exists());
+        // Returns env-relative declared paths (the agent's cwd is env).
+        assert_eq!(
+            copied,
+            vec!["config.json".to_string(), "data/x.json".to_string()]
+        );
+    }
+
+    #[test]
+    fn copy_fixtures_rejects_parent_escaping_and_absolute_paths() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let skill_dir = tmp.path().join("skill");
+        fs::create_dir_all(skill_dir.join("evals")).unwrap();
+        let env_root = tmp.path().join("env");
+
+        for bad in ["../escape.txt", "/etc/passwd", "a/../../b.txt"] {
+            let ev = eval_with_files("e1", &[bad]);
+            let mut claims = FixtureClaims::new();
+            let err = copy_fixtures(&ev, &skill_dir, &env_root, &mut claims).unwrap_err();
+            assert!(
+                err.to_string().contains("relative"),
+                "expected a path-traversal rejection for {bad}, got: {err}"
+            );
+        }
+    }
+
+    #[test]
+    fn claim_fixture_dest_allows_idempotent_share_errors_on_different_source() {
+        let mut claims = FixtureClaims::new();
+        // First eval claims the dest.
+        assert!(
+            !claim_fixture_dest(&mut claims, "e1", "config.json", "/a/evals/config.json").unwrap()
+        );
+        // A second eval declaring the same dest from the same source is an idempotent share.
+        assert!(
+            claim_fixture_dest(&mut claims, "e2", "config.json", "/a/evals/config.json").unwrap()
+        );
+        // The same dest from a *different* source is an ambiguous cross-eval conflict.
+        let err = claim_fixture_dest(&mut claims, "e3", "config.json", "/b/evals/config.json")
+            .unwrap_err();
+        let msg = err.to_string();
+        assert!(msg.contains("e1"), "names the first claimer: {msg}");
+        assert!(msg.contains("e3"), "names the conflicting eval: {msg}");
+        assert!(msg.contains("config.json"), "names the path: {msg}");
+    }
+}
diff --git a/src/cli/run/grouping.rs b/src/cli/run/grouping.rs
new file mode 100644
index 0000000..105778a
--- /dev/null
+++ b/src/cli/run/grouping.rs
@@ -0,0 +1,265 @@
+//! Setup-time isolation grouping: decide which evals can share one environment
+//! and which must be isolated, *before* a run dispatches anything.
+//!
+//! One env historically hosted every eval's fixtures, so two evals placing
+//! different content at the same path were a hard error. Grouping turns that into
+//! a decision: evals whose fixtures conflict (same env-relative dest from a
+//! *different* source) are routed into separate groups, and an eval may opt into
+//! its own singleton group via [`Isolation::Isolated`]. The realization differs by
+//! dispatch mechanism (one env + reset barrier for in-session; one env per
+//! `(group, condition)` for CLI), but the grouping decision here is shared.
+//!
+//! The conflict rule is identical to the per-env fixture-claim rule in
+//! [`super::fixtures`]: same dest + same source is an idempotent share (evals may
+//! co-group); same dest + different source is a clobber (they must not).
+
+use std::collections::HashMap;
+
+use crate::core::Isolation;
+
+/// One eval's inputs to grouping.
+pub struct GroupInput<'a> {
+    pub eval_id: &'a str,
+    pub isolation: Option<Isolation>,
+    /// `(env-relative dest, source)` fixture pairs this eval declares.
+    pub fixtures: &'a [(String, String)],
+}
+
+/// A computed isolation group: the evals that share one environment, plus a
+/// human-readable reason the group exists (surfaced in `dispatch.json`).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Group {
+    pub id: String,
+    pub eval_ids: Vec<String>,
+    pub rationale: String,
+}
+
+/// Group `evals` (in config order) by fixture compatibility and explicit hints.
+///
+/// Deterministic greedy first-fit: each eval joins the first existing group it
+/// does not conflict with; an [`Isolation::Isolated`] eval always gets a fresh,
+/// sealed singleton; otherwise a new group is started. Group ids are `g1, g2, …`
+/// in creation order. With no conflicts and no `isolated` hints this returns a
+/// single `g1` containing every eval — the common case.
+pub fn compute_groups(evals: &[GroupInput]) -> Vec<Group> {
+    /// A group under construction: its accumulated `dest -> (source, eval_id)`
+    /// claims plus whether it is sealed against new members (isolated singletons).
+    struct Building {
+        id: String,
+        eval_ids: Vec<String>,
+        claims: HashMap<String, (String, String)>,
+        sealed: bool,
+        rationale: String,
+    }
+
+    fn claims_of(ev: &GroupInput) -> HashMap<String, (String, String)> {
+        ev.fixtures
+            .iter()
+            .map(|(dest, source)| (dest.clone(), (source.clone(), ev.eval_id.to_string())))
+            .collect()
+    }
+
+    let mut groups: Vec<Building> = Vec::new();
+
+    for ev in evals {
+        // An `isolated` eval always gets a fresh, sealed singleton — nothing else
+        // may join it, and it joins nothing else.
+        if ev.isolation == Some(Isolation::Isolated) {
+            let id = format!("g{}", groups.len() + 1);
+            groups.push(Building {
+                id,
+                eval_ids: vec![ev.eval_id.to_string()],
+                claims: claims_of(ev),
+                sealed: true,
+                rationale: "isolation: isolated".to_string(),
+            });
+            continue;
+        }
+
+        // Greedy first-fit over the non-sealed groups, in creation order.
+        let mut joined = false;
+        let mut conflict_note: Option<String> = None;
+        for g in groups.iter_mut().filter(|g| !g.sealed) {
+            // The eval conflicts with this group iff it claims a dest the group
+            // already holds from a *different* source (an order-dependent clobber).
+            // Same dest + same source is an idempotent share — not a conflict.
+            let mut conflict: Option<(String, String)> = None;
+            for (dest, source) in ev.fixtures {
+                if let Some((prev_source, prev_eval)) = g.claims.get(dest)
+                    && prev_source != source
+                {
+                    conflict = Some((dest.clone(), prev_eval.clone()));
+                    break;
+                }
+            }
+            match conflict {
+                None => {
+                    for (dest, source) in ev.fixtures {
+                        g.claims
+                            .entry(dest.clone())
+                            .or_insert_with(|| (source.clone(), ev.eval_id.to_string()));
+                    }
+                    g.eval_ids.push(ev.eval_id.to_string());
+                    joined = true;
+                    break;
+                }
+                Some((dest, other_eval)) => {
+                    // Remember the first conflict as the rationale for a new group (used
+                    // only if no group accepts this eval); keep scanning — a later one may.
+                    conflict_note.get_or_insert_with(|| {
+                        format!("fixture-conflict: {} vs {other_eval} at {dest}", ev.eval_id)
+                    });
+                }
+            }
+        }
+
+        if !joined {
+            let id = format!("g{}", groups.len() + 1);
+            groups.push(Building {
+                id,
+                eval_ids: vec![ev.eval_id.to_string()],
+                claims: claims_of(ev),
+                sealed: false,
+                rationale: conflict_note.unwrap_or_else(|| "default".to_string()),
+            });
+        }
+    }
+
+    groups
+        .into_iter()
+        .map(|b| Group {
+            id: b.id,
+            eval_ids: b.eval_ids,
+            rationale: b.rationale,
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn input<'a>(
+        id: &'a str,
+        isolation: Option<Isolation>,
+        fixtures: &'a [(String, String)],
+    ) -> GroupInput<'a> {
+        GroupInput {
+            eval_id: id,
+            isolation,
+            fixtures,
+        }
+    }
+
+    fn pair(dest: &str, source: &str) -> (String, String) {
+        (dest.to_string(), source.to_string())
+    }
+
+    #[test]
+    fn single_group_when_no_conflicts_or_hints() {
+        let f1 = [pair("a.txt", "/s/a.txt")];
+        let f2 = [pair("b.txt", "/s/b.txt")];
+        let evals = [
+            input("e1", None, &f1),
+            input("e2", None, &f2),
+            input("e3", None, &[]),
+        ];
+        let groups = compute_groups(&evals);
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0].id, "g1");
+        assert_eq!(groups[0].eval_ids, vec!["e1", "e2", "e3"]);
+        assert_eq!(groups[0].rationale, "default");
+    }
+
+    #[test]
+    fn conflicting_fixtures_split_into_two_groups() {
+        let f1 = [pair("config.json", "/a/config.json")];
+        let f2 = [pair("config.json", "/b/config.json")];
+        let evals = [input("e1", None, &f1), input("e2", None, &f2)];
+        let groups = compute_groups(&evals);
+        assert_eq!(groups.len(), 2);
+        assert_eq!(groups[0].eval_ids, vec!["e1"]);
+        assert_eq!(groups[1].id, "g2");
+        assert_eq!(groups[1].eval_ids, vec!["e2"]);
+        assert!(
+            groups[1].rationale.contains("fixture-conflict"),
+            "rationale: {}",
+            groups[1].rationale
+        );
+        assert!(
+            groups[1].rationale.contains("config.json"),
+            "rationale: {}",
+            groups[1].rationale
+        );
+    }
+
+    #[test]
+    fn idempotent_same_source_share_stays_one_group() {
+        let f = [pair("config.json", "/a/config.json")];
+        let evals = [input("e1", None, &f), input("e2", None, &f)];
+        let groups = compute_groups(&evals);
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0].eval_ids, vec!["e1", "e2"]);
+    }
+
+    #[test]
+    fn isolated_hint_forces_singleton_and_seals_it() {
+        let evals = [
+            input("e1", Some(Isolation::Isolated), &[]),
+            input("e2", None, &[]),
+            input("e3", None, &[]),
+        ];
+        let groups = compute_groups(&evals);
+        assert_eq!(groups.len(), 2);
+        assert_eq!(groups[0].eval_ids, vec!["e1"]);
+        assert_eq!(groups[0].rationale, "isolation: isolated");
+        // The shared evals never join the sealed singleton.
+        assert_eq!(groups[1].eval_ids, vec!["e2", "e3"]);
+        assert_eq!(groups[1].rationale, "default");
+    }
+
+    #[test]
+    fn isolated_eval_with_fixtures_is_still_a_singleton() {
+        let f = [pair("x.txt", "/s/x.txt")];
+        let evals = [input("e1", Some(Isolation::Isolated), &f)];
+        let groups = compute_groups(&evals);
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0].eval_ids, vec!["e1"]);
+        assert_eq!(groups[0].rationale, "isolation: isolated");
+    }
+
+    #[test]
+    fn ids_are_deterministic_in_creation_order() {
+        let f1 = [pair("c.json", "/a/c.json")];
+        let f2 = [pair("c.json", "/b/c.json")];
+        let f3 = [pair("c.json", "/d/c.json")];
+        let evals = [
+            input("e1", None, &f1),
+            input("e2", None, &f2),
+            input("e3", None, &f3),
+        ];
+        let groups = compute_groups(&evals);
+        assert_eq!(
+            groups.iter().map(|g| g.id.as_str()).collect::<Vec<_>>(),
+            vec!["g1", "g2", "g3"]
+        );
+    }
+
+    #[test]
+    fn eval_joins_first_non_conflicting_group() {
+        // e1 -> g1 (claims `a` from /s1). e2 conflicts on `a` -> g2. e3 shares `a`
+        // from /s1 (same source as g1) -> rejoins g1, not g2.
+        let f1 = [pair("a", "/s1/a")];
+        let f2 = [pair("a", "/s2/a")];
+        let f3 = [pair("a", "/s1/a")];
+        let evals = [
+            input("e1", None, &f1),
+            input("e2", None, &f2),
+            input("e3", None, &f3),
+        ];
+        let groups = compute_groups(&evals);
+        assert_eq!(groups.len(), 2);
+        assert_eq!(groups[0].eval_ids, vec!["e1", "e3"]);
+        assert_eq!(groups[1].eval_ids, vec!["e2"]);
+    }
+}
diff --git a/src/cli/run/mod.rs b/src/cli/run/mod.rs
index 5d71158..0121503 100644
--- a/src/cli/run/mod.rs
+++ b/src/cli/run/mod.rs
@@ -16,7 +16,10 @@ use std::path::Path;
 use serde::Serialize;
 
 pub mod dispatch;
+pub mod fixtures;
+pub mod grouping;
 pub mod orchestrate;
+pub mod runbook;
 pub mod staging;
 pub mod steps;
 mod util;
diff --git a/src/cli/run/orchestrate/build.rs b/src/cli/run/orchestrate/build.rs
index 24704b7..7eefc59 100644
--- a/src/cli/run/orchestrate/build.rs
+++ b/src/cli/run/orchestrate/build.rs
@@ -3,25 +3,31 @@
 //! ([`write_dispatch`]), then arm the opt-in write guard and run the plugin-shadow
 //! preflight ([`post_build`]).
 
+use std::collections::HashMap;
 use std::fs;
 use std::path::Path;
 
-use serde_json::json;
+use serde_json::{Value, json};
 
 use crate::adapters::{
     adapter_for, config_dir_from_env, detect_plugin_shadows, format_shadow_banner,
 };
-use crate::core::{AvailableSkill, ConditionEntry, ConditionsRecord, Harness, RunContext};
+use crate::core::{
+    AvailableSkill, ConditionEntry, ConditionsRecord, DispatchMechanism, Harness, RunContext,
+};
 use crate::pipeline::io::now_iso8601;
 
 use super::super::dispatch::{
-    DispatchTaskOpts, ManifestContext, build_dispatch_task, build_manifest, copy_fixtures,
-    get_skill_description,
+    DispatchTaskOpts, ManifestContext, build_dispatch_task, build_manifest, get_skill_description,
 };
+use super::super::fixtures::fixture_pairs;
+use super::super::runbook::{RunbookContext, build_runbook};
 use super::super::staging::skills_dir_for_harness;
-use super::super::util::{staging_plugin_shadow_action, unguarded_notice};
+use super::super::util::unguarded_notice;
 use super::super::{RunError, write_json};
+use super::envs::{EnvLayoutInput, env_targets, task_env_root};
 use super::{Resolved, RunOptions, Staged};
+use crate::cli::command_target_args;
 
 /// Build every `(eval, condition)` dispatch task and write `conditions.json`,
 /// `dispatch-manifest.md`, the per-task prompt files, and `dispatch.json`.
@@ -49,6 +55,7 @@ pub(super) fn write_dispatch(
         ],
         timestamp: now_iso8601(),
         harness: Some(ctx.harness),
+        run_mode: Some(ctx.run_mode),
         run_nonce: Some(r.run_nonce.clone()),
         runs: Some(opts.runs),
         agent_model: opts.agent_model.map(str::to_owned),
@@ -57,9 +64,9 @@ pub(super) fn write_dispatch(
     };
     write_json(&r.iteration_dir.join("conditions.json"), &conditions)?;
 
-    let staged_skill_path_for = |cond_slug: Option<&str>| -> Option<String> {
+    let staged_skill_path_for = |env_root: &Path, cond_slug: Option<&str>| -> Option<String> {
         cond_slug.map(|slug| {
-            skills_dir_for_harness(&ctx.stage_root, ctx.harness)
+            skills_dir_for_harness(env_root, ctx.harness)
                 .join(slug)
                 .join("SKILL.md")
                 .to_string_lossy()
@@ -67,85 +74,154 @@ pub(super) fn write_dispatch(
         })
     };
 
-    // availableSkills for a condition = siblings + the skill-under-test when that
-    // condition loads it. Empty when nothing was staged.
-    let available_skills_for =
-        |cond_skill_path: Option<&str>, cond_slug: Option<&str>| -> Vec<AvailableSkill> {
-            if opts.no_stage {
-                return Vec::new();
-            }
-            let mut skills = staged.sibling_skills.clone();
-            if let Some(csp) = cond_skill_path {
-                let name = match cond_slug {
-                    Some(slug) if adapter_for(ctx.harness).advertises_staged_slug_name() => {
-                        slug.to_string()
-                    }
-                    _ => ctx.skill_name.clone(),
-                };
-                skills.push(AvailableSkill {
-                    name,
-                    path: staged_skill_path_for(cond_slug).unwrap_or_else(|| csp.to_string()),
-                    description: get_skill_description(Path::new(csp)),
-                });
-            }
-            skills
-        };
+    // availableSkills for a condition in a given env = siblings + the
+    // skill-under-test when that condition loads it. Paths are env-specific (Cli
+    // stages a separate env per (group, condition)). Empty when nothing was staged.
+    let available_skills_for = |env_root: &Path,
+                                cond_skill_path: Option<&str>,
+                                cond_slug: Option<&str>|
+     -> Vec<AvailableSkill> {
+        if opts.no_stage {
+            return Vec::new();
+        }
+        let mut skills: Vec<AvailableSkill> = staged
+            .sibling_meta
+            .iter()
+            .map(|(name, description)| AvailableSkill {
+                name: name.clone(),
+                path: skills_dir_for_harness(env_root, ctx.harness)
+                    .join(name)
+                    .join("SKILL.md")
+                    .to_string_lossy()
+                    .into_owned(),
+                description: description.clone(),
+            })
+            .collect();
+        if let Some(csp) = cond_skill_path {
+            let name = match cond_slug {
+                Some(slug) if adapter_for(ctx.harness).advertises_staged_slug_name() => {
+                    slug.to_string()
+                }
+                _ => ctx.skill_name.clone(),
+            };
+            skills.push(AvailableSkill {
+                name,
+                path: staged_skill_path_for(env_root, cond_slug).unwrap_or_else(|| csp.to_string()),
+                description: get_skill_description(Path::new(csp)),
+            });
+        }
+        skills
+    };
 
-    let mut tasks = Vec::new();
+    // Each eval's env-relative fixture dests (for the task's `fixtures` field and
+    // the prompt's fixtures block). The copies themselves are made per env by
+    // `stage_conditions`; resolution here is read-only (and re-validated in resolve).
+    let mut fixtures_by_eval: HashMap<&str, Vec<String>> = HashMap::new();
     for ev in &r.selected_evals {
-        let eval_dir = r.iteration_dir.join(format!("eval-{}", ev.id));
-        fs::create_dir_all(&eval_dir)?;
+        let dests = fixture_pairs(ev, &ctx.skill_subdir)?
+            .into_iter()
+            .map(|(dest, _source)| dest)
+            .collect();
+        fixtures_by_eval.insert(ev.id.as_str(), dests);
+    }
+
+    let mechanism = ctx.run_mode.mechanism();
+    // Tasks carry `group` only when there is >1 group, and `eval_root` only on Cli
+    // runs (per-(group, condition) envs); otherwise the pre-grouping shape is kept.
+    let multi_group = r.groups.len() > 1;
+
+    let mut tasks = Vec::new();
+    // Build tasks CONDITION-outer, GROUP-inner — so the in-session runbook reads
+    // tasks[] top to bottom as: dispatch each (condition, group) segment, with a
+    // `reset-batch` between groups and one `switch-condition` between conditions.
+    // A single group collapses this to a plain condition-outer, eval-inner order.
+    for (cond_name, cond_skill_path, cond_slug) in [
+        (
+            r.cond_a,
+            r.skill_path_a.as_deref(),
+            staged.cond_a_slug.as_deref(),
+        ),
+        (
+            r.cond_b,
+            r.skill_path_b.as_deref(),
+            staged.cond_b_slug.as_deref(),
+        ),
+    ] {
+        for group in &r.groups {
+            let env_root = task_env_root(&r.iteration_dir, mechanism, &group.id, cond_name);
+            let env_root_str = env_root.to_string_lossy().into_owned();
+            let staged_path = staged_skill_path_for(&env_root, cond_slug);
+            let available_skills = available_skills_for(&env_root, cond_skill_path, cond_slug);
 
-        for (cond_name, cond_skill_path, cond_slug) in [
-            (
-                r.cond_a,
-                r.skill_path_a.as_deref(),
-                staged.cond_a_slug.as_deref(),
-            ),
-            (
-                r.cond_b,
-                r.skill_path_b.as_deref(),
-                staged.cond_b_slug.as_deref(),
-            ),
-        ] {
-            let cond_dir = eval_dir.join(cond_name);
-            let runs = ev.runs.unwrap_or(opts.runs);
-            let staged_path = staged_skill_path_for(cond_slug);
+            for eval_id in &group.eval_ids {
+                let ev = r
+                    .selected_evals
+                    .iter()
+                    .find(|e| &e.id == eval_id)
+                    .expect("group eval ids are drawn from selected_evals");
+                let cond_dir = r
+                    .iteration_dir
+                    .join(format!("eval-{}", ev.id))
+                    .join(cond_name);
+                let runs = ev.runs.unwrap_or(opts.runs);
 
-            for run_idx in 1..=runs {
-                // A single-run cell keeps the flat legacy layout; multi-run
-                // cells nest each run under run-<k>/.
-                let (run_dir, run_index) = if runs == 1 {
-                    (cond_dir.clone(), None)
-                } else {
-                    (cond_dir.join(format!("run-{run_idx}")), Some(run_idx))
-                };
-                let outputs_dir = run_dir.join("outputs");
-                fs::create_dir_all(&outputs_dir)?;
+                for run_idx in 1..=runs {
+                    // A single-run cell keeps the flat legacy layout; multi-run
+                    // cells nest each run under run-<k>/.
+                    let (run_dir, run_index) = if runs == 1 {
+                        (cond_dir.clone(), None)
+                    } else {
+                        (cond_dir.join(format!("run-{run_idx}")), Some(run_idx))
+                    };
+                    // Create the per-run meta dir (run.json / timing.json /
+                    // dispatch-prompt.txt), which lives above the env.
+                    fs::create_dir_all(&run_dir)?;
+                    // The agent-under-test's cwd is its env, so its outputs land
+                    // *inside* it — never above its sandbox.
+                    // A hidden, per-(eval, condition, run) subtree keeps concurrent
+                    // same-env subagents from colliding.
+                    let outputs_rel = match run_index {
+                        None => format!("eval-{}/{cond_name}", ev.id),
+                        Some(k) => format!("eval-{}/{cond_name}/run-{k}", ev.id),
+                    };
+                    let outputs_dir = env_root.join(".eval-magic-outputs").join(outputs_rel);
+                    fs::create_dir_all(&outputs_dir)?;
 
-                let fixtures = copy_fixtures(ev, &ctx.skill_subdir, &run_dir)?;
-                let available_skills = available_skills_for(cond_skill_path, cond_slug);
-                let outputs_dir_str = outputs_dir.to_string_lossy().into_owned();
-                let run_dir_str = run_dir.to_string_lossy().into_owned();
+                    let fixtures = fixtures_by_eval
+                        .get(ev.id.as_str())
+                        .cloned()
+                        .unwrap_or_default();
+                    let outputs_dir_str = outputs_dir.to_string_lossy().into_owned();
+                    let run_dir_str = run_dir.to_string_lossy().into_owned();
 
-                tasks.push(build_dispatch_task(&DispatchTaskOpts {
-                    eval_id: &ev.id,
-                    condition: cond_name,
-                    skill_path: cond_skill_path,
-                    staged_skill_slug: cond_slug,
-                    staged_skill_path: staged_path.as_deref(),
-                    user_prompt: &ev.prompt,
-                    fixtures,
-                    outputs_dir: &outputs_dir_str,
-                    cond_dir: &run_dir_str,
-                    bootstrap_content: staged.bootstrap_content.as_deref(),
-                    plan_mode_content: staged.plan_mode_content.as_deref(),
-                    skill_name: &ctx.skill_name,
-                    available_skills,
-                    harness: ctx.harness,
-                    run_tag: Some(&r.run_tag),
-                    run_index,
-                })?);
+                    tasks.push(build_dispatch_task(&DispatchTaskOpts {
+                        eval_id: &ev.id,
+                        condition: cond_name,
+                        skill_path: cond_skill_path,
+                        staged_skill_slug: cond_slug,
+                        staged_skill_path: staged_path.as_deref(),
+                        user_prompt: &ev.prompt,
+                        fixtures,
+                        outputs_dir: &outputs_dir_str,
+                        cond_dir: &run_dir_str,
+                        bootstrap_content: staged.bootstrap_content.as_deref(),
+                        plan_mode_content: staged.plan_mode_content.as_deref(),
+                        skill_name: &ctx.skill_name,
+                        available_skills: available_skills.clone(),
+                        harness: ctx.harness,
+                        run_tag: Some(&r.run_tag),
+                        run_index,
+                        // Tag the group only when there's more than one (keeps the
+                        // single-group task byte-identical). `eval_root` is the
+                        // per-task cwd the Cli recipe `cd`s into; the in-session
+                        // path shares one env, so it stays `None`.
+                        group: multi_group.then_some(group.id.as_str()),
+                        eval_root: match mechanism {
+                            DispatchMechanism::Cli => Some(env_root_str.as_str()),
+                            DispatchMechanism::InSession => None,
+                        },
+                    })?);
+                }
             }
         }
     }
@@ -162,6 +238,7 @@ pub(super) fn write_dispatch(
             &tasks,
             ManifestContext {
                 harness: ctx.harness,
+                mechanism: ctx.run_mode.mechanism(),
                 guard: opts.guard,
                 agent_model: opts.agent_model,
             },
@@ -174,7 +251,7 @@ pub(super) fn write_dispatch(
     }
 
     let dispatch_json_path = r.iteration_dir.join("dispatch.json");
-    let dispatch_json = json!({
+    let mut dispatch_json = json!({
         "skill_name": ctx.skill_name,
         "iteration": r.iteration,
         "run_nonce": r.run_nonce,
@@ -188,10 +265,74 @@ pub(super) fn write_dispatch(
         "label": conditions.label,
         "conditions": conditions.conditions,
         "harness": ctx.harness,
+        "run_mode": ctx.run_mode,
         "tasks": tasks,
     });
+    // The isolation-batch plan the executing session/human follows: which evals
+    // share an env, why, and (per condition) the env each batch runs in. Omitted in
+    // the trivial single-group in-session case so its dispatch.json gains no
+    // `groups` key; emitted whenever the layout is non-trivial (>1 group, or any
+    // Cli run with per-(group, condition) envs).
+    if multi_group || mechanism == DispatchMechanism::Cli {
+        let groups: Vec<Value> = r
+            .groups
+            .iter()
+            .map(|g| {
+                let envs: Vec<Value> = [r.cond_a, r.cond_b]
+                    .iter()
+                    .map(|cond| {
+                        json!({
+                            "condition": cond,
+                            "dir": task_env_root(&r.iteration_dir, mechanism, &g.id, cond)
+                                .to_string_lossy(),
+                        })
+                    })
+                    .collect();
+                json!({
+                    "id": g.id,
+                    "evals": g.eval_ids,
+                    "rationale": g.rationale,
+                    "envs": envs,
+                })
+            })
+            .collect();
+        dispatch_json
+            .as_object_mut()
+            .expect("dispatch_json is a JSON object")
+            .insert("groups".to_string(), Value::Array(groups));
+    }
     write_json(&dispatch_json_path, &dispatch_json)?;
 
+    // The followable handoff artifact: a fresh isolated session (interactive) or
+    // a human (headless) reads RUNBOOK.md to run the loop. It references eval-magic
+    // meta (dispatch.json, benchmark.json) under `iteration_dir`, so `RunbookContext`
+    // keeps `iteration_dir`, not the env. Generated, not version controlled.
+    let target_args = command_target_args(ctx);
+    let group_ids: Vec<String> = r.groups.iter().map(|g| g.id.clone()).collect();
+    let runbook = build_runbook(&RunbookContext {
+        harness: ctx.harness,
+        run_mode: ctx.run_mode,
+        skill_name: &ctx.skill_name,
+        iteration: r.iteration,
+        iteration_dir: &r.iteration_dir,
+        mode: r.mode,
+        cond_a: r.cond_a,
+        cond_b: r.cond_b,
+        num_tasks: tasks.len(),
+        groups: &group_ids,
+        target_args: &target_args,
+        guard: opts.guard,
+        agent_model: opts.agent_model,
+    });
+    // In-session: written into the single `env/` (the isolated session's cwd, =
+    // `ctx.stage_root`). Cli: there is no single env (one per (group, condition)),
+    // and the human drives from the iteration dir, so it lands there.
+    let runbook_path = match mechanism {
+        DispatchMechanism::InSession => ctx.stage_root.join("RUNBOOK.md"),
+        DispatchMechanism::Cli => r.iteration_dir.join("RUNBOOK.md"),
+    };
+    fs::write(runbook_path, runbook)?;
+
     Ok(tasks.len())
 }
 
@@ -201,30 +342,34 @@ pub(super) fn post_build(
     ctx: &RunContext,
     opts: &RunOptions,
     r: &Resolved,
-    staged: &Staged,
 ) -> Result<(), RunError> {
+    // Every env this run staged: one shared `env/` for in-session, one per
+    // (group, condition) for Cli. Computed once and reused below to arm the guard in
+    // each env and to point the plugin-shadow preflight at a real staged env.
+    let targets = env_targets(&EnvLayoutInput {
+        iteration_dir: &r.iteration_dir,
+        mechanism: ctx.run_mode.mechanism(),
+        groups: &r.groups,
+        cond_a: r.cond_a,
+        cond_b: r.cond_b,
+        skill_path_a: r.skill_path_a.as_deref(),
+        skill_path_b: r.skill_path_b.as_deref(),
+    });
+
     // Opt-in hard guard: a PreToolUse hook blocking subagent writes/installs
-    // outside the eval sandbox while dispatches run.
+    // outside the eval sandbox while dispatches run. Armed in *every* env the run
+    // staged, because each subprocess loads its hook from its own cwd.
     if opts.guard && !opts.dry_run {
         if opts.no_stage {
             eprintln!("\n⚠ --guard requires staging enabled; skipping guard install.");
         } else {
-            adapter_for(ctx.harness).install_guard(
-                &ctx.stage_root,
-                &ctx.workspace_root,
-                &std::env::current_exe()?,
-                None,
-            )?;
-            match ctx.harness {
-                Harness::ClaudeCode => println!(
-                    "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n   and will block writes/installs outside the eval sandbox during dispatches.\n   It auto-expires in 6h and is removed on the next run; to remove it now:\n     eval-magic teardown-guard"
-                ),
-                Harness::Codex => println!(
-                    "\n🛡 Write guard armed: a PreToolUse hook is staged in .codex/hooks.json\n   and will block writes/installs outside the eval sandbox during Codex dispatches.\n   Dispatch with codex exec --dangerously-bypass-hook-trust so the vetted eval hook runs.\n   It auto-expires in 6h and is removed on the next run; to remove it now:\n     eval-magic teardown-guard"
-                ),
-                Harness::OpenCode => unreachable!(
-                    "install_guard_for_harness rejects OpenCode before this message prints"
-                ),
+            let adapter = adapter_for(ctx.harness);
+            let exe = std::env::current_exe()?;
+            for target in &targets {
+                adapter.install_guard(&target.root, &exe, None)?;
+            }
+            if let Some(msg) = adapter.guard_armed_message() {
+                println!("{msg}");
             }
         }
     }
@@ -238,27 +383,23 @@ pub(super) fn post_build(
     }
 
     // Plugin-shadow preflight (Claude Code): a staged skill name also discoverable
-    // from an enabled plugin or the global skills dir contaminates the run.
+    // from an enabled plugin or the global skills dir contaminates the run. Scan the
+    // first staged env, not `ctx.stage_root` — under Cli the legacy single `env/` is
+    // never created, so the project-local `.claude/settings.json` enabledPlugins the
+    // scan reads must come from a real staged env. In-session's first target *is*
+    // `env/` (== `ctx.stage_root`), so this is unchanged there.
     if ctx.harness == Harness::ClaudeCode {
         let mut names: Vec<&str> = vec![ctx.skill_name.as_str()];
         names.extend(ctx.sibling_skill_names.iter().map(String::as_str));
-        let report = detect_plugin_shadows(&config_dir_from_env(), &ctx.stage_root, &names);
+        let scan_root = targets
+            .first()
+            .map(|t| t.root.as_path())
+            .unwrap_or(ctx.stage_root.as_path());
+        let report = detect_plugin_shadows(&config_dir_from_env(), scan_root, &names);
         if !report.shadowed.is_empty() {
             write_json(&r.iteration_dir.join("plugin-shadow.json"), &report)?;
             eprintln!("{}", format_shadow_banner(&report));
         }
-        // When the staging-discovery miss and a plugin shadow both bite, the
-        // individual warnings don't add up to an obvious action — summarize it. The discovery
-        // miss only applies when `run` created .claude/skills/ fresh (an existing dir is watched,
-        // so the staged skill is discoverable), so gate on `!skills_dir_preexisted`.
-        if let Some(action) = staging_plugin_shadow_action(
-            ctx.harness,
-            opts.no_stage,
-            !report.shadowed.is_empty(),
-            staged.skills_dir_preexisted,
-        ) {
-            eprintln!("{action}");
-        }
     }
     Ok(())
 }
diff --git a/src/cli/run/orchestrate/envs.rs b/src/cli/run/orchestrate/envs.rs
new file mode 100644
index 0000000..c6f469f
--- /dev/null
+++ b/src/cli/run/orchestrate/envs.rs
@@ -0,0 +1,195 @@
+//! Env-layout planning: turn the computed isolation [`Group`]s into the concrete
+//! environment directories a run stages into, which differs by dispatch mechanism.
+//!
+//! - **InSession** keeps a single `iteration-N/env/` that hosts *both* conditions
+//!   (the off-condition skill is removed by `switch-condition`) and the first
+//!   group's fixtures (later groups are swapped in by `reset-batch`). One env, one
+//!   session — byte-identical to the pre-grouping layout in the single-group case.
+//! - **Cli** materializes one `iteration-N/env-<group>-<condition>/` per
+//!   `(group, condition)`: each subprocess `cd`s into its own env, which holds only
+//!   that condition's skill (or none) and that group's fixtures — real physical
+//!   isolation along both axes.
+
+use std::path::{Path, PathBuf};
+
+use crate::core::DispatchMechanism;
+
+use super::super::grouping::Group;
+
+/// One environment directory to stage for a run.
+pub(super) struct EnvTarget {
+    pub root: PathBuf,
+    /// `(condition name, that condition's skill path)` staged into this env.
+    /// InSession stages both conditions here; Cli stages exactly one.
+    pub conditions: Vec<(&'static str, Option<String>)>,
+    /// Eval ids whose fixtures populate this env (its group's evals).
+    pub eval_ids: Vec<String>,
+}
+
+/// Inputs to [`env_targets`].
+pub(super) struct EnvLayoutInput<'a> {
+    pub iteration_dir: &'a Path,
+    pub mechanism: DispatchMechanism,
+    pub groups: &'a [Group],
+    pub cond_a: &'static str,
+    pub cond_b: &'static str,
+    pub skill_path_a: Option<&'a str>,
+    pub skill_path_b: Option<&'a str>,
+}
+
+/// The env dir a `(group, condition)` task runs in: the shared `env/` for
+/// InSession, or the per-`(group, condition)` env for Cli.
+pub(super) fn task_env_root(
+    iteration_dir: &Path,
+    mechanism: DispatchMechanism,
+    group_id: &str,
+    condition: &str,
+) -> PathBuf {
+    match mechanism {
+        DispatchMechanism::InSession => iteration_dir.join("env"),
+        DispatchMechanism::Cli => iteration_dir.join(format!("env-{group_id}-{condition}")),
+    }
+}
+
+/// Plan the environments to stage. InSession returns a single env hosting both
+/// conditions and the *first* group's fixtures; Cli returns one env per
+/// `(group, condition)`.
+pub(super) fn env_targets(input: &EnvLayoutInput) -> Vec<EnvTarget> {
+    let conds: [(&'static str, Option<String>); 2] = [
+        (input.cond_a, input.skill_path_a.map(str::to_owned)),
+        (input.cond_b, input.skill_path_b.map(str::to_owned)),
+    ];
+    match input.mechanism {
+        DispatchMechanism::InSession => {
+            // One env, staged for the first group; reset-batch swaps later groups in.
+            let first = input
+                .groups
+                .first()
+                .expect("at least one group is always computed");
+            vec![EnvTarget {
+                root: task_env_root(
+                    input.iteration_dir,
+                    input.mechanism,
+                    &first.id,
+                    input.cond_a,
+                ),
+                conditions: conds.to_vec(),
+                eval_ids: first.eval_ids.clone(),
+            }]
+        }
+        DispatchMechanism::Cli => input
+            .groups
+            .iter()
+            .flat_map(|g| {
+                conds
+                    .clone()
+                    .into_iter()
+                    .map(move |(cond, skill)| EnvTarget {
+                        root: task_env_root(input.iteration_dir, input.mechanism, &g.id, cond),
+                        conditions: vec![(cond, skill)],
+                        eval_ids: g.eval_ids.clone(),
+                    })
+            })
+            .collect(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn groups() -> Vec<Group> {
+        vec![
+            Group {
+                id: "g1".into(),
+                eval_ids: vec!["e1".into()],
+                rationale: "default".into(),
+            },
+            Group {
+                id: "g2".into(),
+                eval_ids: vec!["e2".into()],
+                rationale: "fixture-conflict: e2 vs e1 at c.json".into(),
+            },
+        ]
+    }
+
+    #[test]
+    fn insession_single_env_hosts_both_conditions_and_first_group() {
+        let iter = Path::new("/w/iteration-1");
+        let gs = groups();
+        let targets = env_targets(&EnvLayoutInput {
+            iteration_dir: iter,
+            mechanism: DispatchMechanism::InSession,
+            groups: &gs,
+            cond_a: "with_skill",
+            cond_b: "without_skill",
+            skill_path_a: Some("/s/SKILL.md"),
+            skill_path_b: None,
+        });
+        assert_eq!(targets.len(), 1);
+        assert_eq!(targets[0].root, Path::new("/w/iteration-1/env"));
+        assert_eq!(
+            targets[0]
+                .conditions
+                .iter()
+                .map(|(c, _)| *c)
+                .collect::<Vec<_>>(),
+            vec!["with_skill", "without_skill"]
+        );
+        // Only the first group's fixtures populate the env up front.
+        assert_eq!(targets[0].eval_ids, vec!["e1"]);
+    }
+
+    #[test]
+    fn cli_one_env_per_group_condition_with_only_that_conditions_skill() {
+        let iter = Path::new("/w/iteration-1");
+        let gs = groups();
+        let targets = env_targets(&EnvLayoutInput {
+            iteration_dir: iter,
+            mechanism: DispatchMechanism::Cli,
+            groups: &gs,
+            cond_a: "with_skill",
+            cond_b: "without_skill",
+            skill_path_a: Some("/s/SKILL.md"),
+            skill_path_b: None,
+        });
+        assert_eq!(targets.len(), 4, "2 groups × 2 conditions");
+        let roots: Vec<String> = targets
+            .iter()
+            .map(|t| t.root.to_string_lossy().into_owned())
+            .collect();
+        assert_eq!(
+            roots,
+            vec![
+                "/w/iteration-1/env-g1-with_skill",
+                "/w/iteration-1/env-g1-without_skill",
+                "/w/iteration-1/env-g2-with_skill",
+                "/w/iteration-1/env-g2-without_skill",
+            ]
+        );
+        // The with_skill env carries the skill; the control arm's env carries none.
+        let with = &targets[0];
+        assert_eq!(
+            with.conditions,
+            vec![("with_skill", Some("/s/SKILL.md".to_string()))]
+        );
+        let without = &targets[1];
+        assert_eq!(without.conditions, vec![("without_skill", None)]);
+        // Each env only holds its group's evals.
+        assert_eq!(targets[0].eval_ids, vec!["e1"]);
+        assert_eq!(targets[2].eval_ids, vec!["e2"]);
+    }
+
+    #[test]
+    fn task_env_root_is_bare_env_for_insession_and_suffixed_for_cli() {
+        let iter = Path::new("/w/iteration-1");
+        assert_eq!(
+            task_env_root(iter, DispatchMechanism::InSession, "g2", "without_skill"),
+            Path::new("/w/iteration-1/env")
+        );
+        assert_eq!(
+            task_env_root(iter, DispatchMechanism::Cli, "g2", "without_skill"),
+            Path::new("/w/iteration-1/env-g2-without_skill")
+        );
+    }
+}
diff --git a/src/cli/run/orchestrate/mod.rs b/src/cli/run/orchestrate/mod.rs
index dbe9a33..6130f14 100644
--- a/src/cli/run/orchestrate/mod.rs
+++ b/src/cli/run/orchestrate/mod.rs
@@ -15,12 +15,13 @@ use std::path::PathBuf;
 
 use crate::adapters::{CliDispatchContext, adapter_for};
 use crate::cli::command_target_args;
-use crate::core::{AvailableSkill, DispatchMechanism, Eval, Mode, RunContext, mechanism_for};
+use crate::core::{DispatchMechanism, Eval, Mode, RunContext};
 
 use super::RunError;
-use super::util::mode_str;
+use super::util::{insession_isolated_handoff, mode_str};
 
 mod build;
+mod envs;
 mod resolve;
 mod stage;
 
@@ -64,6 +65,10 @@ struct Resolved {
     skill_path_b: Option<String>,
     selected_evals: Vec<Eval>,
     total_evals: usize,
+    /// Isolation groups computed from the selected evals' fixtures + hints, in
+    /// config order. Always at least one group (`g1`); a single group is the
+    /// common no-conflict case.
+    groups: Vec<super::grouping::Group>,
 }
 
 /// The product of [`stage::stage_conditions`]: the staged slugs plus the
@@ -71,23 +76,31 @@ struct Resolved {
 struct Staged {
     cond_a_slug: Option<String>,
     cond_b_slug: Option<String>,
-    sibling_skills: Vec<AvailableSkill>,
+    /// Sibling skills' `(name, description)` — env-independent. `build` resolves
+    /// the on-disk path per env (Cli stages a separate env per (group, condition)).
+    sibling_meta: Vec<(String, String)>,
     bootstrap_content: Option<String>,
     plan_mode_content: Option<String>,
-    /// Whether the harness skills dir existed when `run` started — i.e. before this run staged
-    /// anything. Drives the Claude Code staged-skill discovery warning: an existing dir is already
-    /// watched, so live change detection surfaces the staged skills; a dir this run had to create
-    /// isn't watched until the session re-scans. See [`super::util::staging_discovery_warning`].
-    skills_dir_preexisted: bool,
 }
 
 /// Build the iteration workspace and dispatch plan for a run.
 pub fn command_run(ctx: &RunContext, opts: &RunOptions) -> Result<(), RunError> {
     let resolved = resolve::resolve_request(ctx, opts)?;
+
+    // Redirect staging into the isolated env dir. `resolve_request` has now
+    // computed `iteration_dir`; `env/` becomes the agent-under-test's cwd and the
+    // staging root, so the existing root-parameterized staging path follows it
+    // (every `skills_dir_for_harness(&ctx.stage_root, …)` site). eval-magic meta
+    // stays above the env in `iteration_dir`. Only `run` overrides the cwd default
+    // set in `detect_run_context`; teardown/finalize keep operating at cwd.
+    let mut owned_ctx = ctx.clone();
+    owned_ctx.stage_root = resolved.iteration_dir.join("env");
+    let ctx = &owned_ctx;
+
     print_run_plan(ctx, opts, &resolved);
     let staged = stage::stage_conditions(ctx, opts, &resolved)?;
     let num_tasks = build::write_dispatch(ctx, opts, &resolved, &staged)?;
-    build::post_build(ctx, opts, &resolved, &staged)?;
+    build::post_build(ctx, opts, &resolved)?;
     print_next_steps(ctx, opts, &resolved, num_tasks);
     Ok(())
 }
@@ -142,6 +155,17 @@ fn print_next_steps(ctx: &RunContext, opts: &RunOptions, r: &Resolved, num_tasks
         "Dispatch tasks:     {}",
         r.iteration_dir.join("dispatch.json").display()
     );
+
+    match ctx.run_mode.mechanism() {
+        DispatchMechanism::InSession => println!(
+            "Runbook:            {} — start a fresh session in env/ and \"Read and follow RUNBOOK.md\".",
+            ctx.stage_root.join("RUNBOOK.md").display()
+        ),
+        DispatchMechanism::Cli => println!(
+            "Runbook:            {} — a human-followed copy of the steps below.",
+            r.iteration_dir.join("RUNBOOK.md").display()
+        ),
+    }
     let run_counts: Vec<u32> = r
         .selected_evals
         .iter()
@@ -173,11 +197,14 @@ fn print_next_steps(ctx: &RunContext, opts: &RunOptions, r: &Resolved, num_tasks
         return;
     }
     let target_args = command_target_args(ctx);
-    match mechanism_for(ctx.harness) {
-        // In-session subagent dispatch (Claude Code's Task tool today).
-        DispatchMechanism::InSession => println!(
-            "\nNext: iterate the tasks[] array in dispatch.json and dispatch each task as a subagent, passing its `agent_description` verbatim as the subagent description (that string is the key that links each transcript back — without it tool calls, tokens, and duration come back empty). Then run:\n  eval-magic ingest{target_args} --iteration {iteration}\n(ingest auto-resolves the subagents dir from CLAUDE_CODE_SESSION_ID; outside that session, add --session-id <id> or --subagents-dir <path>.)"
-        ),
+    match ctx.run_mode.mechanism() {
+        // In-session subagent dispatch (Claude Code's Task tool today). The env is
+        // built before the isolated session starts, so the summary just hands off:
+        // cd into env/, start a fresh session, "Read and follow RUNBOOK.md" — which
+        // carries the full dispatch → switch-condition → ingest → finalize loop.
+        DispatchMechanism::InSession => {
+            println!("\nNext: {}", insession_isolated_handoff(&ctx.stage_root))
+        }
         // One-shot CLI dispatch; the exact command is harness-specific.
         DispatchMechanism::Cli => println!(
             "{}",
diff --git a/src/cli/run/orchestrate/resolve.rs b/src/cli/run/orchestrate/resolve.rs
index 67101af..48fd403 100644
--- a/src/cli/run/orchestrate/resolve.rs
+++ b/src/cli/run/orchestrate/resolve.rs
@@ -11,6 +11,8 @@ use crate::validation::validate_evals_config;
 
 use super::super::RunError;
 use super::super::dispatch::select_evals;
+use super::super::fixtures::fixture_pairs;
+use super::super::grouping::{GroupInput, compute_groups};
 use super::super::util::{
     condition_names_for, make_run_nonce, next_iteration, validate_harness_run_options,
 };
@@ -58,6 +60,24 @@ pub(super) fn resolve_request(ctx: &RunContext, opts: &RunOptions) -> Result<Res
     let selected_evals = select_evals(&config.evals, opts.only, opts.skip)?;
     let total_evals = config.evals.len();
 
+    // Compute isolation groups up front (fixture-conflict + explicit hint), before
+    // any env is staged: staging and dispatch both consume this plan. `fixture_pairs`
+    // also fails fast here if a declared fixture is missing.
+    let fixture_pairs_by_eval = selected_evals
+        .iter()
+        .map(|ev| fixture_pairs(ev, &ctx.skill_subdir))
+        .collect::<Result<Vec<_>, _>>()?;
+    let group_inputs: Vec<GroupInput> = selected_evals
+        .iter()
+        .zip(&fixture_pairs_by_eval)
+        .map(|(ev, fixtures)| GroupInput {
+            eval_id: &ev.id,
+            isolation: ev.isolation,
+            fixtures,
+        })
+        .collect();
+    let groups = compute_groups(&group_inputs);
+
     let workspace_skill_dir = ctx.workspace_root.join(&ctx.skill_name);
     let iteration = next_iteration(&workspace_skill_dir, opts.iteration);
     let iteration_dir = workspace_skill_dir.join(format!("iteration-{iteration}"));
@@ -108,5 +128,6 @@ pub(super) fn resolve_request(ctx: &RunContext, opts: &RunOptions) -> Result<Res
         skill_path_b,
         selected_evals,
         total_evals,
+        groups,
     })
 }
diff --git a/src/cli/run/orchestrate/stage.rs b/src/cli/run/orchestrate/stage.rs
index 18cd68b..99dfd4e 100644
--- a/src/cli/run/orchestrate/stage.rs
+++ b/src/cli/run/orchestrate/stage.rs
@@ -4,16 +4,18 @@
 use std::fs;
 use std::path::Path;
 
-use crate::core::{AvailableSkill, RunContext};
+use crate::core::RunContext;
 use crate::sandbox::teardown_guard;
 
 use super::super::RunError;
 use super::super::dispatch::get_skill_description;
+use super::super::fixtures::{FixtureClaims, copy_fixtures};
 use super::super::staging::{
     StageSiblingOpts, StageSkillOpts, cleanup_staged_skills, register_staged_skill_for_cleanup,
     skills_dir_for_harness, stage_sibling_skills, stage_skill_for_harness,
 };
-use super::super::util::{harness_label, resolve_plan_mode_profile, staging_discovery_warning};
+use super::super::util::{harness_label, resolve_plan_mode_profile};
+use super::envs::{EnvLayoutInput, env_targets};
 use super::{Resolved, RunOptions, Staged};
 
 pub(super) fn stage_conditions(
@@ -24,35 +26,6 @@ pub(super) fn stage_conditions(
     fs::create_dir_all(&r.iteration_dir)?;
     fs::copy(&r.skill_md_path, r.iteration_dir.join("skill-snapshot.md"))?;
 
-    // Capture whether the harness skills dir already existed BEFORE this run touches anything:
-    // cleanup may prune an empty dir and sibling/skill staging below create it, so reading
-    // `.exists()` later would always be true. Claude Code only watches skill dirs that existed at
-    // session start, so this is the signal for whether the staged skills are discoverable
-    // in-session. See `staging_discovery_warning`.
-    let skills_dir_preexisted = skills_dir_for_harness(&ctx.stage_root, ctx.harness).exists();
-
-    // Always disarm a prior run's guard before re-staging, so a crashed run can't
-    // leave the write-blocking hook armed across runs.
-    teardown_guard(&ctx.stage_root);
-
-    if !opts.no_stage {
-        cleanup_staged_skills(&ctx.stage_root, ctx.harness)?;
-        if ctx.stage_siblings {
-            stage_sibling_skills(&StageSiblingOpts {
-                skill_under_test: &ctx.skill_name,
-                skills_source_dir: &ctx.skill_dir,
-                repo_root: &ctx.stage_root,
-                harness: ctx.harness,
-            })?;
-        }
-    }
-
-    if let Some(warning) =
-        staging_discovery_warning(ctx.harness, opts.no_stage, skills_dir_preexisted)
-    {
-        eprintln!("{warning}");
-    }
-
     let bootstrap_content = match &ctx.bootstrap_path {
         Some(path) => Some(fs::read_to_string(path)?),
         None => None,
@@ -69,80 +42,149 @@ pub(super) fn stage_conditions(
         None
     };
 
-    // Sibling skill metadata, shared across conditions. Empty when --no-stage.
-    let sibling_skills: Vec<AvailableSkill> = if opts.no_stage {
+    // Sibling skill `(name, description)`, env-independent. `build` resolves each
+    // path per env. Empty when --no-stage.
+    let sibling_meta: Vec<(String, String)> = if opts.no_stage {
         Vec::new()
     } else {
         ctx.sibling_skill_names
             .iter()
-            .map(|name| AvailableSkill {
-                name: name.clone(),
-                path: skills_dir_for_harness(&ctx.stage_root, ctx.harness)
-                    .join(name)
-                    .join("SKILL.md")
-                    .to_string_lossy()
-                    .into_owned(),
-                description: get_skill_description(&ctx.skill_dir.join(name).join("SKILL.md")),
+            .map(|name| {
+                (
+                    name.clone(),
+                    get_skill_description(&ctx.skill_dir.join(name).join("SKILL.md")),
+                )
             })
             .collect()
     };
 
     // --stage-name overrides the conspicuous slug with a verbatim name; it targets
-    // the single staging condition, so reject the both-stage case and refuse to
-    // clobber a pre-existing dir.
-    if let Some(stage_name) = opts.stage_name
+    // the single staging condition, so reject the both-stage case up front.
+    if let Some(_stage_name) = opts.stage_name
         && !opts.no_stage
+        && r.skill_path_a.is_some()
+        && r.skill_path_b.is_some()
     {
-        if r.skill_path_a.is_some() && r.skill_path_b.is_some() {
-            return Err(RunError::msg(
-                "--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
-            ));
+        return Err(RunError::msg(
+            "--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
+        ));
+    }
+
+    // The environments to stage: one shared `env/` for in-session (hosting both
+    // conditions + the first group's fixtures), or one per (group, condition) for
+    // Cli (each with only its condition's skill + its group's fixtures).
+    let targets = env_targets(&EnvLayoutInput {
+        iteration_dir: &r.iteration_dir,
+        mechanism: ctx.run_mode.mechanism(),
+        groups: &r.groups,
+        cond_a: r.cond_a,
+        cond_b: r.cond_b,
+        skill_path_a: r.skill_path_a.as_deref(),
+        skill_path_b: r.skill_path_b.as_deref(),
+    });
+
+    let mut cond_a_slug = None;
+    let mut cond_b_slug = None;
+
+    for target in &targets {
+        // Disarm a prior run's guard before re-staging, so a crashed run can't leave
+        // the write-blocking hook armed across runs. The env root is created unconditionally —
+        // even under --no-stage, fixtures (and the in-session RUNBOOK) still land here.
+        teardown_guard(&target.root);
+        fs::create_dir_all(&target.root)?;
+
+        if !opts.no_stage {
+            cleanup_staged_skills(&target.root, ctx.harness)?;
+            if ctx.stage_siblings {
+                stage_sibling_skills(&StageSiblingOpts {
+                    skill_under_test: &ctx.skill_name,
+                    skills_source_dir: &ctx.skill_dir,
+                    repo_root: &target.root,
+                    harness: ctx.harness,
+                })?;
+            }
         }
-        let target = skills_dir_for_harness(&ctx.stage_root, ctx.harness).join(stage_name);
-        if target.exists() {
-            return Err(RunError::msg(format!(
-                "--stage-name \"{stage_name}\": {} already exists; refusing to clobber it. Remove it or choose a different name.",
-                target.display()
-            )));
+
+        for (cond_name, cond_skill_path) in &target.conditions {
+            // Refuse to clobber a pre-existing --stage-name dir in this env.
+            if let Some(stage_name) = opts.stage_name
+                && !opts.no_stage
+                && cond_skill_path.is_some()
+            {
+                let dir = skills_dir_for_harness(&target.root, ctx.harness).join(stage_name);
+                if dir.exists() {
+                    return Err(RunError::msg(format!(
+                        "--stage-name \"{stage_name}\": {} already exists; refusing to clobber it. Remove it or choose a different name.",
+                        dir.display()
+                    )));
+                }
+            }
+
+            if let Some(slug) = stage_for(
+                ctx,
+                opts,
+                r,
+                cond_name,
+                cond_skill_path.as_deref(),
+                &target.root,
+            )? {
+                if *cond_name == r.cond_a {
+                    cond_a_slug = Some(slug.clone());
+                }
+                if *cond_name == r.cond_b {
+                    cond_b_slug = Some(slug.clone());
+                }
+                // A custom-named dir isn't caught by the prefix scan; record it in
+                // this env's manifest so cleanup removes it.
+                if opts.stage_name == Some(slug.as_str()) {
+                    register_staged_skill_for_cleanup(&target.root, &slug, ctx.harness)?;
+                }
+            }
         }
-    }
 
-    let stage_for =
-        |cond_name: &str, cond_skill_path: Option<&str>| -> Result<Option<String>, RunError> {
-            let Some(path) = cond_skill_path.filter(|_| !opts.no_stage) else {
-                return Ok(None);
-            };
-            let content = fs::read_to_string(path)?;
-            let slug = stage_skill_for_harness(&StageSkillOpts {
-                content: &content,
-                iteration: r.iteration,
-                condition: cond_name,
-                skill_name: &ctx.skill_name,
-                repo_root: &ctx.stage_root,
-                assets_dir: Path::new(path).parent(),
-                stage_name_override: opts.stage_name,
-                harness: ctx.harness,
-            })?;
-            Ok(Some(slug))
-        };
-
-    let cond_a_slug = stage_for(r.cond_a, r.skill_path_a.as_deref())?;
-    let cond_b_slug = stage_for(r.cond_b, r.skill_path_b.as_deref())?;
-
-    // A custom-named dir isn't caught by the prefix scan; record it for cleanup.
-    if let Some(stage_name) = opts.stage_name
-        && (cond_a_slug.as_deref() == Some(stage_name)
-            || cond_b_slug.as_deref() == Some(stage_name))
-    {
-        register_staged_skill_for_cleanup(&ctx.stage_root, stage_name, ctx.harness)?;
+        // Copy this env's group's fixtures. Claims are per env (each env is
+        // independent); grouping has already routed clobbering evals into separate
+        // groups, so within one env the same-source/idempotent rule never trips.
+        let mut claims = FixtureClaims::new();
+        for eval_id in &target.eval_ids {
+            if let Some(ev) = r.selected_evals.iter().find(|e| &e.id == eval_id) {
+                copy_fixtures(ev, &ctx.skill_subdir, &target.root, &mut claims)?;
+            }
+        }
     }
 
     Ok(Staged {
         cond_a_slug,
         cond_b_slug,
-        sibling_skills,
+        sibling_meta,
         bootstrap_content,
         plan_mode_content,
-        skills_dir_preexisted,
     })
 }
+
+/// Stage one condition's skill into `root` and return its slug; `Ok(None)` when
+/// the condition stages no skill (the new-skill control arm) or under --no-stage.
+fn stage_for(
+    ctx: &RunContext,
+    opts: &RunOptions,
+    r: &Resolved,
+    cond_name: &str,
+    cond_skill_path: Option<&str>,
+    root: &Path,
+) -> Result<Option<String>, RunError> {
+    let Some(path) = cond_skill_path.filter(|_| !opts.no_stage) else {
+        return Ok(None);
+    };
+    let content = fs::read_to_string(path)?;
+    let slug = stage_skill_for_harness(&StageSkillOpts {
+        content: &content,
+        iteration: r.iteration,
+        condition: cond_name,
+        skill_name: &ctx.skill_name,
+        repo_root: root,
+        assets_dir: Path::new(path).parent(),
+        stage_name_override: opts.stage_name,
+        harness: ctx.harness,
+    })?;
+    Ok(Some(slug))
+}
diff --git a/src/cli/run/runbook.rs b/src/cli/run/runbook.rs
new file mode 100644
index 0000000..fb76b90
--- /dev/null
+++ b/src/cli/run/runbook.rs
@@ -0,0 +1,449 @@
+//! `RUNBOOK.md` generation — the followable handoff artifact written into an
+//! iteration directory during `run`.
+//!
+//! The runbook turns the prep session's "what to do next" guidance into a file
+//! a *fresh, isolated* session (or a human at a terminal) can read end-to-end:
+//! "Read and follow RUNBOOK.md". Which template is used is keyed on the run mode's
+//! [`DispatchMechanism`](crate::core::DispatchMechanism), not the harness:
+//!
+//! - `InSession` (interactive) → the harness's interactive, agent-followed template.
+//! - `Cli` (hybrid / headless) → the shared headless, human-followed template —
+//!   including Claude Code under `--run-mode hybrid`.
+//!
+//! The per-mode prose skeletons live in `profiles/` (checked in, loaded via
+//! [`HarnessAdapter::runbook_template`](crate::adapters::HarnessAdapter::runbook_template))
+//! and carry `{{TOKEN}}` placeholders the renderer fills with run-specific values.
+//! The generated `RUNBOOK.md` itself is a workspace artifact and is not version
+//! controlled.
+
+use std::path::Path;
+
+use crate::adapters::{
+    CliDispatchContext, CliJudgeContext, HEADLESS_RUNBOOK_TEMPLATE, adapter_for,
+};
+use crate::core::{DispatchMechanism, Harness, Mode, RunMode};
+
+use super::util::{
+    harness_label, insession_dispatch_batch, insession_dispatch_segment, insession_ingest_command,
+    insession_reset_batch_command, insession_switch_command, mode_str,
+};
+
+/// Run-specific values the renderer substitutes into a runbook template. Built by
+/// the orchestrator from the resolved run; kept as primitives so the renderer is
+/// decoupled from the orchestrator's private `Resolved`/`RunContext` types and is
+/// unit-testable on its own.
+pub(crate) struct RunbookContext<'a> {
+    pub harness: Harness,
+    pub run_mode: RunMode,
+    pub skill_name: &'a str,
+    pub iteration: u32,
+    pub iteration_dir: &'a Path,
+    pub mode: Mode,
+    pub cond_a: &'a str,
+    pub cond_b: &'a str,
+    pub num_tasks: usize,
+    /// Isolation-group ids in order. Zero or one entry → the legacy single-batch
+    /// dispatch; more → per-group batches with `reset-batch` barriers (in-session).
+    pub groups: &'a [String],
+    /// The self-sufficient `--skill-dir … --skill …` selector (leading space),
+    /// from [`command_target_args`](crate::cli::command_target_args).
+    pub target_args: &'a str,
+    pub guard: bool,
+    pub agent_model: Option<&'a str>,
+}
+
+/// The per-condition dispatch block for the interactive runbook. Zero or one group
+/// renders the legacy single-batch instruction (byte-identical to the pre-grouping
+/// runbook). Multiple groups render each group's batch with a `reset-batch` barrier
+/// between them; `first_condition` suppresses the reset before the very first group
+/// (condition A starts from the env already staged with group 1, while condition B
+/// must restore group 1 after A's last group mutated the env).
+fn insession_dispatch_block(
+    condition: &str,
+    groups: &[String],
+    target_args: &str,
+    iteration: u32,
+    first_condition: bool,
+) -> String {
+    if groups.len() <= 1 {
+        return insession_dispatch_batch(condition);
+    }
+    let mut parts: Vec<String> = Vec::new();
+    for (i, group) in groups.iter().enumerate() {
+        if !(first_condition && i == 0) {
+            parts.push(format!(
+                "Reset the env to group `{group}` (wait for the previous batch to finish first):\n\n```\n{}\n```",
+                insession_reset_batch_command(target_args, iteration, group)
+            ));
+        }
+        parts.push(format!(
+            "Dispatch group `{group}`: {}",
+            insession_dispatch_segment(condition, group)
+        ));
+    }
+    parts.join("\n\n")
+}
+
+/// Render `RUNBOOK.md` for a run: pick the template by dispatch mechanism (the harness's
+/// interactive one vs. the shared headless one) and fill its `{{TOKEN}}` placeholders with run-specific values.
+pub(crate) fn build_runbook(ctx: &RunbookContext) -> String {
+    let adapter = adapter_for(ctx.harness);
+    // The runbook template is mechanism-keyed, not harness-keyed: an in-session
+    // run uses the harness's interactive (agent-followed) template; every Cli run
+    // uses the shared headless (human-followed) one — including Claude Code in
+    // hybrid, whose `runbook_template()` is the interactive variant.
+    let template = match ctx.run_mode.mechanism() {
+        DispatchMechanism::InSession => adapter.runbook_template(),
+        DispatchMechanism::Cli => HEADLESS_RUNBOOK_TEMPLATE,
+    };
+
+    let iteration = ctx.iteration.to_string();
+    let num_tasks = ctx.num_tasks.to_string();
+    let dispatch_json = ctx
+        .iteration_dir
+        .join("dispatch.json")
+        .display()
+        .to_string();
+    let benchmark_path = ctx
+        .iteration_dir
+        .join("benchmark.json")
+        .display()
+        .to_string();
+
+    // Shared identity tokens, present in both templates.
+    let mut vars: Vec<(&str, &str)> = vec![
+        ("SKILL_NAME", ctx.skill_name),
+        ("ITERATION", &iteration),
+        ("MODE", mode_str(ctx.mode)),
+        ("COND_A", ctx.cond_a),
+        ("COND_B", ctx.cond_b),
+        ("NUM_TASKS", &num_tasks),
+        ("DISPATCH_JSON", &dispatch_json),
+        ("BENCHMARK_PATH", &benchmark_path),
+    ];
+
+    // Mechanism-specific tokens. Owners outlive the `render` call below.
+    let (dispatch_cond_a, dispatch_cond_b, switch_cmd, ingest_cmd);
+    let (dispatch_recipe, judge_recipe, finalize_cmd, teardown_cmd);
+    match ctx.run_mode.mechanism() {
+        // Interactive: an agent dispatches in-session subagents one condition batch
+        // at a time, runs `switch-condition` between them, then runs the rest of the
+        // loop itself. Built from the same fragments as the post-`run` "Next:"
+        // message so the two can never drift on the dispatch / switch / ingest text.
+        DispatchMechanism::InSession => {
+            dispatch_cond_a = insession_dispatch_block(
+                ctx.cond_a,
+                ctx.groups,
+                ctx.target_args,
+                ctx.iteration,
+                true,
+            );
+            dispatch_cond_b = insession_dispatch_block(
+                ctx.cond_b,
+                ctx.groups,
+                ctx.target_args,
+                ctx.iteration,
+                false,
+            );
+            switch_cmd = insession_switch_command(ctx.target_args, ctx.iteration, ctx.cond_b);
+            ingest_cmd = insession_ingest_command(ctx.target_args, ctx.iteration);
+            finalize_cmd = format!(
+                "eval-magic finalize{} --iteration {}",
+                ctx.target_args, ctx.iteration
+            );
+            teardown_cmd = format!("eval-magic teardown{}", ctx.target_args);
+            vars.push(("DISPATCH_COND_A", &dispatch_cond_a));
+            vars.push(("DISPATCH_COND_B", &dispatch_cond_b));
+            vars.push(("SWITCH_CMD", &switch_cmd));
+            vars.push(("INGEST_CMD", &ingest_cmd));
+            vars.push(("FINALIZE_CMD", &finalize_cmd));
+            vars.push(("TEARDOWN_CMD", &teardown_cmd));
+        }
+        // Headless: a human pastes commands. The harness-specific dispatch +
+        // judge recipes come from the adapter's existing CLI generators, so the
+        // runbook stays in lockstep with `dispatch-manifest.md` and the printed
+        // next steps; pipeline commands carry `--harness`.
+        DispatchMechanism::Cli => {
+            let label = harness_label(ctx.harness);
+            dispatch_recipe = adapter.cli_next_steps(CliDispatchContext {
+                guard: ctx.guard,
+                target_args: ctx.target_args,
+                iteration: ctx.iteration,
+                agent_model: ctx.agent_model,
+            });
+            judge_recipe = adapter
+                .cli_judge_next_steps(CliJudgeContext { guard: ctx.guard })
+                .unwrap_or_else(|| {
+                    "Dispatch each judge task `ingest` listed through the same harness CLI, \
+                     capturing its transcript output, then finalize."
+                        .to_string()
+                });
+            finalize_cmd = format!(
+                "eval-magic finalize{} --iteration {} --harness {label}",
+                ctx.target_args, ctx.iteration
+            );
+            teardown_cmd = format!("eval-magic teardown{} --harness {label}", ctx.target_args);
+            vars.push(("HARNESS", label));
+            vars.push(("DISPATCH_RECIPE", &dispatch_recipe));
+            vars.push(("JUDGE_RECIPE", &judge_recipe));
+            vars.push(("FINALIZE_CMD", &finalize_cmd));
+            vars.push(("TEARDOWN_CMD", &teardown_cmd));
+        }
+    }
+
+    render(template, &vars)
+}
+
+/// Substitute `{{KEY}}` placeholders in `template` with their values.
+///
+/// Each `(key, value)` replaces every `{{key}}` occurrence. Keys are matched
+/// verbatim (the braces are added here), so callers pass `"SKILL_NAME"`, not
+/// `"{{SKILL_NAME}}"`. Replacement is a single left-to-right scan of the
+/// template, so a value that itself contains `{{...}}` is never re-expanded.
+fn render(template: &str, vars: &[(&str, &str)]) -> String {
+    let mut out = String::with_capacity(template.len());
+    let mut rest = template;
+    // Single left-to-right pass: only the original template is scanned, so a
+    // substituted value that itself contains `{{...}}` is emitted verbatim and
+    // never re-expanded (order-independent). Unknown / unterminated tokens are
+    // left as-is.
+    while let Some(start) = rest.find("{{") {
+        out.push_str(&rest[..start]);
+        let after = &rest[start + 2..];
+        let Some(end) = after.find("}}") else {
+            out.push_str("{{");
+            rest = after;
+            continue;
+        };
+        let key = &after[..end];
+        match vars.iter().find(|(k, _)| *k == key) {
+            Some((_, value)) => out.push_str(value),
+            None => {
+                out.push_str("{{");
+                out.push_str(key);
+                out.push_str("}}");
+            }
+        }
+        rest = &after[end + 2..];
+    }
+    out.push_str(rest);
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::path::PathBuf;
+
+    fn claude_ctx(dir: &Path) -> RunbookContext<'_> {
+        RunbookContext {
+            harness: Harness::ClaudeCode,
+            run_mode: RunMode::Interactive,
+            skill_name: "widget-skill",
+            iteration: 5,
+            iteration_dir: dir,
+            mode: Mode::NewSkill,
+            cond_a: "with_skill",
+            cond_b: "without_skill",
+            num_tasks: 4,
+            groups: &[],
+            target_args: " --skill-dir /tmp/skills --skill widget-skill",
+            guard: true,
+            agent_model: None,
+        }
+    }
+
+    #[test]
+    fn interactive_runbook_carries_run_specifics_and_full_loop() {
+        let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-5");
+        let book = build_runbook(&claude_ctx(&dir));
+
+        // Run-specific identity.
+        assert!(book.contains("widget-skill"), "names the skill: {book}");
+        assert!(book.contains("iteration 5"), "names the iteration: {book}");
+        assert!(
+            book.contains("with_skill") && book.contains("without_skill"),
+            "names both conditions: {book}"
+        );
+        assert!(book.contains("new-skill"), "names the mode: {book}");
+
+        // The dispatch step reuses the in-session guidance (agent_description is
+        // the transcript-linking key).
+        assert!(
+            book.contains("agent_description"),
+            "carries the dispatch-loop guidance: {book}"
+        );
+
+        // The per-condition batch loop: each condition dispatched as its own batch,
+        // with a `switch-condition` barrier (naming the kept condition) between them.
+        assert!(
+            book.contains("`condition` is `with_skill`")
+                && book.contains("`condition` is `without_skill`"),
+            "dispatches each condition as its own batch: {book}"
+        );
+        assert!(
+            book.contains(
+                "eval-magic switch-condition --skill-dir /tmp/skills --skill widget-skill --iteration 5 --condition without_skill"
+            ),
+            "carries the switch-condition barrier command: {book}"
+        );
+
+        // The full single-session loop: ingest → finalize → teardown, each a
+        // copy-pasteable command threaded with the target selector + iteration.
+        assert!(
+            book.contains(
+                "eval-magic ingest --skill-dir /tmp/skills --skill widget-skill --iteration 5"
+            ),
+            "carries the ingest command: {book}"
+        );
+        assert!(
+            book.contains(
+                "eval-magic finalize --skill-dir /tmp/skills --skill widget-skill --iteration 5"
+            ),
+            "carries the finalize command: {book}"
+        );
+        assert!(
+            book.contains("eval-magic teardown --skill-dir /tmp/skills --skill widget-skill"),
+            "carries the teardown command: {book}"
+        );
+        assert!(
+            book.contains("benchmark.json"),
+            "points at the result: {book}"
+        );
+
+        // No interactive run is dispatched through a harness CLI — that is the
+        // headless path.
+        assert!(
+            !book.contains("codex exec"),
+            "interactive runbook is not a CLI-dispatch recipe: {book}"
+        );
+        // Every template token must be filled.
+        assert!(
+            !book.contains("{{"),
+            "no unsubstituted tokens remain: {book}"
+        );
+    }
+
+    #[test]
+    fn interactive_runbook_with_multiple_groups_carries_reset_batch_barriers() {
+        let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-5");
+        let groups = ["g1".to_string(), "g2".to_string()];
+        let book = build_runbook(&RunbookContext {
+            groups: &groups,
+            ..claude_ctx(&dir)
+        });
+
+        // Each group dispatches as its own segment, filtered by group.
+        assert!(
+            book.contains("`condition` is `with_skill` and `group` is `g1`")
+                && book.contains("`condition` is `with_skill` and `group` is `g2`"),
+            "with_skill dispatches each group separately: {book}"
+        );
+        assert!(
+            book.contains("`condition` is `without_skill` and `group` is `g1`")
+                && book.contains("`condition` is `without_skill` and `group` is `g2`"),
+            "without_skill dispatches each group separately: {book}"
+        );
+        // reset-batch barriers between groups, naming the group to seed.
+        assert!(
+            book.contains(
+                "eval-magic reset-batch --skill-dir /tmp/skills --skill widget-skill --iteration 5 --group g2"
+            ),
+            "carries the reset-batch barrier for g2: {book}"
+        );
+        // The switch-condition barrier is still present, once, between conditions.
+        assert!(
+            book.contains("eval-magic switch-condition")
+                && book.contains("--condition without_skill"),
+            "still carries the switch-condition barrier: {book}"
+        );
+        assert!(!book.contains("{{"), "no unsubstituted tokens: {book}");
+    }
+
+    #[test]
+    fn headless_runbook_is_human_followed_cli_recipe() {
+        let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-2");
+        let ctx = RunbookContext {
+            harness: Harness::Codex,
+            run_mode: RunMode::Hybrid,
+            skill_name: "widget-skill",
+            iteration: 2,
+            iteration_dir: &dir,
+            mode: Mode::Revision,
+            cond_a: "old_skill",
+            cond_b: "new_skill",
+            num_tasks: 6,
+            groups: &[],
+            target_args: " --skill-dir /tmp/skills --skill widget-skill",
+            guard: false,
+            agent_model: Some("gpt-5-mini"),
+        };
+        let book = build_runbook(&ctx);
+
+        // Run-specific identity, including the revision-mode condition names.
+        assert!(book.contains("widget-skill"), "names the skill: {book}");
+        assert!(book.contains("iteration 2"), "names the iteration: {book}");
+        assert!(
+            book.contains("old_skill") && book.contains("new_skill"),
+            "names both conditions: {book}"
+        );
+
+        // Human-followed framing (the shared headless template), not the agent
+        // in-session framing.
+        assert!(
+            book.contains("human driving"),
+            "frames the run for a human at a terminal: {book}"
+        );
+
+        // The CLI dispatch recipe comes from the Codex adapter; pipeline commands
+        // carry --harness codex so they are copy-pasteable.
+        assert!(
+            book.contains("codex --ask-for-approval never exec"),
+            "carries the Codex CLI dispatch recipe: {book}"
+        );
+        assert!(
+            book.contains("eval-magic finalize --skill-dir /tmp/skills --skill widget-skill --iteration 2 --harness codex"),
+            "finalize carries --harness codex: {book}"
+        );
+        assert!(
+            book.contains(
+                "eval-magic teardown --skill-dir /tmp/skills --skill widget-skill --harness codex"
+            ),
+            "teardown carries --harness codex: {book}"
+        );
+        assert!(
+            book.contains("benchmark.json"),
+            "points at the result: {book}"
+        );
+        assert!(
+            !book.contains("{{"),
+            "no unsubstituted tokens remain: {book}"
+        );
+    }
+
+    #[test]
+    fn render_substitutes_each_token_everywhere() {
+        let out = render(
+            "skill {{SKILL_NAME}} iteration {{ITERATION}} — run {{SKILL_NAME}} now",
+            &[("SKILL_NAME", "my-skill"), ("ITERATION", "3")],
+        );
+        assert_eq!(out, "skill my-skill iteration 3 — run my-skill now");
+    }
+
+    #[test]
+    fn render_leaves_unknown_tokens_untouched() {
+        let out = render("{{KNOWN}} {{UNKNOWN}}", &[("KNOWN", "ok")]);
+        assert_eq!(out, "ok {{UNKNOWN}}");
+    }
+
+    #[test]
+    fn render_does_not_re_expand_a_substituted_value() {
+        // A value that happens to contain a token must not be re-expanded: only
+        // the original template is scanned, never the substituted output.
+        let out = render(
+            "{{A}} {{B}}",
+            &[("A", "value-with-{{B}}-inside"), ("B", "second")],
+        );
+        assert_eq!(out, "value-with-{{B}}-inside second");
+    }
+}
diff --git a/src/cli/run/staging/mod.rs b/src/cli/run/staging/mod.rs
index d6bea4b..d15ba80 100644
--- a/src/cli/run/staging/mod.rs
+++ b/src/cli/run/staging/mod.rs
@@ -257,7 +257,7 @@ pub fn stage_skill_for_harness(opts: &StageSkillOpts) -> Result<String, RunError
                 "SKILL.md"
                     | "evals"
                     | SNAPSHOT_META
-                    | "skills-workspace"
+                    | ".eval-magic"
                     | ".claude"
                     | ".agents"
                     | ".codex"
diff --git a/src/cli/run/steps.rs b/src/cli/run/steps.rs
index 34eecdd..532184c 100644
--- a/src/cli/run/steps.rs
+++ b/src/cli/run/steps.rs
@@ -13,7 +13,7 @@
 //! parameter; the production runner — which maps each [`StepKind`] to its stage
 //! handler — lives in [`crate::cli`] alongside those handlers.
 
-use crate::core::{DispatchMechanism, Harness, mechanism_for};
+use crate::core::{DispatchMechanism, Harness, RunMode};
 
 /// Which post-dispatch stage a [`StepCommand`] runs. The production runner
 /// matches on this to call the corresponding handler; tests assert on it.
@@ -37,6 +37,10 @@ pub struct StepCommand {
     pub skill: Option<String>,
     pub iteration: u32,
     pub harness: Harness,
+    /// The run mode, re-derived at each stage so the transcript source matches
+    /// the dispatch mechanism. Round-trips through `CommonArgs` exactly like
+    /// `harness`, so ingest sub-stages don't silently re-default it.
+    pub run_mode: RunMode,
     /// Only the transcript-reading stages (record-runs, fill-transcripts) carry a
     /// subagents dir; the others leave it `None`.
     pub subagents_dir: Option<String>,
@@ -50,6 +54,7 @@ pub struct StepParams<'a> {
     pub skill: Option<&'a str>,
     pub iteration: u32,
     pub harness: Harness,
+    pub run_mode: RunMode,
     pub subagents_dir: Option<&'a str>,
     pub workspace_dir: Option<&'a str>,
 }
@@ -61,6 +66,7 @@ impl Default for StepParams<'_> {
             skill: None,
             iteration: 0,
             harness: Harness::ClaudeCode,
+            run_mode: RunMode::Interactive,
             subagents_dir: None,
             workspace_dir: None,
         }
@@ -81,6 +87,7 @@ impl StepParams<'_> {
             skill: self.skill.map(str::to_string),
             iteration: self.iteration,
             harness: self.harness,
+            run_mode: self.run_mode,
             subagents_dir,
             workspace_dir: self.workspace_dir.map(str::to_string),
         }
@@ -92,7 +99,7 @@ impl StepParams<'_> {
 /// in-session dispatch mechanism (a Cli-dispatch harness reads its transcript
 /// from each task's `outputs/` dir instead).
 pub fn build_ingest_commands(p: &StepParams) -> Vec<StepCommand> {
-    let transcripts = match mechanism_for(p.harness) {
+    let transcripts = match p.run_mode.mechanism() {
         DispatchMechanism::InSession => p.subagents_dir.map(str::to_string),
         DispatchMechanism::Cli => None,
     };
@@ -185,6 +192,7 @@ mod tests {
             skill: Some("mr-review"),
             iteration: 2,
             harness: Harness::Codex,
+            run_mode: RunMode::Hybrid,
             ..Default::default()
         });
         assert_eq!(
@@ -201,6 +209,38 @@ mod tests {
         assert_eq!(steps[1].subagents_dir, None);
     }
 
+    #[test]
+    fn ingest_omits_subagents_for_claude_hybrid() {
+        // Claude Code in hybrid mode dispatches via the CLI, so it reads each
+        // task's events file — not a subagents dir — even though the harness is
+        // ClaudeCode and a subagents dir was passed.
+        let steps = build_ingest_commands(&StepParams {
+            skill_dir: Some("/skills"),
+            skill: Some("mr-review"),
+            iteration: 2,
+            harness: Harness::ClaudeCode,
+            run_mode: RunMode::Hybrid,
+            subagents_dir: Some("/subagents"),
+            ..Default::default()
+        });
+        assert!(steps.iter().all(|s| s.harness == Harness::ClaudeCode));
+        assert!(steps.iter().all(|s| s.run_mode == RunMode::Hybrid));
+        assert_eq!(steps[0].subagents_dir, None);
+        assert_eq!(steps[1].subagents_dir, None);
+    }
+
+    #[test]
+    fn ingest_keeps_subagents_for_claude_interactive() {
+        // The default (interactive) Claude path still reads the subagents dir.
+        let steps = build_ingest_commands(&StepParams {
+            iteration: 2,
+            subagents_dir: Some("/subagents"),
+            ..Default::default()
+        });
+        assert_eq!(steps[0].subagents_dir.as_deref(), Some("/subagents"));
+        assert_eq!(steps[1].subagents_dir.as_deref(), Some("/subagents"));
+    }
+
     #[test]
     fn finalize_runs_grade_finalize_then_aggregate() {
         let steps = build_finalize_commands(&StepParams {
@@ -225,6 +265,7 @@ mod tests {
             skill: None,
             iteration: 0,
             harness: Harness::ClaudeCode,
+            run_mode: RunMode::Interactive,
             subagents_dir: None,
             workspace_dir: None,
         }
diff --git a/src/cli/run/util.rs b/src/cli/run/util.rs
index 6d85ab3..0a5738b 100644
--- a/src/cli/run/util.rs
+++ b/src/cli/run/util.rs
@@ -42,91 +42,6 @@ pub(crate) fn next_iteration(workspace_skill_dir: &Path, override_n: Option<u32>
     max.map_or(1, |m| m + 1)
 }
 
-/// Build-time heads-up about staged-skill discovery on Claude Code, keyed on whether the project
-/// `.claude/skills/` dir existed when the orchestrator session started.
-///
-/// Claude Code's file watcher only watches skill directories that existed at session start. When
-/// `.claude/skills/` already existed, live change detection surfaces mid-session-staged skills
-/// in-session (and to subagents dispatched afterward) — no fallback, so this returns a short
-/// confirmation note. When `run` had to *create* `.claude/skills/`, that new top-level dir isn't
-/// watched until the session re-scans (a restart, or a plugin reload / other refresh event), so
-/// subagents won't discover the staged skills yet and with-skill arms fall back — this returns the
-/// actionable warning. `None` when staging is off or the harness isn't Claude Code (Codex/OpenCode
-/// dispatch as fresh processes that rediscover skills each time).
-pub(crate) fn staging_discovery_warning(
-    harness: Harness,
-    no_stage: bool,
-    skills_dir_preexisted: bool,
-) -> Option<String> {
-    if no_stage || harness != Harness::ClaudeCode {
-        return None;
-    }
-    if skills_dir_preexisted {
-        return Some(
-            [
-                "\nℹ Staged into the existing .claude/skills/ — Claude Code's live change detection",
-                "  surfaces these skills in-session, so subagents dispatched from this session",
-                "  discover them (a freshly-staged skill can lag the watcher by a moment; if you",
-                "  created .claude/skills/ after this session started, restart once so it's watched).",
-                "  Run detect-stray-writes (folded into `ingest`) to confirm no with-skill arm fell back.",
-            ]
-            .join("\n"),
-        );
-    }
-    Some(
-        [
-            "\n⚠ This run created .claude/skills/, which did not exist when your session started.",
-            "  Claude Code only watches skill directories that existed at session start, so subagents",
-            "  dispatched from this session won't discover the staged skills until the session",
-            "  re-scans — with-skill arms fall back until then. The staged skills are now on disk and",
-            "  persist, so do one of:",
-            "    1. restart this Claude Code session, then dispatch (the staged skills are discovered",
-            "       at session start); or",
-            "    2. dispatch the subagents from a fresh Claude Code session started after this run; or",
-            "    3. re-run with --no-stage to inline each condition's SKILL.md into the dispatch",
-            "       prompt (correct when the description: frontmatter is unchanged, since there's",
-            "       nothing to measure on the discovery axis).",
-            "  Either way, run detect-stray-writes (folded into `ingest`) before trusting a staged",
-            "  result — it flags live-source reads that reveal a discovery miss after the fact.",
-        ]
-        .join("\n"),
-    )
-}
-
-/// The combined "what to do now" upshot when *both* build-time hazards apply at once: the staged
-/// skill won't be discovered by subagents ([`staging_discovery_warning`]'s fresh-dir condition,
-/// i.e. `!skills_dir_preexisted`) AND an installed plugin shadows the control arm. Each warning is
-/// clear alone, but together the only valid recovery takes some reasoning — so spell it out.
-/// `None` unless both hold; when the skills dir pre-existed the staged skill *is* discoverable, so
-/// the discovery hazard does not apply and the plain plugin-shadow banner suffices.
-pub(crate) fn staging_plugin_shadow_action(
-    harness: Harness,
-    no_stage: bool,
-    has_shadows: bool,
-    skills_dir_preexisted: bool,
-) -> Option<String> {
-    // Mirror the staging-discovery gate: the discovery hazard only bites a staged Claude Code run
-    // that had to create .claude/skills/ fresh (otherwise live change detection finds the skill).
-    let staging_bites = !no_stage && harness == Harness::ClaudeCode && !skills_dir_preexisted;
-    if !staging_bites || !has_shadows {
-        return None;
-    }
-    Some(
-        [
-            "\n▶ Bottom line: both hazards above apply to this run — this run created",
-            "  .claude/skills/ fresh so subagents won't discover the staged skill until the session",
-            "  re-scans (with-skill arms fall back to no skill), AND an installed plugin shadows the",
-            "  staged copy (so the control arm isn't skill-absent). Two clean ways out:",
-            "    1. dispatch from a fresh, isolated Claude Code session with the shadowing plugin",
-            "       disabled — staging is discovered at session start and the control arm is clean; or",
-            "    2. re-run with --no-stage AND disable the shadowing plugin — inlines SKILL.md into",
-            "       the prompt and leaves nothing for the plugin to shadow.",
-            "  Until then, treat with-skill arms as fallen-back and the control arm as contaminated.",
-        ]
-        .join("\n"),
-    )
-}
-
 /// Run-summary heads-up that a `--no-stage` run is unguarded: the write guard
 /// requires staging, so `--no-stage` can't arm it, and stray writes are only
 /// *detected* after the fact by `detect-stray-writes`. `None` for staged runs.
@@ -141,6 +56,79 @@ pub(crate) fn unguarded_notice(no_stage: bool) -> Option<String> {
     )
 }
 
+/// The shared dispatch-instruction body, parameterized on the `tasks[]` filter so
+/// the condition-only and condition+group variants stay in lockstep.
+fn insession_dispatch_instruction(filter: &str) -> String {
+    format!(
+        "iterate the `tasks[]` entries in dispatch.json whose {filter} and \
+         dispatch each as a subagent, passing its `agent_description` verbatim as the subagent \
+         description (that string is the key that links each transcript back — without it tool \
+         calls, tokens, and duration come back empty)."
+    )
+}
+
+/// Dispatch instruction for one condition batch: iterate the matching `tasks[]`
+/// and dispatch each as a subagent with its `agent_description` verbatim. A building
+/// block of the interactive runbook's per-condition steps ([`super::runbook`]).
+pub(crate) fn insession_dispatch_batch(condition: &str) -> String {
+    insession_dispatch_instruction(&format!("`condition` is `{condition}`"))
+}
+
+/// Dispatch instruction for one `(condition, group)` segment — used when a run has
+/// more than one isolation group, so each group's batch dispatches separately with
+/// a [`insession_reset_batch_command`] barrier between groups ([`super::runbook`]).
+pub(crate) fn insession_dispatch_segment(condition: &str, group: &str) -> String {
+    insession_dispatch_instruction(&format!(
+        "`condition` is `{condition}` and `group` is `{group}`"
+    ))
+}
+
+/// The `reset-batch` barrier command between isolation-group batches: wipe the
+/// env working tree and re-seed it with `group`'s fixtures before dispatching it.
+/// A building block of the interactive runbook ([`super::runbook`]).
+pub(crate) fn insession_reset_batch_command(
+    target_args: &str,
+    iteration: u32,
+    group: &str,
+) -> String {
+    format!("eval-magic reset-batch{target_args} --iteration {iteration} --group {group}")
+}
+
+/// The `switch-condition` barrier command between batches: name the condition about
+/// to be dispatched (the one to keep). A building block of the interactive runbook
+/// ([`super::runbook`]).
+pub(crate) fn insession_switch_command(target_args: &str, iteration: u32, keep: &str) -> String {
+    format!("eval-magic switch-condition{target_args} --iteration {iteration} --condition {keep}")
+}
+
+/// The `ingest` hand-off command + its session-resolution hint. A building block of
+/// the interactive runbook ([`super::runbook`]).
+pub(crate) fn insession_ingest_command(target_args: &str, iteration: u32) -> String {
+    format!(
+        "eval-magic ingest{target_args} --iteration {iteration}\n\
+         (ingest auto-resolves the subagents dir from CLAUDE_CODE_SESSION_ID; outside that \
+         session, add --session-id <id> or --subagents-dir <path>.)"
+    )
+}
+
+/// The post-`run` handoff text for the isolated in-session flow: cd into `env_dir`,
+/// start a *fresh* Claude Code session there, and have it read `RUNBOOK.md` — which
+/// carries the full dispatch → switch-condition → ingest → finalize loop and writes
+/// `benchmark.json`. The env (incl. `env/.claude/skills/`) is built before that session
+/// starts, so the fresh session is structural, not a watcher workaround; the dispatch
+/// loop is driven from the runbook rather than from the orchestrator's run summary.
+pub(crate) fn insession_isolated_handoff(env_dir: &Path) -> String {
+    format!(
+        "start the isolated run in a fresh session:\n  \
+         1. cd {env}\n  \
+         2. start a fresh Claude Code session there (`claude`)\n  \
+         3. say: Read and follow RUNBOOK.md\n\
+         RUNBOOK.md walks the whole loop (dispatch → switch-condition → ingest → finalize) and \
+         writes benchmark.json; resume here to read it.",
+        env = env_dir.display()
+    )
+}
+
 /// Resolve the verbatim plan-mode procedure profile for a harness.
 /// The profile is a compile-time bundled asset, mirroring the schema embedding in
 /// `validation`.
@@ -223,103 +211,60 @@ pub(crate) fn harness_label(harness: Harness) -> &'static str {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::core::{DetectInput, RunMode, detect_run_context};
+    use std::fs;
 
-    #[test]
-    fn discovery_warning_when_skills_dir_created_fresh() {
-        // The skills dir did not exist at session start, so `run` creates it; Claude Code's file
-        // watcher won't pick up the new top-level dir until the session re-scans.
-        let warning = staging_discovery_warning(Harness::ClaudeCode, false, false).unwrap();
-        assert!(
-            warning.contains("session start"),
-            "names the real cause (watcher only sees dirs present at session start): {warning}"
-        );
-        assert!(warning.contains("restart"), "offers a restart: {warning}");
-        assert!(
-            warning.contains("--no-stage"),
-            "offers --no-stage: {warning}"
-        );
-        assert!(
-            warning.contains("detect-stray-writes"),
-            "names the after-the-fact backstop: {warning}"
-        );
-        assert!(
-            !warning.contains("every with-skill arm falls"),
-            "drops the false absolute claim: {warning}"
-        );
-    }
-
-    #[test]
-    fn discovery_note_when_skills_dir_preexisting() {
-        // The skills dir already existed at session start, so live change detection surfaces the
-        // staged skills in-session — no fallback, just a confirmation + the backstop reminder.
-        let note = staging_discovery_warning(Harness::ClaudeCode, false, true).unwrap();
-        assert!(
-            note.contains("live change detection"),
-            "explains why discovery works: {note}"
-        );
-        assert!(
-            note.contains("detect-stray-writes"),
-            "still points at the backstop: {note}"
-        );
-        assert!(
-            !note.contains("falls back"),
-            "no fallback claim when the skills are discoverable: {note}"
-        );
-    }
-
-    #[test]
-    fn silent_when_no_stage() {
-        assert!(staging_discovery_warning(Harness::ClaudeCode, true, false).is_none());
-        assert!(staging_discovery_warning(Harness::ClaudeCode, true, true).is_none());
-    }
-
-    #[test]
-    fn silent_for_codex() {
-        assert!(staging_discovery_warning(Harness::Codex, false, false).is_none());
-    }
-
-    #[test]
-    fn silent_for_opencode() {
-        assert!(staging_discovery_warning(Harness::OpenCode, false, false).is_none());
-    }
-
-    #[test]
-    fn combined_action_when_fresh_dir_and_shadow_both_apply() {
-        // The discovery hazard is real only when the dir was created fresh
-        // (skills_dir_preexisted = false); paired with a plugin shadow, the recovery takes
-        // reasoning — so spell it out.
-        let action = staging_plugin_shadow_action(Harness::ClaudeCode, false, true, false).unwrap();
-        assert!(
-            action.contains("fresh") || action.contains("restart"),
-            "offers a clean session: {action}"
-        );
-        assert!(action.contains("--no-stage"), "offers --no-stage: {action}");
-        assert!(
-            action.to_lowercase().contains("disable"),
-            "says to disable the plugin: {action}"
-        );
-    }
-
-    #[test]
-    fn no_combined_action_when_skills_dir_preexisting() {
-        // Dir existed at session start: the staged skill is discoverable, so the discovery hazard
-        // does not apply and the plain plugin-shadow banner suffices.
-        assert!(staging_plugin_shadow_action(Harness::ClaudeCode, false, true, true).is_none());
+    /// Build a `RunContext` for `harness`/`run_mode` against a throwaway skill dir.
+    fn ctx_for(harness: Harness, run_mode: RunMode) -> (tempfile::TempDir, RunContext) {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let skill = tmp.path().join("widget");
+        fs::create_dir_all(&skill).unwrap();
+        fs::write(
+            skill.join("SKILL.md"),
+            "---\nname: widget\ndescription: t\n---\n\nbody\n",
+        )
+        .unwrap();
+        let ctx = detect_run_context(DetectInput {
+            skill: Some(skill.display().to_string()),
+            harness: Some(harness),
+            run_mode: Some(run_mode),
+            cwd: Some(tmp.path().to_path_buf()),
+            ..Default::default()
+        })
+        .unwrap();
+        (tmp, ctx)
     }
 
     #[test]
-    fn no_combined_action_without_shadow() {
-        assert!(staging_plugin_shadow_action(Harness::ClaudeCode, false, false, false).is_none());
+    fn claude_hybrid_allows_guard() {
+        // `claude -p` loads the project `.claude/settings.local.json` PreToolUse
+        // hook from its cwd, so the write guard fires under Cli dispatch too.
+        let (_t, ctx) = ctx_for(Harness::ClaudeCode, RunMode::Hybrid);
+        let opts = RunOptions {
+            guard: true,
+            ..Default::default()
+        };
+        assert!(validate_harness_run_options(&opts, &ctx).is_ok());
     }
 
     #[test]
-    fn no_combined_action_under_no_stage() {
-        assert!(staging_plugin_shadow_action(Harness::ClaudeCode, true, true, false).is_none());
+    fn claude_headless_allows_guard() {
+        let (_t, ctx) = ctx_for(Harness::ClaudeCode, RunMode::Headless);
+        let opts = RunOptions {
+            guard: true,
+            ..Default::default()
+        };
+        assert!(validate_harness_run_options(&opts, &ctx).is_ok());
     }
 
     #[test]
-    fn no_combined_action_for_codex() {
-        assert!(staging_plugin_shadow_action(Harness::Codex, false, true, false).is_none());
+    fn claude_interactive_allows_guard() {
+        let (_t, ctx) = ctx_for(Harness::ClaudeCode, RunMode::Interactive);
+        let opts = RunOptions {
+            guard: true,
+            ..Default::default()
+        };
+        assert!(validate_harness_run_options(&opts, &ctx).is_ok());
     }
 
     #[test]
@@ -340,6 +285,30 @@ mod tests {
         assert!(unguarded_notice(false).is_none());
     }
 
+    #[test]
+    fn isolated_handoff_points_into_env_and_at_the_runbook() {
+        let env = Path::new("/work/.eval-magic/widget/iteration-3/env");
+        let handoff = insession_isolated_handoff(env);
+        assert!(
+            handoff.contains("/work/.eval-magic/widget/iteration-3/env"),
+            "names the env to cd into: {handoff}"
+        );
+        assert!(handoff.contains("cd "), "spells out the cd step: {handoff}");
+        assert!(
+            handoff.contains("Read and follow RUNBOOK.md"),
+            "hands off to the runbook in a fresh session: {handoff}"
+        );
+        assert!(
+            handoff.contains("fresh"),
+            "names the fresh isolated session: {handoff}"
+        );
+        // The handoff replaces the old printed dispatch loop — it must not re-print it.
+        assert!(
+            !handoff.contains("one batch at a time"),
+            "the dispatch loop lives in RUNBOOK.md now, not the summary: {handoff}"
+        );
+    }
+
     #[test]
     fn opencode_plan_mode_profile_resolves() {
         let profile = resolve_plan_mode_profile(Harness::OpenCode).unwrap();
diff --git a/src/core/context.rs b/src/core/context.rs
index 48a2aae..9939f8c 100644
--- a/src/core/context.rs
+++ b/src/core/context.rs
@@ -9,6 +9,8 @@ use std::path::{Path, PathBuf};
 
 use serde::{Deserialize, Serialize};
 
+use crate::core::run_mode::{RunMode, resolve_run_mode};
+
 /// The agent harness an eval runs against. Single source of truth, shared with
 /// the CLI layer (it derives `clap::ValueEnum` so flags can parse it directly).
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, clap::ValueEnum)]
@@ -37,6 +39,9 @@ pub struct RunContext {
     pub stage_root: PathBuf,
     pub bootstrap_path: Option<PathBuf>,
     pub harness: Harness,
+    /// The resolved run mode (the dispatch mechanism + who drives the loop).
+    /// Resolved per harness from the `--run-mode` flag in [`detect_run_context`].
+    pub run_mode: RunMode,
 }
 
 /// Already-parsed flag values handed to [`detect_run_context`]. `clap` owns the
@@ -49,6 +54,7 @@ pub struct DetectInput {
     pub bootstrap: Option<String>,
     pub workspace_dir: Option<String>,
     pub harness: Option<Harness>,
+    pub run_mode: Option<RunMode>,
     pub cwd: Option<PathBuf>,
 }
 
@@ -71,6 +77,8 @@ pub enum ContextError {
     SkillNotFound(String),
     #[error("--bootstrap file not found: {0}")]
     BootstrapNotFound(String),
+    #[error("{0}")]
+    UnsupportedRunMode(String),
     #[error("io error: {0}")]
     Io(#[from] std::io::Error),
 }
@@ -199,11 +207,13 @@ pub fn detect_run_context(input: DetectInput) -> Result<RunContext, ContextError
 
     let workspace_root = match input.workspace_dir {
         Some(raw) => absolutize(&cwd, &raw)?,
-        None => cwd.join("skills-workspace"),
+        None => cwd.join(".eval-magic"),
     };
     let stage_root = cwd;
 
     let harness = input.harness.unwrap_or_default();
+    let run_mode =
+        resolve_run_mode(harness, input.run_mode).map_err(ContextError::UnsupportedRunMode)?;
 
     Ok(RunContext {
         skill_dir,
@@ -215,6 +225,7 @@ pub fn detect_run_context(input: DetectInput) -> Result<RunContext, ContextError
         stage_root,
         bootstrap_path,
         harness,
+        run_mode,
     })
 }
 
@@ -433,7 +444,7 @@ mod tests {
         let tmp = TempDir::new().unwrap();
         let skill_dir = make_skill_dir(tmp.path(), &["foo"]);
         let ctx = detect_run_context(input(&skill_dir, "foo")).unwrap();
-        let expected = std::env::current_dir().unwrap().join("skills-workspace");
+        let expected = std::env::current_dir().unwrap().join(".eval-magic");
         assert_eq!(ctx.workspace_root, expected);
     }
 
diff --git a/src/core/mod.rs b/src/core/mod.rs
index c9df56c..a0d12b8 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -14,6 +14,8 @@ pub mod runtime;
 pub mod types;
 
 pub use context::{ContextError, DetectInput, Harness, RunContext, detect_run_context};
-pub use run_mode::{DispatchMechanism, HarnessRunCapabilities, capabilities_for, mechanism_for};
+pub use run_mode::{
+    DispatchMechanism, HarnessRunCapabilities, RunMode, capabilities_for, resolve_run_mode,
+};
 pub use runtime::{GitOutput, run_git};
 pub use types::*;
diff --git a/src/core/run_mode.rs b/src/core/run_mode.rs
index 90a24a8..a2457df 100644
--- a/src/core/run_mode.rs
+++ b/src/core/run_mode.rs
@@ -18,6 +18,8 @@
 //! (`new-skill` / `revision`), which selects the two conditions being compared,
 //! not the dispatch path.
 
+use serde::{Deserialize, Serialize};
+
 use crate::core::Harness;
 
 /// How a single dispatch is delivered to a harness. The primary code axis for
@@ -30,6 +32,98 @@ pub enum DispatchMechanism {
     Cli,
 }
 
+/// The user-facing run mode — *who/what drives the loop* plus which dispatch
+/// mechanism each task rides on. This is the parity vocabulary documented in the
+/// README (§Run modes); it maps down to a [`DispatchMechanism`] via
+/// [`RunMode::mechanism`]. `hybrid` and `headless` both ride on
+/// [`Cli`](DispatchMechanism::Cli) and differ only in whether a session drives
+/// the loop — a distinction we persist (in `conditions.json`) even though it
+/// doesn't change how a single task reaches the harness.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, clap::ValueEnum)]
+#[serde(rename_all = "kebab-case")]
+#[value(rename_all = "kebab-case")]
+pub enum RunMode {
+    /// In-session subagent dispatch (Claude Code's Task tool).
+    Interactive,
+    /// An agent session orchestrates while each dispatch shells out to the
+    /// harness CLI (`claude -p`, `codex exec`).
+    Hybrid,
+    /// No session drives the loop; eval-magic commands dispatch through the
+    /// harness CLI end to end.
+    Headless,
+}
+
+impl RunMode {
+    /// The mechanism this mode rides on: interactive → in-session; hybrid/headless → CLI.
+    pub fn mechanism(self) -> DispatchMechanism {
+        match self {
+            RunMode::Interactive => DispatchMechanism::InSession,
+            RunMode::Hybrid | RunMode::Headless => DispatchMechanism::Cli,
+        }
+    }
+
+    /// The default run mode for a harness when `--run-mode` is omitted, chosen to
+    /// preserve the pre-`--run-mode` mapping: Claude Code → interactive (in-session),
+    /// the CLI-dispatch harnesses (Codex, OpenCode) → hybrid.
+    pub fn default_for(harness: Harness) -> RunMode {
+        match harness {
+            Harness::ClaudeCode => RunMode::Interactive,
+            Harness::Codex | Harness::OpenCode => RunMode::Hybrid,
+        }
+    }
+
+    /// The kebab-case identifier (matches the `--run-mode` flag values and the
+    /// serialized form in `conditions.json`).
+    pub fn as_str(self) -> &'static str {
+        match self {
+            RunMode::Interactive => "interactive",
+            RunMode::Hybrid => "hybrid",
+            RunMode::Headless => "headless",
+        }
+    }
+}
+
+/// Resolve the effective run mode for a harness: fall back to
+/// [`RunMode::default_for`] when `requested` is `None`, then reject unsupported
+/// `(harness, mode)` combinations with an operator-facing `Err` message.
+pub fn resolve_run_mode(harness: Harness, requested: Option<RunMode>) -> Result<RunMode, String> {
+    let mode = requested.unwrap_or_else(|| RunMode::default_for(harness));
+    let supported: &[RunMode] = match harness {
+        // Claude Code wires every mode: in-session (interactive) plus both CLI
+        // modes (hybrid and headless ride the same `claude -p` mechanism).
+        Harness::ClaudeCode => &[RunMode::Interactive, RunMode::Hybrid, RunMode::Headless],
+        // Codex dispatches via subprocess, so in-session doesn't translate, but
+        // both CLI modes do (hybrid: a session drives the loop; headless: none does).
+        Harness::Codex => &[RunMode::Hybrid, RunMode::Headless],
+        // OpenCode's CLI path is only partially wired (no transcript ingest), so
+        // only hybrid is advertised for now.
+        Harness::OpenCode => &[RunMode::Hybrid],
+    };
+    if supported.contains(&mode) {
+        return Ok(mode);
+    }
+    let supported_list = supported
+        .iter()
+        .map(|m| m.as_str())
+        .collect::<Vec<_>>()
+        .join(", ");
+    Err(format!(
+        "--run-mode {} is not supported for --harness {}; supported: {}",
+        mode.as_str(),
+        harness_label(harness),
+        supported_list,
+    ))
+}
+
+/// The kebab-case CLI identifier for a harness (for operator-facing messages).
+fn harness_label(harness: Harness) -> &'static str {
+    match harness {
+        Harness::ClaudeCode => "claude-code",
+        Harness::Codex => "codex",
+        Harness::OpenCode => "opencode",
+    }
+}
+
 /// Run-option support for a harness's currently wired dispatch mechanism.
 ///
 /// This is intentionally narrower than full harness parity: it only describes
@@ -67,29 +161,10 @@ pub fn capabilities_for(harness: Harness) -> HarnessRunCapabilities {
     }
 }
 
-/// The dispatch mechanism a harness uses today. This is the single, documented
-/// place where the current 1:1 harness↔mechanism coupling lives — when a harness
-/// gains a second mechanism (e.g. a true headless Claude Code mode), the choice
-/// stops being derivable from the harness alone and this is the seam that grows
-/// to take an explicit selection.
-pub fn mechanism_for(harness: Harness) -> DispatchMechanism {
-    capabilities_for(harness).mechanism
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
 
-    #[test]
-    fn maps_each_harness_to_its_mechanism_today() {
-        assert_eq!(
-            mechanism_for(Harness::ClaudeCode),
-            DispatchMechanism::InSession
-        );
-        assert_eq!(mechanism_for(Harness::Codex), DispatchMechanism::Cli);
-        assert_eq!(mechanism_for(Harness::OpenCode), DispatchMechanism::Cli);
-    }
-
     #[test]
     fn capabilities_capture_run_option_support_by_harness() {
         let claude = capabilities_for(Harness::ClaudeCode);
@@ -110,4 +185,78 @@ mod tests {
         assert!(opencode.supports_bootstrap_with_no_stage);
         assert!(opencode.supports_stage_name_with_no_stage);
     }
+
+    #[test]
+    fn run_mode_mechanism_maps_each_mode() {
+        assert_eq!(
+            RunMode::Interactive.mechanism(),
+            DispatchMechanism::InSession
+        );
+        assert_eq!(RunMode::Hybrid.mechanism(), DispatchMechanism::Cli);
+        assert_eq!(RunMode::Headless.mechanism(), DispatchMechanism::Cli);
+    }
+
+    #[test]
+    fn run_mode_default_per_harness_preserves_today() {
+        assert_eq!(
+            RunMode::default_for(Harness::ClaudeCode),
+            RunMode::Interactive
+        );
+        assert_eq!(RunMode::default_for(Harness::Codex), RunMode::Hybrid);
+        assert_eq!(RunMode::default_for(Harness::OpenCode), RunMode::Hybrid);
+    }
+
+    #[test]
+    fn resolve_run_mode_defaults_when_unspecified() {
+        assert_eq!(
+            resolve_run_mode(Harness::ClaudeCode, None).unwrap(),
+            RunMode::Interactive
+        );
+        assert_eq!(
+            resolve_run_mode(Harness::Codex, None).unwrap(),
+            RunMode::Hybrid
+        );
+    }
+
+    #[test]
+    fn resolve_run_mode_accepts_claude_hybrid() {
+        assert_eq!(
+            resolve_run_mode(Harness::ClaudeCode, Some(RunMode::Hybrid)).unwrap(),
+            RunMode::Hybrid
+        );
+    }
+
+    #[test]
+    fn resolve_run_mode_rejects_interactive_for_cli_harnesses() {
+        let err = resolve_run_mode(Harness::Codex, Some(RunMode::Interactive)).unwrap_err();
+        assert!(err.contains("interactive"), "message was: {err}");
+        assert!(err.contains("codex"), "message was: {err}");
+        assert!(resolve_run_mode(Harness::OpenCode, Some(RunMode::Interactive)).is_err());
+    }
+
+    #[test]
+    fn resolve_run_mode_accepts_claude_headless() {
+        assert_eq!(
+            resolve_run_mode(Harness::ClaudeCode, Some(RunMode::Headless)).unwrap(),
+            RunMode::Headless
+        );
+    }
+
+    #[test]
+    fn resolve_run_mode_accepts_codex_headless() {
+        assert_eq!(
+            resolve_run_mode(Harness::Codex, Some(RunMode::Headless)).unwrap(),
+            RunMode::Headless
+        );
+    }
+
+    #[test]
+    fn run_mode_serde_roundtrips_kebab_case() {
+        assert_eq!(
+            serde_json::to_string(&RunMode::Hybrid).unwrap(),
+            "\"hybrid\""
+        );
+        let parsed: RunMode = serde_json::from_str("\"headless\"").unwrap();
+        assert_eq!(parsed, RunMode::Headless);
+    }
 }
diff --git a/src/core/types.rs b/src/core/types.rs
index 5519d2d..dddd9d7 100644
--- a/src/core/types.rs
+++ b/src/core/types.rs
@@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize};
 use serde_json::Value;
 
 use crate::core::context::Harness;
+use crate::core::run_mode::RunMode;
 
 /// Meta-assertion id reserved for the skill-invocation check.
 pub const SKILL_INVOKED_META_ID: &str = "__skill_invoked";
@@ -68,6 +69,23 @@ pub struct Eval {
     /// to the flag's value (1 unless raised).
     #[serde(skip_serializing_if = "Option::is_none")]
     pub runs: Option<u32>,
+    /// Explicit isolation hint for run batching. `shared` (default, omitted) lets
+    /// the eval batch with others; `isolated` forces it into its own singleton
+    /// group, for confounds the framework can't auto-detect (e.g. the agent
+    /// mutates a shared fixture another eval reads). Conflicting fixtures
+    /// auto-isolate into separate groups regardless of this hint.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub isolation: Option<Isolation>,
+}
+
+/// Per-eval isolation hint controlling how an eval is grouped into run batches
+/// (wire values `shared` / `isolated`). `Shared`, also what an omitted hint means, lets
+/// an eval share an env with non-conflicting evals; `Isolated` forces a singleton group.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Isolation {
+    Shared,
+    Isolated,
 }
 
 /// The parsed `evals.json` for one skill.
@@ -106,6 +124,10 @@ pub struct ConditionsRecord {
     pub timestamp: String,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub harness: Option<Harness>,
+    /// The run mode this iteration was built with (provenance + recoverability).
+    /// `None` on older artifacts written before run-mode selection existed.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub run_mode: Option<RunMode>,
     /// Per-run nonce; namespaces dispatch descriptions so transcripts can't
     /// collide across iterations sharing one parent session's subagents dir.
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -289,12 +311,35 @@ mod tests {
             assertions: None,
             skill_should_trigger: None,
             runs: None,
+            isolation: None,
         };
         let out = serde_json::to_value(&eval).unwrap();
         assert!(out.get("files").is_none());
         assert!(out.get("assertions").is_none());
         assert!(out.get("skill_should_trigger").is_none());
         assert!(out.get("runs").is_none());
+        assert!(out.get("isolation").is_none());
+    }
+
+    #[test]
+    fn isolation_round_trips_snake_case() {
+        let original = Eval {
+            id: "e1".into(),
+            prompt: "p".into(),
+            expected_output: "o".into(),
+            files: None,
+            assertions: None,
+            skill_should_trigger: None,
+            runs: None,
+            isolation: Some(Isolation::Isolated),
+        };
+        // Serializing must emit the variant under its snake_case wire name...
+        let encoded = serde_json::to_value(&original).unwrap();
+        let wire = encoded.get("isolation").and_then(Value::as_str);
+        assert_eq!(wire, Some("isolated"));
+        // ...and parsing that JSON must restore the same variant.
+        let decoded: Eval = serde_json::from_value(encoded).unwrap();
+        assert_eq!(decoded.isolation, Some(Isolation::Isolated));
     }
 
     #[test]
@@ -361,6 +406,7 @@ mod tests {
             conditions: vec![],
             timestamp: "2026-06-08T00:00:00Z".into(),
             harness: Some(Harness::ClaudeCode),
+            run_mode: Some(RunMode::Hybrid),
             run_nonce: None,
             runs: None,
             agent_model: None,
@@ -373,6 +419,7 @@ mod tests {
             out.get("harness"),
             Some(&Value::String("claude-code".into()))
         );
+        assert_eq!(out.get("run_mode"), Some(&Value::String("hybrid".into())));
         // Absent optionals omitted.
         assert!(out.get("baseline").is_none());
         assert!(out.get("run_nonce").is_none());
diff --git a/src/pipeline/detect_stray_writes.rs b/src/pipeline/detect_stray_writes.rs
index 3a0f0e0..a1a5c57 100644
--- a/src/pipeline/detect_stray_writes.rs
+++ b/src/pipeline/detect_stray_writes.rs
@@ -333,13 +333,31 @@ pub fn detect_stray_writes_report(
                     &source,
                 )?;
 
-                let outputs_dir = outputs_by_key
-                    .get(&run_key(eval_id, cond, slot.run_index))
-                    .cloned()
-                    .unwrap_or_else(|| slot.dir.join("outputs").to_string_lossy().into_owned());
+                let outputs_dir = outputs_by_key.get(&run_key(eval_id, cond, slot.run_index));
 
                 invocations_inspected += run.tool_invocations.len();
-                let findings = detect_stray_writes(&run.tool_invocations, &outputs_dir, repo_root);
+                // `dispatch.json` is the authoritative source of the outputs
+                // boundary: an absolute path into the isolated env
+                // (`env/.eval-magic-outputs/...`). Without it we cannot honor the
+                // outputs-only contract, so we skip out-of-bounds *write*
+                // classification for that run rather than guess a boundary — the old
+                // `<slot>/outputs` convention no longer matches where agents write and
+                // would mis-flag every legitimate write. Live-source-read detection is
+                // independent of the boundary and still runs.
+                let findings = match outputs_dir {
+                    Some(dir) => detect_stray_writes(&run.tool_invocations, dir, repo_root),
+                    None => {
+                        let run_label = slot
+                            .run_index
+                            .map(|k| format!(" run-{k}"))
+                            .unwrap_or_default();
+                        eprintln!(
+                            "⚠ {eval_id}/{cond}{run_label}: no outputs_dir in dispatch.json — \
+                             skipping out-of-bounds write classification (boundary unknown)"
+                        );
+                        RunFindings::default()
+                    }
+                };
                 let live_reads =
                     detect_live_source_reads(&run.tool_invocations, live_skill_dir, repo_root);
 
@@ -729,7 +747,7 @@ mod tests {
         let f = detect_live_source_reads(
             &[
                 inv("Read", json!({"file_path": format!("{OUTPUTS}/x.md")}), 0),
-                inv("Bash", json!({"command": "ls skills-workspace"}), 1),
+                inv("Bash", json!({"command": "ls .eval-magic"}), 1),
                 // Write tools are detect_stray_writes' jurisdiction — reads only here.
                 inv(
                     "Write",
diff --git a/src/pipeline/fill_transcripts.rs b/src/pipeline/fill_transcripts.rs
index ada5710..06bd0b6 100644
--- a/src/pipeline/fill_transcripts.rs
+++ b/src/pipeline/fill_transcripts.rs
@@ -13,9 +13,7 @@ use std::path::Path;
 use serde::Deserialize;
 
 use crate::adapters::{adapter_for, find_by_description};
-use crate::core::{
-    ConditionsRecord, DispatchMechanism, Harness, RunRecord, ToolInvocation, mechanism_for,
-};
+use crate::core::{ConditionsRecord, DispatchMechanism, Harness, RunRecord, ToolInvocation};
 use crate::pipeline::error::PipelineError;
 use crate::pipeline::io::write_json;
 use crate::pipeline::slots::{run_key, run_slots};
@@ -79,6 +77,7 @@ pub fn resolve_agent_description(
 pub fn fill_transcripts(
     iteration_dir: &Path,
     harness: Harness,
+    mechanism: DispatchMechanism,
     subagents_dir: Option<&Path>,
     overwrite: bool,
 ) -> Result<FillTranscriptsResult, PipelineError> {
@@ -132,13 +131,16 @@ pub fn fill_transcripts(
                     .cloned()
                     .unwrap_or_else(|| slot.dir.join("outputs").to_string_lossy().into_owned());
 
+                // Resolve the in-session description lazily — only the InSession
+                // branch needs it, so a Cli run skips the dispatch.json re-read.
+                let description = (mechanism == DispatchMechanism::InSession).then(|| {
+                    resolve_agent_description(iteration_dir, eval_id, cond, slot.run_index)
+                });
                 let Some(invocations) = invocations_for_run(
                     harness,
+                    mechanism,
                     subagents_dir,
-                    iteration_dir,
-                    eval_id,
-                    cond,
-                    slot.run_index,
+                    description.as_deref(),
                     Path::new(&outputs_dir),
                 ) else {
                     result.missing += 1;
@@ -174,30 +176,27 @@ fn outputs_dirs_by_key(iteration_dir: &Path) -> HashMap<String, String> {
 
 /// Parse the invocations for one run, keyed on the dispatch mechanism: a
 /// `Cli`-mechanism harness reads the events file its CLI wrote under
-/// `outputs_dir` (e.g. Codex's `codex-events.jsonl`); an `InSession` harness
-/// reads the subagent transcript matched by the resolved description.
+/// `outputs_dir` (e.g. Codex's `codex-events.jsonl`, Claude Code hybrid's
+/// `claude-events.jsonl`); an `InSession` harness reads the subagent transcript
+/// matched by `description` (resolved by the caller).
 fn invocations_for_run(
     harness: Harness,
+    mechanism: DispatchMechanism,
     subagents_dir: Option<&Path>,
-    iteration_dir: &Path,
-    eval_id: &str,
-    condition: &str,
-    run_index: Option<u32>,
+    description: Option<&str>,
     outputs_dir: &Path,
 ) -> Option<Vec<ToolInvocation>> {
-    match mechanism_for(harness) {
+    match mechanism {
         DispatchMechanism::Cli => {
             let events_path = outputs_dir.join(adapter_for(harness).cli_events_filename()?);
             if !events_path.exists() {
                 return None;
             }
-            adapter_for(harness).parse_transcript(&events_path).ok()
+            adapter_for(harness).parse_cli_events(&events_path).ok()
         }
         DispatchMechanism::InSession => {
-            let description =
-                resolve_agent_description(iteration_dir, eval_id, condition, run_index);
             let subagent =
-                find_by_description(subagents_dir.unwrap_or_else(|| Path::new("")), &description)?;
+                find_by_description(subagents_dir.unwrap_or_else(|| Path::new("")), description?)?;
             adapter_for(harness)
                 .parse_transcript(&subagent.jsonl_path)
                 .ok()
@@ -307,6 +306,61 @@ mod tests {
 
     // --- fillTranscripts ---
 
+    #[test]
+    fn fills_a_claude_hybrid_run_record_from_outputs_events() {
+        let root = TempDir::new().unwrap();
+        let iteration_dir: PathBuf = root.path().join("iter-claude-fill");
+        let cond_dir = iteration_dir.join("eval-crash").join("with_skill");
+        let outputs_dir = cond_dir.join("outputs");
+        fs::create_dir_all(&outputs_dir).unwrap();
+        let run_path = cond_dir.join("run.json");
+        write_run_record(&run_path, json!([]));
+        fs::write(
+            iteration_dir.join("conditions.json"),
+            json!({
+                "mode": "new-skill",
+                "conditions": [{"name": "with_skill", "skill_path": "/skill/SKILL.md"}],
+                "timestamp": "2026-06-07T00:00:00.000Z",
+                "harness": "claude-code",
+                "run_mode": "hybrid"
+            })
+            .to_string(),
+        )
+        .unwrap();
+        write_dispatch(
+            &iteration_dir,
+            json!([{"eval_id": "crash", "condition": "with_skill", "outputs_dir": outputs_dir.to_string_lossy()}]),
+        );
+        // `claude -p` stream-json: one assistant tool_use, its user tool_result, a final result.
+        fs::write(
+            outputs_dir.join("claude-events.jsonl"),
+            jsonl(&[
+                json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [{"type": "tool_use", "id": "toolu_1", "name": "Bash", "input": {"command": "bun test"}}]}}),
+                json!({"type": "user", "message": {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "toolu_1", "content": "ok"}]}}),
+                json!({"type": "result", "subtype": "success", "is_error": false, "result": "Done", "duration_ms": 10, "usage": {"input_tokens": 1, "output_tokens": 1, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}),
+            ]),
+        )
+        .unwrap();
+
+        let result = fill_transcripts(
+            &iteration_dir,
+            Harness::ClaudeCode,
+            DispatchMechanism::Cli,
+            None,
+            false,
+        )
+        .unwrap();
+        assert_eq!(result.filled, 1);
+        assert_eq!(result.missing, 0);
+
+        let updated: RunRecord =
+            serde_json::from_str(&fs::read_to_string(&run_path).unwrap()).unwrap();
+        assert_eq!(
+            serde_json::to_value(&updated.tool_invocations).unwrap(),
+            json!([{"name": "Bash", "ordinal": 0, "args": {"command": "bun test"}, "result": "ok"}])
+        );
+    }
+
     #[test]
     fn fills_a_codex_run_record_from_outputs_events() {
         let root = TempDir::new().unwrap();
@@ -339,7 +393,14 @@ mod tests {
         )
         .unwrap();
 
-        let result = fill_transcripts(&iteration_dir, Harness::Codex, None, false).unwrap();
+        let result = fill_transcripts(
+            &iteration_dir,
+            Harness::Codex,
+            DispatchMechanism::Cli,
+            None,
+            false,
+        )
+        .unwrap();
         assert_eq!(result.filled, 1);
         assert_eq!(result.missing, 0);
 
@@ -382,7 +443,14 @@ mod tests {
             .unwrap();
         }
 
-        let result = fill_transcripts(&iteration_dir, Harness::Codex, None, false).unwrap();
+        let result = fill_transcripts(
+            &iteration_dir,
+            Harness::Codex,
+            DispatchMechanism::Cli,
+            None,
+            false,
+        )
+        .unwrap();
         assert_eq!(result.filled, 2);
         assert_eq!(result.missing, 0);
 
diff --git a/src/pipeline/record_runs.rs b/src/pipeline/record_runs.rs
index 00868e9..3d84277 100644
--- a/src/pipeline/record_runs.rs
+++ b/src/pipeline/record_runs.rs
@@ -19,9 +19,7 @@ use std::path::Path;
 use serde::Deserialize;
 
 use crate::adapters::{TranscriptSummary, adapter_for, find_by_description};
-use crate::core::{
-    DispatchMechanism, Harness, RunRecord, TimingRecord, TimingSource, mechanism_for,
-};
+use crate::core::{DispatchMechanism, Harness, RunRecord, TimingRecord, TimingSource};
 use crate::pipeline::error::PipelineError;
 use crate::pipeline::io::write_json;
 use crate::validation::{SchemaName, validate_against_schema};
@@ -66,7 +64,11 @@ impl RecordRunsResult {
     /// unverifiable. `None` when every run matched its transcript. The hint is
     /// tailored to how the harness correlates transcripts (description match vs.
     /// the Codex events file).
-    pub fn transcript_warning(&self, harness: Harness) -> Option<String> {
+    pub fn transcript_warning(
+        &self,
+        harness: Harness,
+        mechanism: DispatchMechanism,
+    ) -> Option<String> {
         if self.missing_transcript == 0 {
             return None;
         }
@@ -78,9 +80,17 @@ impl RecordRunsResult {
         } else {
             format!("⚠ {n} run{plural} missing a transcript")
         };
-        let cause = match harness {
-            Harness::Codex => "expected `outputs/codex-events.jsonl` was not found".to_string(),
-            Harness::ClaudeCode | Harness::OpenCode => {
+        // The cause is keyed on the dispatch mechanism, not the harness: a
+        // Cli-dispatch run (Codex, or Claude Code in hybrid/headless) misses the
+        // per-task events file; an in-session run misses the subagent transcript.
+        let cause = match mechanism {
+            DispatchMechanism::Cli => {
+                let file = adapter_for(harness)
+                    .cli_events_filename()
+                    .unwrap_or("the events file");
+                format!("expected `outputs/{file}` was not found")
+            }
+            DispatchMechanism::InSession => {
                 "did you pass each task's `agent_description` verbatim as the subagent \
                  description? If so, confirm `--subagents-dir` points at the parent session's \
                  subagents dir"
@@ -100,6 +110,7 @@ impl RecordRunsResult {
 pub fn record_runs(
     iteration_dir: &Path,
     harness: Harness,
+    mechanism: DispatchMechanism,
     subagents_dir: Option<&Path>,
     overwrite: bool,
 ) -> Result<RecordRunsResult, PipelineError> {
@@ -117,7 +128,7 @@ pub fn record_runs(
 
     let mut result = RecordRunsResult::default();
     for task in &tasks {
-        let summary = transcript_summary_for_task(harness, subagents_dir, task);
+        let summary = transcript_summary_for_task(harness, mechanism, subagents_dir, task);
         if summary.is_none() {
             result.missing_transcript += 1;
         }
@@ -188,10 +199,11 @@ pub fn record_runs(
 /// `None` when no transcript is found.
 fn transcript_summary_for_task(
     harness: Harness,
+    mechanism: DispatchMechanism,
     subagents_dir: Option<&Path>,
     task: &DispatchTask,
 ) -> Option<TranscriptSummary> {
-    match mechanism_for(harness) {
+    match mechanism {
         DispatchMechanism::Cli => {
             let events_path =
                 Path::new(&task.outputs_dir).join(adapter_for(harness).cli_events_filename()?);
@@ -199,7 +211,7 @@ fn transcript_summary_for_task(
                 return None;
             }
             adapter_for(harness)
-                .parse_transcript_full(&events_path)
+                .parse_cli_events_full(&events_path)
                 .ok()
         }
         DispatchMechanism::InSession => {
@@ -273,6 +285,20 @@ mod tests {
         fs::write(outputs_dir.join("codex-events.jsonl"), jsonl(&lines)).unwrap();
     }
 
+    /// Writes a `claude -p --output-format stream-json` fixture to
+    /// `outputs_dir/claude-events.jsonl`: a `system/init` line, one `Bash` tool call
+    /// with its result, and a terminal `result` event carrying `final_text`, duration
+    /// and usage (no line has a timestamp). Usage tokens sum to 125 (100 + 20 + 0 + 5).
+    fn write_claude_events(outputs_dir: &Path, final_text: &str) {
+        let lines = vec![
+            json!({"type": "system", "subtype": "init", "cwd": "/env"}),
+            json!({"type": "assistant", "message": {"id": "msg_1", "role": "assistant", "content": [{"type": "tool_use", "id": "toolu_1", "name": "Bash", "input": {"command": "bun test"}}]}}),
+            json!({"type": "user", "message": {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "toolu_1", "content": "ok"}]}}),
+            json!({"type": "result", "subtype": "success", "is_error": false, "result": final_text, "duration_ms": 30_000, "usage": {"input_tokens": 100, "output_tokens": 20, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 5}}),
+        ];
+        fs::write(outputs_dir.join("claude-events.jsonl"), jsonl(&lines)).unwrap();
+    }
+
     struct FixtureTask {
         eval_id: &'static str,
         condition: &'static str,
@@ -410,7 +436,14 @@ mod tests {
             &transcript_lines("unused"),
         );
 
-        let result = record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap();
+        let result = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap();
         assert_eq!(result.recorded, 2);
         assert_eq!(result.missing_transcript, 0);
 
@@ -473,7 +506,8 @@ mod tests {
         )
         .unwrap();
 
-        let result = record_runs(&iter, Harness::Codex, None, false).unwrap();
+        let result =
+            record_runs(&iter, Harness::Codex, DispatchMechanism::Cli, None, false).unwrap();
         assert_eq!(result.recorded, 2);
 
         for k in [1u32, 2] {
@@ -499,7 +533,8 @@ mod tests {
         );
         write_codex_events(&paths[0].outputs_dir, "Codex final.");
 
-        let result = record_runs(&iter, Harness::Codex, None, false).unwrap();
+        let result =
+            record_runs(&iter, Harness::Codex, DispatchMechanism::Cli, None, false).unwrap();
         assert_eq!(result.recorded, 1);
         assert_eq!(result.missing_transcript, 0);
 
@@ -531,7 +566,8 @@ mod tests {
         );
         write_codex_events(&paths[0].outputs_dir, "Closing summary from Codex.");
 
-        let result = record_runs(&iter, Harness::Codex, None, false).unwrap();
+        let result =
+            record_runs(&iter, Harness::Codex, DispatchMechanism::Cli, None, false).unwrap();
         assert_eq!(result.recorded, 1);
         assert_eq!(
             read_run(&iter, "crash", "with_skill").final_message,
@@ -564,7 +600,14 @@ mod tests {
         });
         fs::write(&paths[0].run_record_path, hand_written.to_string()).unwrap();
 
-        let skipped = record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap();
+        let skipped = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap();
         assert_eq!(skipped.recorded, 0);
         assert_eq!(skipped.skipped_existing, 1);
         assert_eq!(
@@ -572,7 +615,14 @@ mod tests {
             "Agent-authored."
         );
 
-        let replaced = record_runs(&iter, Harness::ClaudeCode, Some(&sub), true).unwrap();
+        let replaced = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            true,
+        )
+        .unwrap();
         assert_eq!(replaced.recorded, 1);
         assert_eq!(read_run(&iter, "crash", "with_skill").final_message, "New.");
     }
@@ -601,7 +651,14 @@ mod tests {
         )
         .unwrap();
 
-        record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap();
+        record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap();
 
         // Agent-captured completion-event timing wins; not overwritten.
         let timing = read_timing_value(&iter, "crash", "with_skill");
@@ -629,7 +686,14 @@ mod tests {
             &transcript_lines("Closing summary from transcript."),
         );
 
-        let result = record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap();
+        let result = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap();
         assert_eq!(result.recorded, 1);
         assert_eq!(
             read_run(&iter, "crash", "with_skill").final_message,
@@ -651,7 +715,14 @@ mod tests {
         );
         // No final-message.md, no transcript.
 
-        let result = record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap();
+        let result = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap();
         assert_eq!(result.recorded, 0);
         assert_eq!(result.skipped_no_final_message, 1);
         assert!(!run_exists(&iter, "crash", "with_skill"));
@@ -672,7 +743,14 @@ mod tests {
         );
         // final-message.md exists but no subagent transcript matches.
 
-        let result = record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap();
+        let result = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap();
         assert_eq!(result.recorded, 1);
         assert_eq!(result.missing_transcript, 1);
 
@@ -687,7 +765,14 @@ mod tests {
         let root = TempDir::new().unwrap();
         let (iter, sub) = dirs(&root);
         // Hand-authored/operator runs have no dispatch.json — the manual path owns them.
-        let err = record_runs(&iter, Harness::ClaudeCode, Some(&sub), false).unwrap_err();
+        let err = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::InSession,
+            Some(&sub),
+            false,
+        )
+        .unwrap_err();
         assert!(
             err.to_string().contains("dispatch.json"),
             "error was: {err}"
@@ -701,7 +786,11 @@ mod tests {
             missing_transcript: 0,
             ..Default::default()
         };
-        assert!(result.transcript_warning(Harness::ClaudeCode).is_none());
+        assert!(
+            result
+                .transcript_warning(Harness::ClaudeCode, DispatchMechanism::InSession)
+                .is_none()
+        );
     }
 
     #[test]
@@ -711,7 +800,9 @@ mod tests {
             missing_transcript: 8,
             ..Default::default()
         };
-        let warning = result.transcript_warning(Harness::ClaudeCode).unwrap();
+        let warning = result
+            .transcript_warning(Harness::ClaudeCode, DispatchMechanism::InSession)
+            .unwrap();
         assert!(warning.contains("8"), "names the count: {warning}");
         assert!(
             warning.contains("agent_description"),
@@ -734,7 +825,9 @@ mod tests {
             missing_transcript: 1,
             ..Default::default()
         };
-        let warning = result.transcript_warning(Harness::ClaudeCode).unwrap();
+        let warning = result
+            .transcript_warning(Harness::ClaudeCode, DispatchMechanism::InSession)
+            .unwrap();
         assert!(warning.contains('1'), "names the count: {warning}");
     }
 
@@ -745,7 +838,9 @@ mod tests {
             missing_transcript: 2,
             ..Default::default()
         };
-        let warning = result.transcript_warning(Harness::Codex).unwrap();
+        let warning = result
+            .transcript_warning(Harness::Codex, DispatchMechanism::Cli)
+            .unwrap();
         assert!(
             warning.contains("codex-events.jsonl"),
             "names the Codex source: {warning}"
@@ -755,4 +850,94 @@ mod tests {
             "Codex doesn't use agent_description: {warning}"
         );
     }
+
+    #[test]
+    fn assembles_claude_hybrid_records_from_each_tasks_events() {
+        let root = TempDir::new().unwrap();
+        let (iter, _sub) = dirs(&root);
+        let paths = write_iteration(
+            &iter,
+            &[FixtureTask {
+                eval_id: "crash",
+                condition: "with_skill",
+                final_message: Some("Fixed it."),
+            }],
+        );
+        write_claude_events(&paths[0].outputs_dir, "Closing summary.");
+
+        let result = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::Cli,
+            None,
+            false,
+        )
+        .unwrap();
+        assert_eq!(result.recorded, 1);
+        assert_eq!(result.missing_transcript, 0);
+
+        let run = read_run(&iter, "crash", "with_skill");
+        // final-message.md ("Fixed it.") wins over the events' result text ("Closing summary.").
+        assert_eq!(run.final_message, "Fixed it.");
+        assert_eq!(
+            serde_json::to_value(&run.tool_invocations).unwrap(),
+            json!([{"name": "Bash", "ordinal": 0, "args": {"command": "bun test"}, "result": "ok"}])
+        );
+        let timing = read_timing_value(&iter, "crash", "with_skill");
+        assert_eq!(
+            timing,
+            json!({"total_tokens": 125, "duration_ms": 30_000, "source": "transcript"})
+        );
+    }
+
+    #[test]
+    fn falls_back_to_claude_result_final_text_when_final_message_md_missing() {
+        // Claude `-p` has no --output-last-message, so with no final-message.md on
+        // disk the `result` event's text must become the run's final message.
+        let root = TempDir::new().unwrap();
+        let (iter, _sub) = dirs(&root);
+        let paths = write_iteration(
+            &iter,
+            &[FixtureTask {
+                eval_id: "crash",
+                condition: "with_skill",
+                final_message: None,
+            }],
+        );
+        write_claude_events(&paths[0].outputs_dir, "Closing summary from claude -p.");
+
+        let result = record_runs(
+            &iter,
+            Harness::ClaudeCode,
+            DispatchMechanism::Cli,
+            None,
+            false,
+        )
+        .unwrap();
+        assert_eq!(result.recorded, 1);
+        assert_eq!(
+            read_run(&iter, "crash", "with_skill").final_message,
+            "Closing summary from claude -p."
+        );
+    }
+
+    #[test]
+    fn claude_hybrid_warning_points_at_events_file() {
+        // Two recorded runs, both missing a transcript, on Claude Code's Cli
+        // mechanism: the hint must name the events file, never the in-session
+        // `agent_description` matching.
+        let outcome = RecordRunsResult {
+            recorded: 2,
+            missing_transcript: 2,
+            ..Default::default()
+        };
+        let (harness, mechanism) = (Harness::ClaudeCode, DispatchMechanism::Cli);
+        let text = outcome.transcript_warning(harness, mechanism).unwrap();
+
+        let names_events_file = text.contains("claude-events.jsonl");
+        assert!(names_events_file, "names the Claude hybrid source: {text}");
+
+        let names_description = text.contains("agent_description");
+        assert!(!names_description, "hybrid doesn't use agent_description: {text}");
+    }
 }
diff --git a/src/sandbox/decide.rs b/src/sandbox/decide.rs
index 7beeb58..ba417a2 100644
--- a/src/sandbox/decide.rs
+++ b/src/sandbox/decide.rs
@@ -139,7 +139,7 @@ mod tests {
     use crate::sandbox::now_ms;
     use serde_json::json;
 
-    const ROOTS: [&str; 2] = ["/work/skills-workspace", "/work/.claude/skills"];
+    const ROOTS: [&str; 2] = ["/work/.eval-magic", "/work/.claude/skills"];
 
     /// An RFC3339 timestamp `offset_ms` from now — `future`/`past` bracket the
     /// current wall clock used by `decide`.
@@ -209,7 +209,7 @@ mod tests {
     fn allows_a_write_under_an_allowed_root() {
         let d = decide_now(
             "Write",
-            json!({ "file_path": "/work/skills-workspace/x/outputs/a.md" }),
+            json!({ "file_path": "/work/.eval-magic/x/outputs/a.md" }),
             Some(&marker()),
         );
         assert!(d.allow);
@@ -241,7 +241,7 @@ mod tests {
     fn allows_a_bash_command_scoped_to_an_allowed_root() {
         let d = decide_now(
             "Bash",
-            json!({ "command": "echo hi > /work/skills-workspace/x/outputs/log" }),
+            json!({ "command": "echo hi > /work/.eval-magic/x/outputs/log" }),
             Some(&marker()),
         );
         assert!(d.allow);
@@ -286,7 +286,7 @@ mod tests {
     fn allows_apply_patch_inside_allowed_roots() {
         let d = decide_now(
             "apply_patch",
-            json!({ "files": ["/work/skills-workspace/eval/outputs/out.md"] }),
+            json!({ "files": ["/work/.eval-magic/eval/outputs/out.md"] }),
             Some(&marker()),
         );
         assert!(d.allow);
@@ -363,10 +363,13 @@ mod tests {
     }
 
     #[test]
-    fn does_not_flag_skills_workspace_as_a_bare_skills_write() {
+    fn does_not_flag_a_skills_prefixed_dir_as_a_bare_skills_write() {
+        // A `skills`-prefixed path that is NOT an allowed root: the bare-`skills/`
+        // heuristic only fires on a bare `skills` at a path boundary, so a
+        // `skills-`-prefixed dir must not be flagged and the write is allowed.
         let d = decide_now(
             "Bash",
-            json!({ "command": "mkdir -p /work/skills-workspace/x/outputs" }),
+            json!({ "command": "mkdir -p /work/skills-data/x/outputs" }),
             Some(&marker()),
         );
         assert!(d.allow);
diff --git a/src/sandbox/guard.rs b/src/sandbox/guard.rs
index a4ff94b..51af2db 100644
--- a/src/sandbox/guard.rs
+++ b/src/sandbox/guard.rs
@@ -118,7 +118,7 @@ mod tests {
     fn marker() -> GuardMarker {
         GuardMarker {
             active: Some(true),
-            allowed_roots: Some(vec!["/work/skills-workspace".to_string()]),
+            allowed_roots: Some(vec!["/work/.eval-magic".to_string()]),
             expires_at: None,
         }
     }
@@ -170,7 +170,7 @@ mod tests {
 
     #[test]
     fn codex_apply_patch_inside_allowed_roots_allows() {
-        let payload = r#"{ "hook_event_name": "PreToolUse", "tool_name": "apply_patch", "tool_input": { "files": ["/work/skills-workspace/out.md"] } }"#;
+        let payload = r#"{ "hook_event_name": "PreToolUse", "tool_name": "apply_patch", "tool_input": { "files": ["/work/.eval-magic/out.md"] } }"#;
         assert_eq!(codex_guard_decision(payload, Some(marker())), None);
     }
 
diff --git a/src/sandbox/install.rs b/src/sandbox/install.rs
index 7dca347..6f9d774 100644
--- a/src/sandbox/install.rs
+++ b/src/sandbox/install.rs
@@ -68,26 +68,29 @@ fn write_json(path: &Path, value: &Value) -> io::Result<()> {
     fs::write(path, text)
 }
 
-fn marker_allowed_roots(workspace_root: &Path, skills_dir: &Path) -> Vec<String> {
+/// The guard's allowed write roots: the isolated env (`stage_root`, the
+/// agent-under-test's cwd) and the OS temp dir. The staged skills dir
+/// (`stage_root/.claude/skills` or `.agents/skills`) and the per-task outputs dir
+/// both live *inside* `stage_root`, so a single env root covers every legitimate
+/// agent write. Scoping to the env — not the parent `.eval-magic/` — keeps the
+/// guard boundary identical to the isolation boundary: the agent can't reach a
+/// sibling iteration or the `iteration-N/` meta tree above its cwd. eval-magic's own
+/// above-env writes (e.g. `benchmark.json`) are not gated here: they come from
+/// `eval-magic` subprocesses, which the guard's Bash classifier passes as non-mutating.
+fn marker_allowed_roots(stage_root: &Path) -> Vec<String> {
     vec![
-        absolutize(workspace_root).display().to_string(),
-        absolutize(skills_dir).display().to_string(),
+        absolutize(stage_root).display().to_string(),
         absolutize(&std::env::temp_dir()).display().to_string(),
     ]
 }
 
-fn write_marker(
-    marker_path: &Path,
-    workspace_root: &Path,
-    skills_dir: &Path,
-    ttl: Option<Duration>,
-) -> io::Result<()> {
+fn write_marker(marker_path: &Path, stage_root: &Path, ttl: Option<Duration>) -> io::Result<()> {
     let expires_ms = now_ms() + ttl.unwrap_or(GUARD_TTL).as_millis() as i64;
     write_json(
         marker_path,
         &json!({
             "active": true,
-            "allowedRoots": marker_allowed_roots(workspace_root, skills_dir),
+            "allowedRoots": marker_allowed_roots(stage_root),
             "expiresAt": iso_millis(expires_ms),
         }),
     )
@@ -120,16 +123,14 @@ fn write_manifest(
 /// `std::env::current_exe()`); `ttl` overrides the default 6h lifetime.
 pub fn install_guard(
     stage_root: &Path,
-    workspace_root: &Path,
     guard_exe: &Path,
     ttl: Option<Duration>,
 ) -> io::Result<PathBuf> {
-    install_claude_guard(stage_root, workspace_root, guard_exe, ttl)
+    install_claude_guard(stage_root, guard_exe, ttl)
 }
 
 pub(crate) fn install_claude_guard(
     stage_root: &Path,
-    workspace_root: &Path,
     guard_exe: &Path,
     ttl: Option<Duration>,
 ) -> io::Result<PathBuf> {
@@ -137,7 +138,7 @@ pub(crate) fn install_claude_guard(
     fs::create_dir_all(&skills_dir)?;
 
     let marker_path = skills_dir.join(GUARD_MARKER);
-    write_marker(&marker_path, workspace_root, &skills_dir, ttl)?;
+    write_marker(&marker_path, stage_root, ttl)?;
 
     let settings_path = stage_root.join(".claude").join("settings.local.json");
     let settings_existed = settings_path.exists();
@@ -189,7 +190,6 @@ pub(crate) fn install_claude_guard(
 
 pub(crate) fn install_codex_guard(
     stage_root: &Path,
-    workspace_root: &Path,
     guard_exe: &Path,
     ttl: Option<Duration>,
 ) -> io::Result<PathBuf> {
@@ -197,7 +197,7 @@ pub(crate) fn install_codex_guard(
     fs::create_dir_all(&skills_dir)?;
 
     let marker_path = skills_dir.join(GUARD_MARKER);
-    write_marker(&marker_path, workspace_root, &skills_dir, ttl)?;
+    write_marker(&marker_path, stage_root, ttl)?;
 
     let hooks_path = stage_root.join(".codex").join("hooks.json");
     if let Some(parent) = hooks_path.parent() {
@@ -324,18 +324,15 @@ mod tests {
     struct Case {
         _tmp: TempDir,
         stage_root: PathBuf,
-        workspace_root: PathBuf,
     }
 
     fn setup() -> Case {
         let tmp = TempDir::new().unwrap();
         let stage_root = tmp.path().join("stage");
         fs::create_dir_all(&stage_root).unwrap();
-        let workspace_root = stage_root.join("skills-workspace");
         Case {
             _tmp: tmp,
             stage_root,
-            workspace_root,
         }
     }
 
@@ -359,7 +356,7 @@ mod tests {
     fn install_writes_an_active_marker_hook_and_manifest() {
         let c = setup();
         let exe = Path::new("/g/eval-magic");
-        install_guard(&c.stage_root, &c.workspace_root, exe, None).unwrap();
+        install_guard(&c.stage_root, exe, None).unwrap();
 
         let marker = read_json(&skills_dir(&c.stage_root).join(GUARD_MARKER));
         assert_eq!(marker["active"], json!(true));
@@ -368,12 +365,13 @@ mod tests {
             .unwrap()
             .timestamp_millis();
         assert!(exp_ms > now_ms());
+        let env = absolutize(&c.stage_root).display().to_string();
         assert!(
             marker["allowedRoots"]
                 .as_array()
                 .unwrap()
                 .iter()
-                .any(|r| r.as_str().unwrap().contains("skills-workspace"))
+                .any(|r| r.as_str().unwrap() == env)
         );
 
         let settings = read_json(&settings_path(&c.stage_root));
@@ -389,11 +387,37 @@ mod tests {
         assert!(skills_dir(&c.stage_root).join(GUARD_MANIFEST).exists());
     }
 
+    #[test]
+    fn marker_scopes_allowed_roots_to_the_env_and_temp_only() {
+        let c = setup();
+        let exe = Path::new("/g/eval-magic");
+        install_guard(&c.stage_root, exe, None).unwrap();
+
+        let marker = read_json(&skills_dir(&c.stage_root).join(GUARD_MARKER));
+        let roots: Vec<String> = marker["allowedRoots"]
+            .as_array()
+            .unwrap()
+            .iter()
+            .map(|r| r.as_str().unwrap().to_string())
+            .collect();
+
+        // The guard boundary is the isolated env (stage_root) plus temp — nothing
+        // above it. The parent workspace tree must NOT be an allowed root, or the
+        // agent could write into sibling iterations / the meta dir above `env/`.
+        let env = absolutize(&c.stage_root).display().to_string();
+        let temp = absolutize(&std::env::temp_dir()).display().to_string();
+        assert_eq!(roots, vec![env, temp]);
+        assert!(
+            !roots.iter().any(|r| r.ends_with(".eval-magic")),
+            "workspace_root must not be an allowed root: {roots:?}"
+        );
+    }
+
     #[test]
     fn hook_command_invokes_the_binary_guard_subcommand() {
         let c = setup();
         let exe = Path::new("/g/eval-magic");
-        let marker = install_guard(&c.stage_root, &c.workspace_root, exe, None).unwrap();
+        let marker = install_guard(&c.stage_root, exe, None).unwrap();
         let settings = read_json(&settings_path(&c.stage_root));
         let command = settings["hooks"]["PreToolUse"][0]["hooks"][0]["command"]
             .as_str()
@@ -409,7 +433,7 @@ mod tests {
     fn teardown_deletes_settings_it_created() {
         let c = setup();
         let exe = Path::new("/g/eval-magic");
-        install_guard(&c.stage_root, &c.workspace_root, exe, None).unwrap();
+        install_guard(&c.stage_root, exe, None).unwrap();
         assert!(settings_path(&c.stage_root).exists());
 
         assert!(teardown_guard(&c.stage_root));
@@ -432,7 +456,7 @@ mod tests {
         fs::write(settings_path(&c.stage_root), &original).unwrap();
 
         let exe = Path::new("/g/eval-magic");
-        install_guard(&c.stage_root, &c.workspace_root, exe, None).unwrap();
+        install_guard(&c.stage_root, exe, None).unwrap();
         // hook present while armed
         assert!(
             fs::read_to_string(settings_path(&c.stage_root))
@@ -456,24 +480,12 @@ mod tests {
     #[test]
     fn guard_is_armed_detects_claude_or_codex_marker() {
         let c = setup();
-        install_guard(
-            &c.stage_root,
-            &c.workspace_root,
-            Path::new("/g/eval-magic"),
-            None,
-        )
-        .unwrap();
+        install_guard(&c.stage_root, Path::new("/g/eval-magic"), None).unwrap();
         assert!(guard_is_armed(&c.stage_root));
         teardown_guard(&c.stage_root);
         assert!(!guard_is_armed(&c.stage_root));
 
-        install_codex_guard(
-            &c.stage_root,
-            &c.workspace_root,
-            Path::new("/g/eval-magic"),
-            None,
-        )
-        .unwrap();
+        install_codex_guard(&c.stage_root, Path::new("/g/eval-magic"), None).unwrap();
         assert!(guard_is_armed(&c.stage_root));
     }
 
@@ -520,7 +532,7 @@ mod tests {
     fn codex_install_writes_project_hook_marker_and_manifest() {
         let c = setup();
         let exe = Path::new("/g/eval-magic");
-        install_codex_guard(&c.stage_root, &c.workspace_root, exe, None).unwrap();
+        install_codex_guard(&c.stage_root, exe, None).unwrap();
 
         let marker = read_json(
             &c.stage_root
@@ -529,12 +541,15 @@ mod tests {
                 .join(GUARD_MARKER),
         );
         assert_eq!(marker["active"], json!(true));
+        // The Codex guard shares the env-scoped roots: the staged `.agents/skills`
+        // dir lives inside `stage_root`, so the single env root already covers it.
+        let env = absolutize(&c.stage_root).display().to_string();
         assert!(
             marker["allowedRoots"]
                 .as_array()
                 .unwrap()
                 .iter()
-                .any(|r| r.as_str().unwrap().contains(".agents/skills"))
+                .any(|r| r.as_str().unwrap() == env)
         );
 
         let hooks = read_json(&codex_hooks_path(&c.stage_root));
@@ -575,13 +590,7 @@ mod tests {
         );
         fs::write(codex_hooks_path(&c.stage_root), &original).unwrap();
 
-        install_codex_guard(
-            &c.stage_root,
-            &c.workspace_root,
-            Path::new("/g/eval-magic"),
-            None,
-        )
-        .unwrap();
+        install_codex_guard(&c.stage_root, Path::new("/g/eval-magic"), None).unwrap();
         assert!(
             fs::read_to_string(codex_hooks_path(&c.stage_root))
                 .unwrap()
diff --git a/src/sandbox/policy.rs b/src/sandbox/policy.rs
index 6f5b99e..02e9d0a 100644
--- a/src/sandbox/policy.rs
+++ b/src/sandbox/policy.rs
@@ -51,7 +51,7 @@ static BASH_MUTATION_PATTERNS: LazyLock<Vec<(Regex, &'static str)>> = LazyLock::
             "path under .claude",
         ),
         // The same create verbs whose operand is a top-level `skills/` directory —
-        // catches a bare `skills/` left in the cwd. `skills-workspace` and other
+        // catches a bare `skills/` left in the cwd. `skills-data` and other
         // `skills`-prefixed names are excluded by the trailing `/`, whitespace, or
         // end-of-string boundary.
         (
@@ -156,7 +156,7 @@ fn absolutize(target: &str, repo_root: &Path) -> std::path::PathBuf {
 
 /// True when `target` resolves to `dir` or a descendant of it. Relative `target`s
 /// resolve against `repo_root`. `Path::starts_with` matches whole path
-/// components, so `skills-workspace2` is correctly not under `skills-workspace`.
+/// components, so `.eval-magic2` is correctly not under `.eval-magic`.
 pub fn is_under(target: &str, dir: &str, repo_root: &Path) -> bool {
     let base = absolutize(dir, repo_root);
     let abs = absolutize(target, repo_root);
@@ -189,7 +189,7 @@ mod tests {
     use super::*;
     use serde_json::json;
 
-    const ROOTS: [&str; 2] = ["/work/skills-workspace", "/work/.claude/skills"];
+    const ROOTS: [&str; 2] = ["/work/.eval-magic", "/work/.claude/skills"];
 
     fn roots() -> Vec<String> {
         ROOTS.iter().map(|s| s.to_string()).collect()
@@ -243,37 +243,21 @@ mod tests {
     #[test]
     fn is_under_matches_dir_and_descendants() {
         let repo = Path::new("/work");
+        assert!(is_under("/work/.eval-magic", "/work/.eval-magic", repo));
         assert!(is_under(
-            "/work/skills-workspace",
-            "/work/skills-workspace",
-            repo
-        ));
-        assert!(is_under(
-            "/work/skills-workspace/x/out.md",
-            "/work/skills-workspace",
-            repo
-        ));
-        assert!(!is_under(
-            "/work/runner/run.ts",
-            "/work/skills-workspace",
-            repo
-        ));
-        // `skills-workspace2` is not under `skills-workspace` (separator boundary).
-        assert!(!is_under(
-            "/work/skills-workspace2/x",
-            "/work/skills-workspace",
+            "/work/.eval-magic/x/out.md",
+            "/work/.eval-magic",
             repo
         ));
+        assert!(!is_under("/work/runner/run.ts", "/work/.eval-magic", repo));
+        // `.eval-magic2` is not under `.eval-magic` (separator boundary).
+        assert!(!is_under("/work/.eval-magic2/x", "/work/.eval-magic", repo));
     }
 
     #[test]
     fn is_under_resolves_relative_targets_against_repo_root() {
         let repo = Path::new("/work");
-        assert!(is_under(
-            "skills-workspace/x",
-            "/work/skills-workspace",
-            repo
-        ));
+        assert!(is_under(".eval-magic/x", "/work/.eval-magic", repo));
     }
 
     #[test]
@@ -303,7 +287,7 @@ mod tests {
     fn classify_bash_allows_scoped_and_readonly_commands() {
         // Textually references an allowed root → scoped → allowed.
         assert_eq!(
-            classify_bash("echo hi > /work/skills-workspace/x/log", &roots()),
+            classify_bash("echo hi > /work/.eval-magic/x/log", &roots()),
             None
         );
         assert_eq!(classify_bash("ls -la /", &roots()), None);
diff --git a/src/validation/evals.rs b/src/validation/evals.rs
index 3e5b97c..388e302 100644
--- a/src/validation/evals.rs
+++ b/src/validation/evals.rs
@@ -74,6 +74,45 @@ mod tests {
         assert!(err.contains("skill_should_trigger"), "error was: {err}");
     }
 
+    #[test]
+    fn accepts_isolation_isolated() {
+        let mut config = base();
+        config["evals"][0]["isolation"] = json!("isolated");
+        let parsed = validate_evals_config(&config, "evals.json").unwrap();
+        assert_eq!(
+            parsed.evals[0].isolation,
+            Some(crate::core::Isolation::Isolated)
+        );
+    }
+
+    #[test]
+    fn accepts_isolation_shared() {
+        let mut config = base();
+        config["evals"][0]["isolation"] = json!("shared");
+        let parsed = validate_evals_config(&config, "evals.json").unwrap();
+        assert_eq!(
+            parsed.evals[0].isolation,
+            Some(crate::core::Isolation::Shared)
+        );
+    }
+
+    #[test]
+    fn defaults_isolation_to_none_when_absent() {
+        let config = base();
+        let parsed = validate_evals_config(&config, "evals.json").unwrap();
+        assert_eq!(parsed.evals[0].isolation, None);
+    }
+
+    #[test]
+    fn rejects_an_unknown_isolation_value() {
+        let mut config = base();
+        config["evals"][0]["isolation"] = json!("sometimes");
+        let err = validate_evals_config(&config, "evals.json")
+            .unwrap_err()
+            .to_string();
+        assert!(err.contains("isolation"), "error was: {err}");
+    }
+
     #[test]
     fn rejects_a_non_kebab_case_id() {
         let mut config = base();
diff --git a/src/workspace/promote.rs b/src/workspace/promote.rs
index 4688b16..d4193fa 100644
--- a/src/workspace/promote.rs
+++ b/src/workspace/promote.rs
@@ -269,7 +269,7 @@ fn provenance(opts: &PromoteOptions, conditions: Option<&ConditionsRecord>, head
             "`eval-magic promote-baseline --iteration {}` after aggregating. The ephemeral workspace (run records, timing,",
             opts.iteration
         ),
-        "dispatch files, produced outputs) stays gitignored under `skills-workspace/`".to_string(),
+        "dispatch files, produced outputs) stays gitignored under `.eval-magic/`".to_string(),
         "and is reclaimable by `eval-magic teardown` once promoted (this commit's marker)."
             .to_string(),
         String::new(),
@@ -322,7 +322,7 @@ mod tests {
             &skill_subdir.join("SKILL.md"),
             "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
         );
-        let workspace_root = tmp.path().join("work").join("skills-workspace");
+        let workspace_root = tmp.path().join("work").join(".eval-magic");
         let iteration_dir = workspace_root
             .join("mr-review")
             .join(format!("iteration-{iteration}"));
diff --git a/src/workspace/snapshot.rs b/src/workspace/snapshot.rs
index 861124f..11455fa 100644
--- a/src/workspace/snapshot.rs
+++ b/src/workspace/snapshot.rs
@@ -1,7 +1,7 @@
 //! Skill snapshotting.
 //!
 //! Capture a skill's `SKILL.md` plus
-//! sibling assets into `skills-workspace/<skill>/snapshots/<label>/`, either from
+//! sibling assets into `.eval-magic/<skill>/snapshots/<label>/`, either from
 //! the working tree or — read straight from the git object database without
 //! touching the working tree — as it existed at a git ref. The
 //! `evals/` directory is always excluded; a `.snapshot-meta.json` records the
@@ -225,7 +225,7 @@ mod tests {
         // Working tree diverges to v2; the commit still holds v1.
         write(&skill_subdir.join("SKILL.md"), "v2 working tree\n");
 
-        let workspace_root = root.join("work").join("skills-workspace");
+        let workspace_root = root.join("work").join(".eval-magic");
         Repo {
             _tmp: tmp,
             skill_subdir,
diff --git a/src/workspace/teardown.rs b/src/workspace/teardown.rs
index e4920be..fb9f649 100644
--- a/src/workspace/teardown.rs
+++ b/src/workspace/teardown.rs
@@ -1,7 +1,7 @@
 //! End-of-run workspace cleanup.
 //!
 //! Reclaim a skill's ephemeral
-//! `skills-workspace/<skill>/` subtree without ever destroying results the user
+//! `.eval-magic/<skill>/` subtree without ever destroying results the user
 //! hasn't moved into version control.
 
 use std::fs;
@@ -42,7 +42,7 @@ pub struct WorkspaceCleanupSummary {
 /// The reason string attached to a kept, unpromoted iteration.
 const UNCOMMITTED_REASON: &str = "uncommitted results — not promoted to evals/baseline/";
 
-/// End-of-run cleanup of a skill's `skills-workspace/<skill>/` subtree.
+/// End-of-run cleanup of a skill's `.eval-magic/<skill>/` subtree.
 ///
 /// Per iteration: promoted (marker present) → removed; unpromoted but holding
 /// captured results → kept and reported; unpromoted scaffolding → removed. Per
@@ -251,7 +251,7 @@ mod tests {
     #[test]
     fn removes_promoted_iteration_and_prunes_workspace() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         let iter = make_iteration(
             &ws,
@@ -277,7 +277,7 @@ mod tests {
     #[test]
     fn keeps_unpromoted_iteration_with_benchmark_and_reports_it() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         let iter = make_iteration(
             &ws,
@@ -300,7 +300,7 @@ mod tests {
     #[test]
     fn keeps_unpromoted_iteration_with_only_a_run_record() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         let iter = make_iteration(
             &ws,
@@ -321,7 +321,7 @@ mod tests {
     #[test]
     fn removes_unpromoted_scaffolding_only_iteration() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         let iter = make_iteration(
             &ws,
@@ -342,7 +342,7 @@ mod tests {
     #[test]
     fn mixed_promoted_removed_kept_with_results_skill_dir_not_pruned() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         let promoted = make_iteration(
             &ws,
@@ -377,7 +377,7 @@ mod tests {
     #[test]
     fn removes_ref_snapshots_keeps_working_tree_and_legacy() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         let ref_snap = make_snapshot(&ws, "mr-review", "old-ref", Some("ref"));
         let wt_snap = make_snapshot(&ws, "mr-review", "wt", Some("working-tree"));
@@ -393,9 +393,9 @@ mod tests {
     }
 
     #[test]
-    fn never_touches_another_skills_workspace_and_leaves_root_intact() {
+    fn never_touches_another_skill_and_leaves_workspace_root_intact() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
         make_iteration(
             &ws,
@@ -426,7 +426,7 @@ mod tests {
     #[test]
     fn empty_summary_when_skill_has_no_workspace() {
         let tmp = TempDir::new().unwrap();
-        let ws = tmp.path().join("skills-workspace");
+        let ws = tmp.path().join(".eval-magic");
         fs::create_dir_all(&ws).unwrap();
 
         let summary = cleanup_workspace(&ws, "never-ran");
diff --git a/tests/cli/aggregate.rs b/tests/cli/aggregate.rs
index a47d97c..d7f51b4 100644
--- a/tests/cli/aggregate.rs
+++ b/tests/cli/aggregate.rs
@@ -25,7 +25,7 @@ fn setup_agg(
     let skill_md = skill_sub.join("SKILL.md").to_string_lossy().into_owned();
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     fs::create_dir_all(&iteration_dir).unwrap();
diff --git a/tests/cli/grade.rs b/tests/cli/grade.rs
index 655b3eb..6bbb434 100644
--- a/tests/cli/grade.rs
+++ b/tests/cli/grade.rs
@@ -52,7 +52,7 @@ fn grade_codex_staged_run_uses_llm_meta_check_with_skill_content() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     let cond_dir = iteration_dir.join("eval-pos-eval").join("with_skill");
@@ -126,7 +126,7 @@ fn grade_omits_meta_check_for_negative_evals() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -202,7 +202,7 @@ fn grade_emits_and_finalizes_per_nested_run_dir() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -304,7 +304,7 @@ fn grade_fails_fast_on_malformed_run_record() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -363,7 +363,7 @@ fn grade_writes_prompt_files_and_drops_inline_prompt() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -446,7 +446,7 @@ fn grade_finalize_folds_responses_into_grading() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     let cond_dir = iteration_dir.join("eval-pos-eval").join("with_skill");
diff --git a/tests/cli/grade_models.rs b/tests/cli/grade_models.rs
index c44beec..6c21e3c 100644
--- a/tests/cli/grade_models.rs
+++ b/tests/cli/grade_models.rs
@@ -53,7 +53,7 @@ fn grade_defaults_judge_tasks_to_recorded_judge_model() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     let cond_dir = iteration_dir.join("eval-pos-eval").join("with_skill");
@@ -86,7 +86,7 @@ fn grade_defaults_judge_tasks_to_recorded_judge_model() {
         .assert()
         .success();
     let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap();
-    assert!(stdout.contains("codex exec"));
+    assert!(stdout.contains("codex --ask-for-approval never exec"));
     assert!(stdout.contains("-m \"$model\""));
 
     let tasks: serde_json::Value =
diff --git a/tests/cli/guard.rs b/tests/cli/guard.rs
index 7efaee5..c05cd20 100644
--- a/tests/cli/guard.rs
+++ b/tests/cli/guard.rs
@@ -58,7 +58,7 @@ fn write_codex_armed_marker(
 #[test]
 fn guard_denies_out_of_bounds_write() {
     let tmp = TempDir::new().unwrap();
-    let marker = write_armed_marker(tmp.path(), &tmp.path().join("skills-workspace"));
+    let marker = write_armed_marker(tmp.path(), &tmp.path().join(".eval-magic"));
 
     skill_eval()
         .arg("guard")
@@ -73,7 +73,7 @@ fn guard_denies_out_of_bounds_write() {
 #[test]
 fn guard_allows_in_bounds_write() {
     let tmp = TempDir::new().unwrap();
-    let workspace = tmp.path().join("skills-workspace");
+    let workspace = tmp.path().join(".eval-magic");
     let marker = write_armed_marker(tmp.path(), &workspace);
 
     skill_eval()
@@ -91,7 +91,7 @@ fn guard_allows_in_bounds_write() {
 #[test]
 fn guard_codex_subcommand_blocks_with_codex_verdict_shape() {
     let tmp = TempDir::new().unwrap();
-    let marker = write_codex_armed_marker(tmp.path(), &tmp.path().join("skills-workspace"));
+    let marker = write_codex_armed_marker(tmp.path(), &tmp.path().join(".eval-magic"));
 
     skill_eval()
         .arg("guard-codex")
@@ -134,7 +134,7 @@ fn teardown_guard_reports_nothing_to_remove() {
 #[test]
 fn teardown_guard_removes_installed_guard() {
     let tmp = TempDir::new().unwrap();
-    write_armed_marker(tmp.path(), &tmp.path().join("skills-workspace"));
+    write_armed_marker(tmp.path(), &tmp.path().join(".eval-magic"));
 
     skill_eval()
         .arg("teardown-guard")
diff --git a/tests/cli/init.rs b/tests/cli/init.rs
index e3ceb86..687e494 100644
--- a/tests/cli/init.rs
+++ b/tests/cli/init.rs
@@ -84,7 +84,8 @@ fn init_from_skill_dir_prints_copy_pasteable_next_steps() {
         .assert()
         .success()
         .stdout(contains("  eval-magic run --skill-dir"))
-        .stdout(contains("--skill mr-review --guard"))
+        .stdout(contains("--skill mr-review --workspace-dir"))
+        .stdout(contains("--guard"))
         // ingest auto-resolves the subagents dir now, so the placeholder is gone.
         .stdout(contains("  eval-magic ingest --skill-dir"))
         .stdout(contains("--subagents-dir <subagents-dir>").not())
diff --git a/tests/cli/stray_writes.rs b/tests/cli/stray_writes.rs
index ad3c8b4..1e29f88 100644
--- a/tests/cli/stray_writes.rs
+++ b/tests/cli/stray_writes.rs
@@ -27,7 +27,7 @@ fn detect_stray_writes_reports_live_source_reads() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     let cond_dir = iteration_dir.join("eval-e1").join("old_skill");
@@ -119,7 +119,7 @@ fn detect_stray_writes_flags_unverifiable_when_nothing_was_inspected() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     let cond_dir = iteration_dir.join("eval-e1").join("old_skill");
@@ -174,6 +174,216 @@ fn detect_stray_writes_flags_unverifiable_when_nothing_was_inspected() {
         .stdout(contains("No out-of-bounds").not());
 }
 
+/// Without a `dispatch.json` outputs_dir for the run, the detector must NOT
+/// fabricate the old flat-layout boundary (`<cond_dir>/outputs`). Under the
+/// isolated env layout the agent writes into `env/.eval-magic-outputs/...`, an
+/// absolute path only `dispatch.json` carries; guessing the old convention would
+/// mis-flag every legitimate write as a violation. The detector instead skips
+/// out-of-bounds write classification for that run and logs why.
+#[test]
+fn detect_stray_writes_skips_write_classification_without_dispatch_outputs_dir() {
+    use serde_json::json;
+
+    let tmp = TempDir::new().unwrap();
+    let root = fs::canonicalize(tmp.path()).unwrap();
+    let skill_dir = root.join("skill-dir");
+    let skill_sub = skill_dir.join("mr-review");
+    fs::create_dir_all(&skill_sub).unwrap();
+    fs::write(
+        skill_sub.join("SKILL.md"),
+        "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    )
+    .unwrap();
+    let skill_md = skill_sub.join("SKILL.md").to_string_lossy().into_owned();
+
+    let cwd = root.join("work");
+    let iteration_dir = cwd
+        .join(".eval-magic")
+        .join("mr-review")
+        .join("iteration-1");
+    let cond_dir = iteration_dir.join("eval-e1").join("old_skill");
+    fs::create_dir_all(&cond_dir).unwrap();
+
+    fs::write(
+        iteration_dir.join("conditions.json"),
+        serde_json::to_string(&json!({
+            "mode": "revision",
+            "conditions": [
+                {"name": "old_skill", "skill_path": skill_md},
+                {"name": "new_skill", "skill_path": skill_md},
+            ],
+            "timestamp": "2026-06-08T00:00:00.000Z",
+            "harness": "claude-code",
+        }))
+        .unwrap(),
+    )
+    .unwrap();
+
+    // The agent wrote into the isolated env's outputs tree — the real new-layout
+    // location, which is NOT under the old `<cond_dir>/outputs` fallback path.
+    let env_output = iteration_dir
+        .join("env")
+        .join(".eval-magic-outputs")
+        .join("eval-e1")
+        .join("old_skill")
+        .join("answer.md")
+        .to_string_lossy()
+        .into_owned();
+
+    // No dispatch.json is written: the run has no recorded outputs_dir.
+    fs::write(
+        cond_dir.join("run.json"),
+        serde_json::to_string(&json!({
+            "eval_id": "e1",
+            "condition": "old_skill",
+            "skill_path": skill_md,
+            "prompt": "do the task",
+            "files": [],
+            "final_message": "done",
+            "tool_invocations": [
+                {"name": "Write", "args": {"file_path": env_output}, "ordinal": 0},
+            ],
+            "total_tokens": null,
+            "duration_ms": null,
+        }))
+        .unwrap(),
+    )
+    .unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .arg("detect-stray-writes")
+        .arg("--skill-dir")
+        .arg(&skill_dir)
+        .arg("--skill")
+        .arg("mr-review")
+        .arg("--iteration")
+        .arg("1")
+        .assert()
+        .success()
+        .stderr(contains("no outputs_dir in dispatch.json"));
+
+    let report: serde_json::Value =
+        serde_json::from_str(&fs::read_to_string(iteration_dir.join("stray-writes.json")).unwrap())
+            .unwrap();
+    // The env-layout write is NOT mis-flagged: with no known boundary the detector
+    // refuses to guess rather than fabricating a wrong one.
+    assert_eq!(report["totals"]["violations"], json!(0));
+}
+
+/// With `dispatch.json` carrying the env-layout outputs_dir
+/// (`env/.eval-magic-outputs/...`), the detector classifies against that real
+/// boundary: a write inside it is clean, a write elsewhere in the env (the realistic
+/// repo, outside outputs) is a violation under the outputs-only contract.
+#[test]
+fn detect_stray_writes_uses_env_layout_outputs_dir_from_dispatch() {
+    use serde_json::json;
+
+    let tmp = TempDir::new().unwrap();
+    let root = fs::canonicalize(tmp.path()).unwrap();
+    let skill_dir = root.join("skill-dir");
+    let skill_sub = skill_dir.join("mr-review");
+    fs::create_dir_all(&skill_sub).unwrap();
+    fs::write(
+        skill_sub.join("SKILL.md"),
+        "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    )
+    .unwrap();
+    let skill_md = skill_sub.join("SKILL.md").to_string_lossy().into_owned();
+
+    let cwd = root.join("work");
+    let iteration_dir = cwd
+        .join(".eval-magic")
+        .join("mr-review")
+        .join("iteration-1");
+    let cond_dir = iteration_dir.join("eval-e1").join("old_skill");
+    fs::create_dir_all(&cond_dir).unwrap();
+
+    // The isolated env's outputs tree — where the agent is supposed to write.
+    let outputs_dir = iteration_dir
+        .join("env")
+        .join(".eval-magic-outputs")
+        .join("eval-e1")
+        .join("old_skill");
+    let in_bounds = outputs_dir.join("answer.md").to_string_lossy().into_owned();
+    // A write elsewhere inside the env (the realistic repo), outside outputs.
+    let stray = iteration_dir
+        .join("env")
+        .join("notes.md")
+        .to_string_lossy()
+        .into_owned();
+
+    fs::write(
+        iteration_dir.join("conditions.json"),
+        serde_json::to_string(&json!({
+            "mode": "revision",
+            "conditions": [
+                {"name": "old_skill", "skill_path": skill_md},
+                {"name": "new_skill", "skill_path": skill_md},
+            ],
+            "timestamp": "2026-06-08T00:00:00.000Z",
+            "harness": "claude-code",
+        }))
+        .unwrap(),
+    )
+    .unwrap();
+
+    // dispatch.json carries the absolute env-layout outputs_dir for the run.
+    fs::write(
+        iteration_dir.join("dispatch.json"),
+        serde_json::to_string(&json!({
+            "tasks": [
+                {
+                    "eval_id": "e1",
+                    "condition": "old_skill",
+                    "outputs_dir": outputs_dir.to_string_lossy(),
+                }
+            ],
+        }))
+        .unwrap(),
+    )
+    .unwrap();
+
+    fs::write(
+        cond_dir.join("run.json"),
+        serde_json::to_string(&json!({
+            "eval_id": "e1",
+            "condition": "old_skill",
+            "skill_path": skill_md,
+            "prompt": "do the task",
+            "files": [],
+            "final_message": "done",
+            "tool_invocations": [
+                {"name": "Write", "args": {"file_path": in_bounds}, "ordinal": 0},
+                {"name": "Write", "args": {"file_path": stray}, "ordinal": 1},
+            ],
+            "total_tokens": null,
+            "duration_ms": null,
+        }))
+        .unwrap(),
+    )
+    .unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .arg("detect-stray-writes")
+        .arg("--skill-dir")
+        .arg(&skill_dir)
+        .arg("--skill")
+        .arg("mr-review")
+        .arg("--iteration")
+        .arg("1")
+        .assert()
+        .success();
+
+    let report: serde_json::Value =
+        serde_json::from_str(&fs::read_to_string(iteration_dir.join("stray-writes.json")).unwrap())
+            .unwrap();
+    assert_eq!(report["totals"]["violations"], json!(1));
+    assert_eq!(report["runs"].as_array().unwrap().len(), 1);
+    assert_eq!(report["runs"][0]["violations"][0]["path"], json!(stray));
+}
+
 /// `detect-stray-writes` scans every `run-<k>` subdirectory of a condition cell
 /// and tags each report entry with its run index.
 #[test]
@@ -194,7 +404,7 @@ fn detect_stray_writes_scans_nested_run_dirs_and_reports_run_index() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     let cond_dir = iteration_dir.join("eval-e1").join("old_skill");
diff --git a/tests/cli/workspace.rs b/tests/cli/workspace.rs
index b0c7980..3ccfc2c 100644
--- a/tests/cli/workspace.rs
+++ b/tests/cli/workspace.rs
@@ -40,7 +40,7 @@ fn promote_baseline_copies_artifacts_and_reports() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-2");
     let cond_dir = iteration_dir.join("eval-e1").join("with_skill");
@@ -83,7 +83,7 @@ fn promote_baseline_captures_multi_run_gradings() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-2");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -130,7 +130,7 @@ fn promote_baseline_warns_when_run_cells_missing_gradings() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-2");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -165,7 +165,7 @@ fn promote_baseline_writes_notes_stub_and_reports_it() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-1");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -203,7 +203,7 @@ fn promote_baseline_warns_when_prior_notes_retained() {
 
     let cwd = root.join("work");
     let iteration_dir = cwd
-        .join("skills-workspace")
+        .join(".eval-magic")
         .join("mr-review")
         .join("iteration-2");
     fs::create_dir_all(&iteration_dir).unwrap();
@@ -263,7 +263,7 @@ fn snapshot_working_tree_copies_and_records_provenance() {
         .success()
         .stdout(contains("Snapshotted mr-review →"));
 
-    let snap = cwd.join("skills-workspace/mr-review/snapshots/wt");
+    let snap = cwd.join(".eval-magic/mr-review/snapshots/wt");
     assert_eq!(
         fs::read_to_string(snap.join("SKILL.md")).unwrap(),
         "v2 working tree\n"
@@ -291,8 +291,7 @@ fn snapshot_defaults_to_baseline_label() {
         .stdout(contains("Snapshotted mr-review"));
 
     assert_eq!(
-        fs::read_to_string(cwd.join("skills-workspace/mr-review/snapshots/baseline/SKILL.md"))
-            .unwrap(),
+        fs::read_to_string(cwd.join(".eval-magic/mr-review/snapshots/baseline/SKILL.md")).unwrap(),
         "v2 working tree\n"
     );
 }
@@ -320,7 +319,7 @@ fn snapshot_ref_reads_committed_content() {
         .success()
         .stdout(contains("Snapshotted mr-review at HEAD →"));
 
-    let snap = cwd.join("skills-workspace/mr-review/snapshots/old");
+    let snap = cwd.join(".eval-magic/mr-review/snapshots/old");
     assert_eq!(
         fs::read_to_string(snap.join("SKILL.md")).unwrap(),
         "v1 baseline\n"
@@ -345,7 +344,7 @@ fn teardown_reclaims_promoted_and_keeps_uncommitted() {
     let (skill_dir, _skill_sub) = write_skill_md(&root, "---\nname: mr-review\n---\nbody\n");
 
     let cwd = root.join("work");
-    let skill_ws = cwd.join("skills-workspace").join("mr-review");
+    let skill_ws = cwd.join(".eval-magic").join("mr-review");
     let promoted = skill_ws.join("iteration-1");
     let kept = skill_ws.join("iteration-2");
     fs::create_dir_all(&promoted).unwrap();
diff --git a/tests/run/claude_cli.rs b/tests/run/claude_cli.rs
new file mode 100644
index 0000000..55a167b
--- /dev/null
+++ b/tests/run/claude_cli.rs
@@ -0,0 +1,369 @@
+//! Claude Code CLI run modes (`--run-mode hybrid` / `headless`): `claude -p` stream-json
+//! dispatch guidance, run-mode persistence + defaulting, the human-followed runbook,
+//! `record-runs` without a session id, the write guard and the plugin-shadow preflight
+//! under Cli dispatch, and the remaining run-mode combo rejection (Codex + `interactive`).
+
+use crate::helpers::*;
+use predicates::str::contains;
+use std::fs;
+
+#[test]
+fn claude_hybrid_dispatch_guidance_uses_claude_p() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    let assert = skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--mode",
+            "new-skill",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "hybrid",
+        ])
+        .assert()
+        .success();
+    let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap();
+
+    assert!(stdout.contains("claude -p --output-format stream-json"));
+    assert!(stdout.contains("--verbose"));
+    assert!(stdout.contains("cd <eval-root>"));
+    assert!(stdout.contains("claude-events.jsonl"));
+    assert!(!stdout.contains("--output-last-message"));
+
+    let manifest = read_str(&iteration_dir(&cwd).join("dispatch-manifest.md"));
+    assert!(manifest.contains("claude -p --output-format stream-json"));
+    assert!(manifest.contains("claude-events.jsonl"));
+    assert!(manifest.contains("xargs -0 -P"));
+
+    let conditions = read_json(&iteration_dir(&cwd).join("conditions.json"));
+    assert_eq!(conditions["harness"], "claude-code");
+    assert_eq!(conditions["run_mode"], "hybrid");
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    assert_eq!(dispatch["run_mode"], "hybrid");
+}
+
+#[test]
+fn claude_hybrid_dispatch_guidance_includes_agent_model_when_provided() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    let assert = skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "hybrid",
+            "--agent-model",
+            "opus",
+        ])
+        .assert()
+        .success();
+    let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap();
+    assert!(stdout.contains("claude -p --output-format stream-json"));
+    assert!(stdout.contains("--model opus"));
+}
+
+#[test]
+fn claude_defaults_to_interactive_handoff() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--dry-run",
+        ])
+        .assert()
+        .success();
+
+    // No --run-mode → interactive default; no CLI recipe in the manifest.
+    let conditions = read_json(&iteration_dir(&cwd).join("conditions.json"));
+    assert_eq!(conditions["run_mode"], "interactive");
+    let manifest = read_str(&iteration_dir(&cwd).join("dispatch-manifest.md"));
+    assert!(!manifest.contains("claude -p"));
+}
+
+#[test]
+fn claude_hybrid_runbook_is_human_followed_cli_recipe() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "hybrid",
+            "--dry-run",
+        ])
+        .assert()
+        .success();
+
+    // Cli dispatches from per-(group, condition) envs, so the human-followed
+    // runbook lives in the iteration dir, not a single env/.
+    let runbook = read_str(&iteration_dir(&cwd).join("RUNBOOK.md"));
+    assert!(
+        runbook.contains("human driving"),
+        "hybrid uses the human-followed template: {runbook}"
+    );
+    assert!(
+        runbook.contains("claude -p"),
+        "carries the claude -p dispatch recipe: {runbook}"
+    );
+}
+
+#[test]
+fn claude_headless_records_mode_and_human_runbook() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "headless",
+            "--dry-run",
+        ])
+        .assert()
+        .success();
+
+    // Headless rides the same Cli mechanism as hybrid; the run mode is persisted
+    // distinctly so every post-dispatch command can carry it.
+    let conditions = read_json(&iteration_dir(&cwd).join("conditions.json"));
+    assert_eq!(conditions["run_mode"], "headless");
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    assert_eq!(dispatch["run_mode"], "headless");
+    let manifest = read_str(&iteration_dir(&cwd).join("dispatch-manifest.md"));
+    assert!(manifest.contains("claude -p --output-format stream-json"));
+
+    // The runbook is the shared human-followed template carrying the claude -p
+    // recipe and headless-threaded pipeline commands. Cli has no single env/, so
+    // it lives in the iteration dir.
+    let runbook = read_str(&iteration_dir(&cwd).join("RUNBOOK.md"));
+    assert!(
+        runbook.contains("human driving"),
+        "headless uses the human-followed template: {runbook}"
+    );
+    assert!(
+        runbook.contains("claude -p"),
+        "carries the claude -p dispatch recipe: {runbook}"
+    );
+    assert!(
+        runbook.contains("--harness claude-code"),
+        "pipeline commands carry --harness claude-code: {runbook}"
+    );
+    assert!(
+        runbook.contains("--run-mode headless"),
+        "pipeline commands carry the headless run mode: {runbook}"
+    );
+    assert!(
+        !runbook.contains("{{"),
+        "no unsubstituted tokens: {runbook}"
+    );
+}
+
+#[test]
+fn claude_hybrid_record_runs_does_not_require_a_session_id() {
+    // Regression: hybrid/headless ride the Cli mechanism and read each task's
+    // claude-events.jsonl, never the in-session subagents dir. Resolving that dir
+    // is gated on the dispatch mechanism, not the harness, so `record-runs` in
+    // hybrid mode must NOT bail on a missing CLAUDE_CODE_SESSION_ID — the way the
+    // old harness-keyed gate did for `--harness claude-code`. This is the
+    // documented headless path (no session at all).
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "hybrid",
+        ])
+        .assert()
+        .success();
+
+    // No session id in the environment, and none passed — the pre-fix code aborted
+    // here with "could not auto-resolve the subagents dir". The fix returns early
+    // for the Cli mechanism, so record-runs proceeds to its summary.
+    skill_eval()
+        .current_dir(&cwd)
+        .env_remove("CLAUDE_CODE_SESSION_ID")
+        .args(["record-runs", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--workspace-dir"])
+        .arg(cwd.join(".eval-magic"))
+        .args(["--harness", "claude-code", "--run-mode", "hybrid"])
+        .assert()
+        .success()
+        .stdout(contains("Recorded:"));
+}
+
+#[test]
+fn codex_rejects_run_mode_interactive() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "codex",
+            "--run-mode",
+            "interactive",
+            "--dry-run",
+        ])
+        .assert()
+        .failure()
+        .stderr(contains("interactive"))
+        .stderr(contains("codex"));
+}
+
+#[test]
+fn claude_cli_guard_installs_project_hook() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "hybrid",
+            "--guard",
+        ])
+        .assert()
+        .success();
+
+    // The guard is installed into EACH per-(group, condition) env (the agent-under-test's
+    // cwd): it writes the `.claude/settings.local.json` that each `claude -p` dispatch
+    // loads from that cwd, so a PreToolUse deny fires under Cli dispatch.
+    let with_env = cli_env_dir(&cwd, "g1", "with_skill");
+    let settings_path = with_env.join(".claude/settings.local.json");
+    assert!(settings_path.exists());
+    let settings = read_json(&settings_path);
+    let hook = &settings["hooks"]["PreToolUse"][0];
+    let command = hook["hooks"][0]["command"].as_str().unwrap();
+    assert!(
+        command.contains("guard") && !command.contains("guard-codex"),
+        "hook invokes the claude guard entry point: {settings}"
+    );
+    assert!(
+        hook["matcher"].as_str().unwrap().contains("Write"),
+        "hook matches write tools: {settings}"
+    );
+    assert!(
+        with_env
+            .join(".claude/skills/.slow-powers-eval-guard.json")
+            .exists()
+    );
+
+    // The control arm's env is independently guarded too, and — the gap fix — holds
+    // no staged skill slug at all (the skill is physically absent, not just unlisted).
+    let without_env = cli_env_dir(&cwd, "g1", "without_skill");
+    assert!(
+        without_env.join(".claude/settings.local.json").exists(),
+        "the without_skill env is guarded too"
+    );
+    assert!(
+        !without_env
+            .join(".claude/skills/slow-powers-eval-1-with_skill__mr-review")
+            .exists(),
+        "the control arm's env contains no staged skill slug"
+    );
+}
+
+#[test]
+fn cli_plugin_shadow_preflight_reads_per_env_project_settings() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    // The eval stages a project-local `.claude/settings.json` into its env (fixture).
+    let evals = r#"{ "skill_name": "mr-review", "evals": [ { "id": "e1", "prompt": "p", "expected_output": "o", "files": [".claude/settings.json"] } ] }"#;
+    let (skill_dir, cwd) = setup(tmp.path(), evals);
+
+    // Build a Claude config dir whose installed plugin provides a skill with the same
+    // name as the skill under test (SUT). The plugin is NOT enabled at config level —
+    // only the project-local `.claude/settings.json` (staged into each env as a fixture)
+    // enables it, so the preflight can see the override only by scanning the real staged
+    // env. Under Cli the legacy `env/` is never created; this test locks that case down.
+    let config = tmp.path().join("config");
+    let install = config.join("plugins/cache/shadowplug__test");
+    fs::create_dir_all(install.join("skills/mr-review")).unwrap();
+    fs::write(
+        install.join("skills/mr-review/SKILL.md"),
+        "---\nname: mr-review\ndescription: x\n---\n",
+    )
+    .unwrap();
+    fs::create_dir_all(config.join("plugins")).unwrap();
+    fs::write(
+        config.join("plugins/installed_plugins.json"),
+        format!(
+            "{{\"version\":2,\"plugins\":{{\"shadowplug@test\":[{{\"installPath\":{:?}}}]}}}}",
+            install.to_string_lossy()
+        ),
+    )
+    .unwrap();
+
+    // The fixture that, once staged into the env, enables the plugin project-locally.
+    // (No config-level settings.json — the plugin is enabled ONLY via the env's file.)
+    fs::create_dir_all(skill_dir.join("mr-review/evals/.claude")).unwrap();
+    fs::write(
+        skill_dir.join("mr-review/evals/.claude/settings.json"),
+        "{\"enabledPlugins\":{\"shadowplug@test\":true}}",
+    )
+    .unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .env("CLAUDE_CONFIG_DIR", &config)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "hybrid",
+        ])
+        .assert()
+        .success();
+
+    assert!(
+        iteration_dir(&cwd).join("plugin-shadow.json").exists(),
+        "preflight detected the project-enabled plugin shadow by scanning the staged env"
+    );
+}
diff --git a/tests/run/codex.rs b/tests/run/codex.rs
index 4cc9a59..75c50bf 100644
--- a/tests/run/codex.rs
+++ b/tests/run/codex.rs
@@ -64,8 +64,10 @@ fn codex_stages_repo_local_skills_under_agents() {
         .assert()
         .success();
 
+    // Codex rides Cli dispatch → per-(group, condition) envs. The skill stages into
+    // the with_skill env; the control arm's env carries the siblings but NOT the SUT.
     let slug = "slow-powers-eval-1-with_skill__mr-review";
-    let codex_skills = cwd.join(".agents/skills");
+    let codex_skills = cli_env_dir(&cwd, "g1", "with_skill").join(".agents/skills");
     assert!(read_str(&codex_skills.join(slug).join("SKILL.md")).contains(&format!("name: {slug}")));
     assert_eq!(
         read_str(&codex_skills.join("release-notes/helper.md")),
@@ -74,6 +76,11 @@ fn codex_stages_repo_local_skills_under_agents() {
     assert!(!codex_skills.join("release-notes/evals").exists());
     assert!(!cwd.join(".claude/skills").exists());
 
+    // The gap fix: the control arm's env never contains the skill-under-test.
+    let without_skills = cli_env_dir(&cwd, "g1", "without_skill").join(".agents/skills");
+    assert!(!without_skills.join(slug).exists());
+    assert!(without_skills.join("release-notes/SKILL.md").exists());
+
     let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
     let task = dispatch["tasks"]
         .as_array()
@@ -112,7 +119,10 @@ fn codex_supports_stage_name_when_staging() {
         .assert()
         .success();
 
-    assert!(read_str(&cwd.join(".agents/skills/mr-review/SKILL.md")).contains("name: mr-review"));
+    assert!(
+        read_str(&cli_env_dir(&cwd, "g1", "with_skill").join(".agents/skills/mr-review/SKILL.md"))
+            .contains("name: mr-review")
+    );
 }
 
 #[test]
@@ -171,7 +181,10 @@ fn codex_guard_installs_project_hook() {
         .success()
         .stdout(contains("--dangerously-bypass-hook-trust"));
 
-    let hooks_path = cwd.join(".codex/hooks.json");
+    // The guard installs into each per-(group, condition) env (the agent-under-test's
+    // cwd).
+    let with_env = cli_env_dir(&cwd, "g1", "with_skill");
+    let hooks_path = with_env.join(".codex/hooks.json");
     assert!(hooks_path.exists());
     let hooks = read_json(&hooks_path);
     let hook = &hooks["hooks"]["PreToolUse"][0];
@@ -182,7 +195,14 @@ fn codex_guard_installs_project_hook() {
             .contains("guard-codex")
     );
     assert!(
-        cwd.join(".agents/skills/.slow-powers-eval-guard.json")
+        with_env
+            .join(".agents/skills/.slow-powers-eval-guard.json")
+            .exists()
+    );
+    // The control arm's env is guarded too.
+    assert!(
+        cli_env_dir(&cwd, "g1", "without_skill")
+            .join(".codex/hooks.json")
             .exists()
     );
 }
@@ -208,14 +228,14 @@ fn codex_dispatch_guidance_detaches_stdin_and_logs_stderr() {
         .success();
     let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap();
 
-    assert!(stdout.contains("codex exec --cd <eval-root>"));
+    assert!(stdout.contains("codex --ask-for-approval never exec --cd <eval-root>"));
     assert!(stdout.contains("--dangerously-bypass-hook-trust"));
     assert!(stdout.contains("</dev/null"));
     assert!(stdout.contains("codex-events.jsonl"));
     assert!(stdout.contains("codex-stderr.log"));
 
     let manifest = read_str(&iteration_dir(&cwd).join("dispatch-manifest.md"));
-    assert!(manifest.contains("codex exec --cd <eval-root>"));
+    assert!(manifest.contains("codex --ask-for-approval never exec --cd <eval-root>"));
     assert!(manifest.contains("--dangerously-bypass-hook-trust"));
     assert!(manifest.contains("</dev/null"));
     assert!(manifest.contains("codex-events.jsonl"));
@@ -245,14 +265,14 @@ fn codex_dispatch_guidance_includes_agent_model_when_provided() {
         .success();
     let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap();
 
-    assert!(stdout.contains("codex exec --cd <eval-root>"));
+    assert!(stdout.contains("codex --ask-for-approval never exec --cd <eval-root>"));
     assert!(stdout.contains("-m gpt-5-mini"));
     assert!(stdout.contains("</dev/null"));
     assert!(stdout.contains("codex-events.jsonl"));
     assert!(stdout.contains("codex-stderr.log"));
 
     let manifest = read_str(&iteration_dir(&cwd).join("dispatch-manifest.md"));
-    assert!(manifest.contains("codex exec --cd <eval-root>"));
+    assert!(manifest.contains("codex --ask-for-approval never exec --cd <eval-root>"));
     assert!(manifest.contains("-m gpt-5-mini"));
     assert!(manifest.contains("xargs -0 -P"));
 }
@@ -277,11 +297,50 @@ fn codex_dispatch_guidance_omits_hook_bypass_when_unguarded() {
         .success();
     let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap();
 
-    assert!(stdout.contains("codex exec --cd <eval-root>"));
+    assert!(stdout.contains("codex --ask-for-approval never exec --cd <eval-root>"));
     assert!(stdout.contains("</dev/null"));
     assert!(!stdout.contains("--dangerously-bypass-hook-trust"));
 }
 
+#[test]
+fn codex_headless_records_mode_and_human_runbook() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "codex",
+            "--run-mode",
+            "headless",
+            "--dry-run",
+        ])
+        .assert()
+        .success();
+
+    let conditions = read_json(&iteration_dir(&cwd).join("conditions.json"));
+    assert_eq!(conditions["run_mode"], "headless");
+
+    // Cli has no single env/, so the human-followed runbook lives in the iteration dir.
+    let runbook = read_str(&iteration_dir(&cwd).join("RUNBOOK.md"));
+    assert!(
+        runbook.contains("human driving"),
+        "headless uses the human-followed template: {runbook}"
+    );
+    assert!(
+        runbook.contains("codex --ask-for-approval never exec"),
+        "carries the Codex CLI dispatch recipe: {runbook}"
+    );
+    assert!(
+        runbook.contains("--run-mode headless"),
+        "pipeline commands carry the headless run mode: {runbook}"
+    );
+}
+
 #[test]
 fn codex_rejects_unsupported_parity_features() {
     let tmp = tempfile::TempDir::new().unwrap();
diff --git a/tests/run/env_layout.rs b/tests/run/env_layout.rs
new file mode 100644
index 0000000..f0f6073
--- /dev/null
+++ b/tests/run/env_layout.rs
@@ -0,0 +1,313 @@
+//! Isolated-run env builder: staging redirects into the per-iteration
+//! `env/` dir, fixtures are copied in like a real repo, and `RUNBOOK.md` lives in
+//! the env. eval-magic meta stays above the env in `iteration-N/`.
+
+use crate::helpers::*;
+use serde_json::json;
+use std::fs;
+use std::path::Path;
+
+#[test]
+fn stages_into_env_not_cwd() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // Staging is redirected into env/.claude/skills; the invocation cwd stays clean.
+    let staged = env_staged_entries(&cwd);
+    assert_eq!(staged, vec!["slow-powers-eval-1-with_skill__mr-review"]);
+    let cwd_skills = cwd.join(".claude/skills");
+    assert!(
+        !cwd_skills.exists(),
+        "nothing should be staged at the invocation cwd anymore"
+    );
+    // dispatch.json is eval-magic meta: it sits in iteration-N/, never inside the env.
+    let has_dispatch = |dir: &Path| dir.join("dispatch.json").exists();
+    assert!(has_dispatch(&iteration_dir(&cwd)));
+    assert!(!has_dispatch(&env_dir(&cwd)));
+}
+
+#[test]
+fn env_dir_created_even_with_no_stage() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+    let mut run = skill_eval();
+    run.current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--mode",
+            "new-skill",
+            "--no-stage",
+            "--dry-run",
+        ]);
+    run.assert().success();
+
+    // Fixtures and RUNBOOK still need a home, so the env directory is created even
+    // when skill staging is switched off.
+    assert!(env_dir(&cwd).is_dir());
+}
+
+#[test]
+fn fixtures_copied_into_env_like_a_real_repo() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let evals = r#"{ "skill_name": "mr-review", "evals": [
+        { "id": "e1", "prompt": "review", "expected_output": "a review",
+          "files": ["src/main.rs", "data/x.json"] } ] }"#;
+    let (skill_dir, cwd) = setup(sandbox.path(), evals);
+    let fixtures = [("src/main.rs", "fn main() {}"), ("data/x.json", "{}")];
+    let evals_dir = skill_dir.join("mr-review/evals");
+    for (rel, body) in fixtures {
+        let source = evals_dir.join(rel);
+        fs::create_dir_all(source.parent().unwrap()).unwrap();
+        fs::write(source, body).unwrap();
+    }
+
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // Each fixture keeps its relative path under env/ — no flattened inputs/ bucket.
+    let env = env_dir(&cwd);
+    for (rel, body) in fixtures {
+        assert_eq!(read_str(&env.join(rel)), body);
+    }
+    assert!(!env.join("inputs").exists());
+
+    // The agent's cwd is the env, so the dispatch prompt names fixtures env-relative.
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let tasks = dispatch["tasks"].as_array().unwrap();
+    let task = tasks.iter().find(|t| t["condition"] == "with_skill").unwrap();
+    let prompt = read_str(Path::new(task["dispatch_prompt_path"].as_str().unwrap()));
+    assert!(prompt.contains("- src/main.rs"));
+    assert!(prompt.contains("- data/x.json"));
+    assert!(!prompt.contains("inputs/"));
+}
+
+#[test]
+fn dispatch_tasks_grouped_by_condition() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    // A single eval could not tell interleaved ordering from grouped ordering, hence two.
+    let evals = r#"{ "skill_name": "mr-review", "evals": [
+        { "id": "e1", "prompt": "review", "expected_output": "a review" },
+        { "id": "e2", "prompt": "review again", "expected_output": "a review" } ] }"#;
+    let (skill_dir, cwd) = setup(sandbox.path(), evals);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // Condition of every task, kept in dispatch order.
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let mut conds: Vec<String> = Vec::new();
+    for task in dispatch["tasks"].as_array().unwrap() {
+        conds.push(task["condition"].as_str().unwrap().to_string());
+    }
+    assert_eq!(conds.len(), 4, "2 evals × 2 conditions: {conds:?}");
+
+    // The runbook batches "dispatch all of cond A → switch-condition → dispatch all of
+    // cond B", so tasks[] must read top-to-bottom as one with_skill run followed by one
+    // without_skill run.
+    let first_b = conds.iter().position(|c| c == "without_skill").unwrap();
+    let (front, back) = conds.split_at(first_b);
+    assert!(
+        front.iter().all(|c| c == "with_skill"),
+        "cond A not contiguous at the front: {conds:?}"
+    );
+    assert!(
+        back.iter().all(|c| c == "without_skill"),
+        "cond B not contiguous at the back: {conds:?}"
+    );
+}
+
+#[test]
+fn dispatch_outputs_live_under_env() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let tasks = dispatch["tasks"].as_array().unwrap();
+    assert!(!tasks.is_empty(), "run produced dispatch tasks");
+
+    // dispatch.json holds resolved paths while these roots derive from the raw tempdir;
+    // on macOS /var is a symlink to /private/var, so compare canonical forms rather
+    // than relying on a lexical starts_with.
+    let env_root = fs::canonicalize(env_dir(&cwd)).unwrap();
+    let iter_root = fs::canonicalize(iteration_dir(&cwd)).unwrap();
+    let outputs_root = env_root.join(".eval-magic-outputs");
+    for task in tasks {
+        // Agent-under-test writes (its cwd is env/) must stay inside the env.
+        let outputs_dir = fs::canonicalize(task["outputs_dir"].as_str().unwrap()).unwrap();
+        assert!(
+            outputs_dir.starts_with(&outputs_root),
+            "outputs_dir under env/.eval-magic-outputs/: {}",
+            outputs_dir.display()
+        );
+        // run.json / timing.json are eval-magic meta kept in iteration-N/, above the env.
+        // A dry run has not written them yet, so canonicalize their parent run dir.
+        let run_meta = Path::new(task["run_record_path"].as_str().unwrap()).parent();
+        let timing_meta = Path::new(task["timing_path"].as_str().unwrap()).parent();
+        let run_meta_dir = fs::canonicalize(run_meta.unwrap()).unwrap();
+        assert!(
+            run_meta_dir.starts_with(&iter_root) && !run_meta_dir.starts_with(&env_root),
+            "run dir stays above env: {}",
+            run_meta_dir.display()
+        );
+        assert_eq!(
+            timing_meta.unwrap(),
+            run_meta.unwrap(),
+            "run.json and timing.json share the meta run dir"
+        );
+    }
+}
+
+#[test]
+fn shared_fixture_copied_once_across_conditions_and_runs() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let evals = r#"{ "skill_name": "mr-review", "evals": [
+        { "id": "e1", "prompt": "review", "expected_output": "a review",
+          "files": ["fixture.txt"] } ] }"#;
+    let (skill_dir, cwd) = setup(sandbox.path(), evals);
+    let evals_dir = skill_dir.join("mr-review/evals");
+    fs::write(evals_dir.join("fixture.txt"), "DATA").unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--mode",
+            "new-skill",
+            "--runs",
+            "2",
+            "--dry-run",
+        ])
+        .assert()
+        .success();
+
+    // The env holds a single shared copy of the fixture ...
+    assert_eq!(read_str(&env_dir(&cwd).join("fixture.txt")), "DATA");
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let tasks = dispatch["tasks"].as_array().unwrap();
+    assert_eq!(tasks.len(), 4, "1 eval × 2 conditions × 2 runs");
+    // ... and every condition × run task refers to it by the same env-relative path.
+    let expected = vec![json!("fixture.txt")];
+    for task in tasks {
+        assert_eq!(task["fixtures"].as_array().unwrap(), &expected);
+    }
+}
+
+#[test]
+fn two_evals_sharing_a_fixture_declaration_succeeds() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let evals = r#"{ "skill_name": "mr-review", "evals": [
+        { "id": "e1", "prompt": "p1", "expected_output": "o", "files": ["shared.txt"] },
+        { "id": "e2", "prompt": "p2", "expected_output": "o", "files": ["shared.txt"] } ] }"#;
+    let (skill_dir, cwd) = setup(sandbox.path(), evals);
+    let evals_dir = skill_dir.join("mr-review/evals");
+    fs::write(evals_dir.join("shared.txt"), "SHARED").unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // Declaring the same fixture from the same source in two evals is an idempotent
+    // share: the run succeeds and env holds the one copy.
+    assert_eq!(read_str(&env_dir(&cwd).join("shared.txt")), "SHARED");
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let tasks = dispatch["tasks"].as_array().unwrap();
+    let shared = vec![json!("shared.txt")];
+    // Both evals' with_skill tasks list that fixture.
+    for id in ["e1", "e2"] {
+        let task = tasks
+            .iter()
+            .find(|t| t["eval_id"] == id && t["condition"] == "with_skill")
+            .unwrap();
+        assert_eq!(task["fixtures"].as_array().unwrap(), &shared);
+    }
+}
+
+/// Read isolation: a skill already present in the invocation cwd must not be visible
+/// from the env that the agent-under-test runs in.
+#[test]
+fn env_contains_only_the_staged_skill_no_repo_leakage() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+
+    // Plant an unrelated skill under the invocation cwd's .claude/skills before the
+    // run; env is a clean, separate cwd, so it must NOT leak across.
+    let cwd_skills = cwd.join(".claude/skills");
+    let stray = cwd_skills.join("unrelated-skill");
+    let stray_manifest = "---\nname: unrelated-skill\ndescription: leaked\n---\n";
+    fs::create_dir_all(&stray).unwrap();
+    fs::write(stray.join("SKILL.md"), stray_manifest).unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // env/.claude/skills holds the staged skill-under-test and nothing else.
+    assert_eq!(
+        env_staged_entries(&cwd),
+        vec!["slow-powers-eval-1-with_skill__mr-review"]
+    );
+    // In particular, the stray skill planted in the cwd has no counterpart
+    // inside env.
+    let env_skills = env_dir(&cwd).join(".claude/skills");
+    assert!(!env_skills.join("unrelated-skill").exists());
+}
+
+#[test]
+fn guard_marker_allowed_roots_cover_meta_above_env() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--guard"])
+        .assert()
+        .success();
+
+    // allowedRoots must cover iteration-N/ above env; canonicalize for macOS /var → /private/var.
+    let marker = read_json(&env_dir(&cwd).join(".claude/skills/.slow-powers-eval-guard.json"));
+    let roots = marker["allowedRoots"].as_array().unwrap();
+    let iter = fs::canonicalize(iteration_dir(&cwd)).unwrap();
+    let covered = |r: &str| iter.starts_with(fs::canonicalize(r).unwrap_or_else(|_| r.into()));
+    assert!(
+        roots.iter().any(|r| covered(r.as_str().unwrap())),
+        "allowedRoots {roots:?} must cover the meta tree above env at {iter:?}"
+    );
+}
diff --git a/tests/run/grouping.rs b/tests/run/grouping.rs
new file mode 100644
index 0000000..e941409
--- /dev/null
+++ b/tests/run/grouping.rs
@@ -0,0 +1,202 @@
+//! Isolation-group batching during `run`: how the setup phase groups evals into
+//! environments and records the plan in `dispatch.json`. Covers the in-session
+//! single-env (byte-compat) path, the Cli per-(group, condition) split that closes
+//! the condition-isolation gap, and the explicit `isolation: isolated` hint.
+
+use crate::helpers::*;
+use serde_json::json;
+use std::fs;
+
+const TWO_EVALS_ONE_ISOLATED: &str = r#"{ "skill_name": "mr-review", "evals": [
+    { "id": "e1", "prompt": "p1", "expected_output": "o", "files": ["a.txt"] },
+    { "id": "e2", "prompt": "p2", "expected_output": "o", "files": ["b.txt"], "isolation": "isolated" } ] }"#;
+
+fn write_fixtures(skill_dir: &std::path::Path) {
+    let seed = |f: &str, body: &str| fs::write(skill_dir.join("mr-review/evals").join(f), body);
+    seed("a.txt", "AAA").and_then(|()| seed("b.txt", "BBB")).unwrap();
+}
+
+#[test]
+fn insession_single_group_omits_groups_key_and_stays_bare_env() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // No-conflict in-session runs keep the pre-grouping shape: bare env/, no groups, no task tags.
+    assert!(env_dir(&cwd).exists());
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let has_groups = dispatch.get("groups").is_some();
+    assert!(
+        !has_groups,
+        "single-group in-session omits the groups summary: {dispatch}"
+    );
+    for task in dispatch["tasks"].as_array().unwrap() {
+        assert!(task.get("group").is_none(), "no group tag: {task}");
+        assert!(task.get("eval_root").is_none(), "no eval_root: {task}");
+    }
+}
+
+#[test]
+fn cli_single_group_emits_groups_and_splits_env_per_condition() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+    // `--run-mode hybrid` exercises the Cli dispatch path.
+    let run_args = [
+        "--skill",
+        "mr-review",
+        "--mode",
+        "new-skill",
+        "--harness",
+        "claude-code",
+        "--run-mode",
+        "hybrid",
+        "--dry-run",
+    ];
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(run_args)
+        .assert()
+        .success();
+
+    // Cli splits the env per condition even for one group, giving physical condition
+    // isolation: the skill is staged for with_skill and absent from the control arm.
+    let staged_skill = ".claude/skills/slow-powers-eval-1-with_skill__mr-review";
+    let with_env = cli_env_dir(&cwd, "g1", "with_skill");
+    let without_env = cli_env_dir(&cwd, "g1", "without_skill");
+    assert!(with_env.join(staged_skill).exists());
+    assert!(
+        !without_env.join(staged_skill).exists(),
+        "the control arm's env contains no staged skill"
+    );
+
+    // dispatch.json records the plan for the executing human: a single group,
+    // listing the evals that belong to it.
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let groups = dispatch["groups"]
+        .as_array()
+        .expect("groups summary present");
+    assert_eq!(groups.len(), 1);
+    let only_group = &groups[0];
+    assert_eq!(only_group["id"], "g1");
+    assert_eq!(only_group["evals"], json!(["e1"]));
+
+    // Every task names the env it runs in (the recipe `cd`s there); with a single
+    // group there is no group tag to carry.
+    for task in dispatch["tasks"].as_array().unwrap() {
+        assert!(task.get("group").is_none(), "single group: no tag: {task}");
+        let eval_root = task["eval_root"]
+            .as_str()
+            .expect("Cli task carries eval_root");
+        let cond = task["condition"].as_str().unwrap();
+        let env_name = format!("env-g1-{cond}");
+        assert!(
+            eval_root.ends_with(&env_name),
+            "eval_root points at the per-condition env: {eval_root}"
+        );
+    }
+}
+
+#[test]
+fn isolated_hint_splits_into_two_groups_in_session() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), TWO_EVALS_ONE_ISOLATED);
+    write_fixtures(&skill_dir);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // e2's `isolation: isolated` hint must peel it off into a second group.
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    let groups = dispatch["groups"]
+        .as_array()
+        .expect("groups summary present");
+    assert_eq!(groups.len(), 2, "the isolated eval forms its own group");
+    let (shared, isolated) = (&groups[0], &groups[1]);
+    assert_eq!(shared["evals"], json!(["e1"]));
+    assert_eq!(isolated["evals"], json!(["e2"]));
+    let rationale = &isolated["rationale"];
+    assert!(
+        rationale.as_str().unwrap().contains("isolated"),
+        "second group's rationale names the hint: {}",
+        rationale
+    );
+
+    // Each task carries the id of the group it was assigned to.
+    let tasks = dispatch["tasks"].as_array().unwrap();
+    let e2_task = tasks
+        .iter()
+        .find(|t| t["eval_id"] == "e2" && t["condition"] == "with_skill")
+        .unwrap();
+    assert_eq!(e2_task["group"], "g2");
+
+    // Up front the single in-session env receives only the FIRST group's fixtures;
+    // reset-batch swaps the isolated group's fixtures in later.
+    let env = env_dir(&cwd);
+    assert_eq!(read_str(&env.join("a.txt")), "AAA");
+    assert!(
+        !env.join("b.txt").exists(),
+        "the isolated group's fixture is not staged into the shared env up front"
+    );
+}
+
+#[test]
+fn isolated_hint_splits_into_separate_envs_cli() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), TWO_EVALS_ONE_ISOLATED);
+    write_fixtures(&skill_dir);
+    let run_args = [
+        "--skill",
+        "mr-review",
+        "--harness",
+        "codex",
+        "--run-mode",
+        "hybrid",
+        "--dry-run",
+    ];
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(run_args)
+        .assert()
+        .success();
+
+    // Every group owns per-condition envs that carry that group's fixtures and no
+    // others. Only the with_skill env of each group is inspected here.
+    // Columns: group, fixture it owns, that fixture's content, fixture it must not see.
+    let expectations = [
+        ("g1", "a.txt", "AAA", "b.txt"),
+        ("g2", "b.txt", "BBB", "a.txt"),
+    ];
+    for (group, owned, body, foreign) in expectations {
+        let env = cli_env_dir(&cwd, group, "with_skill");
+        assert_eq!(read_str(&env.join(owned)), body);
+        assert!(!env.join(foreign).exists());
+    }
+
+    let dispatch = read_json(&iteration_dir(&cwd).join("dispatch.json"));
+    assert_eq!(dispatch["groups"].as_array().unwrap().len(), 2);
+
+    // The isolated eval's with_skill task is tagged g2 and rooted in g2's
+    // with_skill env.
+    let tasks = dispatch["tasks"].as_array().unwrap();
+    let e2_task = tasks
+        .iter()
+        .find(|t| t["eval_id"] == "e2" && t["condition"] == "with_skill")
+        .unwrap();
+    assert_eq!(e2_task["group"], "g2");
+    let eval_root = e2_task["eval_root"].as_str().unwrap();
+    assert!(eval_root.ends_with("env-g2-with_skill"));
+}
diff --git a/tests/run/helpers.rs b/tests/run/helpers.rs
index 988e236..1c5caab 100644
--- a/tests/run/helpers.rs
+++ b/tests/run/helpers.rs
@@ -31,11 +31,31 @@ pub fn setup(root: &Path, evals_json: &str) -> (PathBuf, PathBuf) {
 }
 
 pub fn iteration_dir(cwd: &Path) -> PathBuf {
-    cwd.join("skills-workspace")
+    cwd.join(".eval-magic")
         .join("mr-review")
         .join("iteration-1")
 }
 
+/// In-session env dir — the agent-under-test's cwd, directly below `iteration_dir`.
+/// Staging, fixtures, and `RUNBOOK.md` are all placed under it.
+pub fn env_dir(cwd: &Path) -> PathBuf {
+    let iteration = iteration_dir(cwd);
+    iteration.join("env")
+}
+
+/// Cli env dir for one `(group, condition)` pair: `iteration-N/env-<group>-<condition>/`,
+/// the cwd of each `claude -p`/`codex exec` subprocess. It holds only that condition's
+/// skill (none for the control arm) plus its group's fixtures.
+pub fn cli_env_dir(cwd: &Path, group: &str, condition: &str) -> PathBuf {
+    iteration_dir(cwd).join(["env", group, condition].join("-"))
+}
+
+/// Sorted staged-skill names in `env/.claude/skills`, staging manifest excluded.
+pub fn env_staged_entries(cwd: &Path) -> Vec<String> {
+    let skills_dir = env_dir(cwd).join(".claude/skills");
+    staged_entries(&skills_dir)
+}
+
 pub fn read_json(path: &Path) -> Value {
     serde_json::from_str(&fs::read_to_string(path).unwrap()).unwrap()
 }
diff --git a/tests/run/lifecycle.rs b/tests/run/lifecycle.rs
index d58fb4d..f7a1b15 100644
--- a/tests/run/lifecycle.rs
+++ b/tests/run/lifecycle.rs
@@ -8,10 +8,11 @@ use std::fs;
 use std::path::Path;
 
 #[test]
-fn guard_installs_pretooluse_hook_and_teardown_guard_removes_it() {
+fn guard_installs_pretooluse_hook_under_env() {
     let tmp = tempfile::TempDir::new().unwrap();
     let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
-    let settings = cwd.join(".claude/settings.local.json");
+    // The guard arms inside the isolated env — the agent-under-test's cwd.
+    let settings = env_dir(&cwd).join(".claude/settings.local.json");
 
     skill_eval()
         .current_dir(&cwd)
@@ -28,7 +29,14 @@ fn guard_installs_pretooluse_hook_and_teardown_guard_removes_it() {
             .unwrap()
             .contains("Write")
     );
+    // Nothing is armed at the invocation cwd anymore.
+    assert!(!cwd.join(".claude/settings.local.json").exists());
 
+    // `teardown-guard` operates at the invocation cwd, so it does not reach the
+    // env-scoped guard: this is a transitional no-op, reconciled when the loop runs
+    // inside the env session / teardown is reworked. The env is disposable
+    // and the guard auto-expires (6h TTL); full `teardown` reclaims it (see
+    // `teardown_reclaims_workspace_and_env_guard`).
     skill_eval()
         .current_dir(&cwd)
         .args(["teardown-guard", "--skill-dir"])
@@ -36,14 +44,17 @@ fn guard_installs_pretooluse_hook_and_teardown_guard_removes_it() {
         .args(["--skill", "mr-review"])
         .assert()
         .success();
-    assert!(!settings.exists());
+    assert!(settings.exists(), "env guard survives a cwd teardown-guard");
 }
 
 #[test]
-fn finalize_warns_when_guard_is_still_armed() {
+fn finalize_does_not_warn_about_env_scoped_guard_from_cwd() {
     let tmp = tempfile::TempDir::new().unwrap();
     let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
-    let marker = cwd.join(".claude/skills/.slow-powers-eval-guard.json");
+    // The guard arms inside the env; `finalize` checks the invocation cwd, where no
+    // guard lives, so it does not warn. The env-scoped guard is harmless to the operator's
+    // cwd (it only loads when cwd = env); the in-env loop handles it within the session.
+    let marker = env_dir(&cwd).join(".claude/skills/.slow-powers-eval-guard.json");
 
     skill_eval()
         .current_dir(&cwd)
@@ -61,8 +72,7 @@ fn finalize_warns_when_guard_is_still_armed() {
         .args(["--skill", "mr-review", "--iteration", "1"])
         .assert()
         .success()
-        .stdout(contains("Guard still armed"))
-        .stdout(contains("eval-magic teardown-guard"));
+        .stdout(contains("Guard still armed").not());
 
     assert!(marker.exists());
 }
@@ -92,11 +102,11 @@ fn finalize_does_not_warn_when_guard_is_not_armed() {
 }
 
 #[test]
-fn teardown_removes_guard_and_staged_skill_set() {
+fn teardown_reclaims_workspace_and_env_guard() {
     let tmp = tempfile::TempDir::new().unwrap();
     let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
-    let settings = cwd.join(".claude/settings.local.json");
-    let staged = cwd.join(".claude/skills");
+    let settings = env_dir(&cwd).join(".claude/settings.local.json");
+    let staged = env_dir(&cwd).join(".claude/skills");
 
     skill_eval()
         .current_dir(&cwd)
@@ -108,6 +118,9 @@ fn teardown_removes_guard_and_staged_skill_set() {
     assert!(settings.exists());
     assert!(staged.exists());
 
+    // Full `teardown` reclaims the workspace iteration; the env (and its guard) lives
+    // inside it, so removing the workspace removes the env guard too — this is what makes
+    // deferring the cwd teardown-guard rework safe.
     skill_eval()
         .current_dir(&cwd)
         .args(["teardown", "--skill-dir"])
@@ -115,10 +128,10 @@ fn teardown_removes_guard_and_staged_skill_set() {
         .args(["--skill", "mr-review"])
         .assert()
         .success();
+    assert!(!cwd.join(".eval-magic").exists());
     assert!(!settings.exists());
     assert!(!staged.exists());
     assert!(!cwd.join(".claude").exists());
-    assert!(!cwd.join("skills-workspace").exists());
 }
 
 #[test]
@@ -318,7 +331,14 @@ fn runs_flag_expands_dispatches_into_run_dirs() {
             "run.json not under its run dir: {}",
             task["run_record_path"]
         );
-        assert!(task["outputs_dir"].as_str().unwrap().contains(&run_seg));
+        // Outputs live inside the env, namespaced per run so concurrent
+        // same-batch subagents can't collide; run-<k> is the leaf segment.
+        let outputs_dir = task["outputs_dir"].as_str().unwrap();
+        assert!(
+            outputs_dir.contains(".eval-magic-outputs/")
+                && outputs_dir.ends_with(&format!("run-{k}")),
+            "outputs not namespaced under env per run: {outputs_dir}"
+        );
         let desc = task["agent_description"].as_str().unwrap();
         assert!(
             desc.contains(&format!(":r{k}:")),
@@ -329,11 +349,19 @@ fn runs_flag_expands_dispatches_into_run_dirs() {
     for eval in ["e1", "e2"] {
         for cond in ["with_skill", "without_skill"] {
             for k in [1, 2] {
+                // Meta run dir (run.json / timing.json) above the env.
                 let run_dir = iteration_dir(&cwd)
                     .join(format!("eval-{eval}"))
                     .join(cond)
                     .join(format!("run-{k}"));
-                assert!(run_dir.join("outputs").is_dir(), "missing {run_dir:?}");
+                assert!(run_dir.is_dir(), "missing meta run dir {run_dir:?}");
+                // Per-run outputs dir inside the env.
+                let out_dir = env_dir(&cwd)
+                    .join(".eval-magic-outputs")
+                    .join(format!("eval-{eval}"))
+                    .join(cond)
+                    .join(format!("run-{k}"));
+                assert!(out_dir.is_dir(), "missing env outputs dir {out_dir:?}");
             }
         }
     }
@@ -364,9 +392,14 @@ fn runs_one_keeps_flat_single_run_layout() {
         assert!(task.get("run_index").is_none(), "run_index on single run");
         assert!(!task["run_record_path"].as_str().unwrap().contains("/run-"));
     }
+    // Flat single-run layout: the meta cond dir exists, with no run-1/ nesting.
     let cond_dir = iteration_dir(&cwd).join("eval-e1").join("with_skill");
-    assert!(cond_dir.join("outputs").is_dir());
+    assert!(cond_dir.is_dir());
     assert!(!cond_dir.join("run-1").exists());
+    // Outputs live inside the env, flat (no run-1/ segment) for a single-run cell.
+    let out_dir = env_dir(&cwd).join(".eval-magic-outputs/eval-e1/with_skill");
+    assert!(out_dir.is_dir());
+    assert!(!out_dir.join("run-1").exists());
 }
 
 #[test]
@@ -475,3 +508,100 @@ fn only_with_unknown_id_exits_nonzero() {
         .failure()
         .stderr(contains("unknown eval id(s): nope"));
 }
+
+#[test]
+fn teardown_disarms_per_group_condition_cli_guards() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+
+    // Cli (hybrid) builds one env per (group, condition) and `--guard` arms a marker in
+    // each. Teardown is run by the human from the iteration dir rather than from inside
+    // an env, so a cwd-only disarm would never reach these per-env markers.
+    let guarded_run = [
+        "--skill",
+        "mr-review",
+        "--harness",
+        "claude-code",
+        "--run-mode",
+        "hybrid",
+        "--guard",
+    ];
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(guarded_run)
+        .assert()
+        .success();
+
+    // Both per-condition envs of group g1 start out armed.
+    let marker_rel = ".claude/skills/.slow-powers-eval-guard.json";
+    let with_marker = cli_env_dir(&cwd, "g1", "with_skill").join(marker_rel);
+    let without_marker = cli_env_dir(&cwd, "g1", "without_skill").join(marker_rel);
+    assert!(with_marker.exists());
+    assert!(without_marker.exists());
+
+    // Simulate uncommitted results so teardown keeps the iteration: the env dirs then
+    // survive the reclaim and the markers' removal can be asserted directly.
+    let benchmark = "{\"delta\":{\"pass_rate\":0.4}}\n";
+    fs::write(iteration_dir(&cwd).join("benchmark.json"), benchmark).unwrap();
+
+    // Teardown is invoked from the invocation cwd with the same run mode.
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["teardown", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--run-mode", "hybrid"])
+        .assert()
+        .success()
+        .stdout(contains("write guard disarmed"));
+
+    // The iteration survived, yet both per-env markers are gone.
+    assert!(
+        iteration_dir(&cwd).exists(),
+        "iteration kept (uncommitted results)"
+    );
+    assert!(!with_marker.exists(), "with_skill env guard disarmed");
+    assert!(!without_marker.exists(), "without_skill env guard disarmed");
+}
+
+#[test]
+fn finalize_warns_about_armed_cli_per_env_guard() {
+    let sandbox = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(sandbox.path(), DEFAULT_EVALS);
+
+    // Cli (hybrid) arms a guard in every per-(group, condition) env. finalize runs from the
+    // iteration dir, so a cwd-only check misses them: it must walk the per-env markers.
+    let guarded_run = [
+        "--skill",
+        "mr-review",
+        "--harness",
+        "claude-code",
+        "--run-mode",
+        "hybrid",
+        "--guard",
+    ];
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(guarded_run)
+        .assert()
+        .success();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["finalize", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--run-mode",
+            "hybrid",
+            "--iteration",
+            "1",
+        ])
+        .assert()
+        .success()
+        .stdout(contains("Guard still armed"));
+}
diff --git a/tests/run/main.rs b/tests/run/main.rs
index dff0527..74ef8d2 100644
--- a/tests/run/main.rs
+++ b/tests/run/main.rs
@@ -9,7 +9,13 @@
 
 mod helpers;
 
+mod claude_cli;
 mod codex;
+mod env_layout;
+mod grouping;
 mod lifecycle;
 mod opencode;
+mod reset_batch;
+mod runbook;
 mod staging;
+mod switch_condition;
diff --git a/tests/run/opencode.rs b/tests/run/opencode.rs
index 15c7b0b..073ae87 100644
--- a/tests/run/opencode.rs
+++ b/tests/run/opencode.rs
@@ -71,7 +71,9 @@ fn opencode_stages_repo_local_skills_under_opencode() {
         .assert()
         .success();
 
-    let opencode_skills = cwd.join(".opencode/skills");
+    // OpenCode rides Cli dispatch → per-(group, condition) envs; the skill stages
+    // into the with_skill env.
+    let opencode_skills = cli_env_dir(&cwd, "g1", "with_skill").join(".opencode/skills");
     assert!(
         read_str(&opencode_skills.join(OPENCODE_SLUG).join("SKILL.md"))
             .contains(&format!("name: {OPENCODE_SLUG}"))
diff --git a/tests/run/reset_batch.rs b/tests/run/reset_batch.rs
new file mode 100644
index 0000000..f2b7247
--- /dev/null
+++ b/tests/run/reset_batch.rs
@@ -0,0 +1,118 @@
+//! `reset-batch`: the per-group isolation barrier for a single-session (in-session)
+//! isolated run. Between eval-group batches it wipes the shared `env/` working tree
+//! — keeping the staged skills and the outputs tree — and re-seeds it with the next
+//! group's fixtures, so a prior batch's fixtures and stray writes can't leak.
+
+use crate::helpers::*;
+use predicates::str::contains;
+use std::fs;
+use std::path::Path;
+
+const WITH_SLUG: &str = "slow-powers-eval-1-with_skill__mr-review"; // staged with_skill dir name
+
+/// Two evals routed into two groups: e2's `isolation: isolated` hint forces its own
+/// group, so the in-session env stages group g1 (eval e1, fixture a.txt) up front and
+/// swaps in group g2 (eval e2, fixture b.txt) via reset-batch.
+const TWO_GROUPS: &str = r#"{ "skill_name": "mr-review", "evals": [
+    { "id": "e1", "prompt": "p1", "expected_output": "o", "files": ["a.txt"] },
+    { "id": "e2", "prompt": "p2", "expected_output": "o", "files": ["b.txt"], "isolation": "isolated" } ] }"#;
+
+/// Write the a.txt/b.txt fixtures, then stage a two-group interactive iteration with a
+/// `--dry-run` run; returns `(skill_dir, cwd)` with `env/` holding group g1's fixtures.
+fn setup_two_groups(root: &Path) -> (std::path::PathBuf, std::path::PathBuf) {
+    let (skill_dir, cwd) = setup(root, TWO_GROUPS);
+    fs::write(skill_dir.join("mr-review/evals/a.txt"), "AAA").unwrap(); // e1's fixture
+    fs::write(skill_dir.join("mr-review/evals/b.txt"), "BBB").unwrap(); // e2's fixture
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+    (skill_dir, cwd)
+}
+
+/// Run `reset-batch` the way the runbook prescribes: from inside `env/`, carrying the
+/// `--skill-dir/--skill/--workspace-dir` selector, `--iteration 1`, and `--group`.
+fn reset_to(cwd: &Path, skill_dir: &Path, group: &str) -> assert_cmd::assert::Assert {
+    skill_eval()
+        .current_dir(env_dir(cwd))
+        .args(["reset-batch", "--skill-dir"])
+        .arg(skill_dir)
+        .args(["--skill", "mr-review", "--workspace-dir"])
+        .arg(cwd.join(".eval-magic"))
+        .args(["--iteration", "1", "--group", group])
+        .assert() // returned unchecked so each caller picks success() or failure()
+}
+
+#[test]
+fn reset_batch_wipes_working_tree_and_reseeds_group_fixtures() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup_two_groups(tmp.path());
+
+    // Up front the env holds group g1's fixture only.
+    assert_eq!(read_str(&env_dir(&cwd).join("a.txt")), "AAA");
+    assert!(!env_dir(&cwd).join("b.txt").exists());
+    // Simulate a stray file the g1 batch's agent wrote into the env.
+    fs::write(env_dir(&cwd).join("stray.txt"), "STRAY").unwrap();
+
+    reset_to(&cwd, &skill_dir, "g2").success(); // swap the env over to group g2
+
+    // The env is now seeded for g2: its fixture present, g1's gone, the stray write
+    // gone — a clean tree.
+    assert_eq!(read_str(&env_dir(&cwd).join("b.txt")), "BBB");
+    assert!(!env_dir(&cwd).join("a.txt").exists());
+    assert!(!env_dir(&cwd).join("stray.txt").exists());
+
+    // The staged skill and the outputs tree survive the wipe.
+    assert!(
+        env_dir(&cwd)
+            .join(".claude/skills")
+            .join(WITH_SLUG)
+            .is_dir(),
+        "the staged skill survives reset-batch"
+    );
+    assert!(env_dir(&cwd).join(".eval-magic-outputs").exists()); // outputs tree kept
+}
+
+#[test]
+fn reset_batch_can_restore_the_first_group() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup_two_groups(tmp.path());
+
+    // Move to g2, then back to g1 (as condition B's loop does after condition A left
+    // the env on the last group).
+    reset_to(&cwd, &skill_dir, "g2").success();
+    reset_to(&cwd, &skill_dir, "g1").success();
+    assert_eq!(read_str(&env_dir(&cwd).join("a.txt")), "AAA"); // g1's fixture is back
+    assert!(!env_dir(&cwd).join("b.txt").exists()); // g2's fixture is gone again
+}
+
+#[test]
+fn reset_batch_rejects_unknown_group() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup_two_groups(tmp.path());
+    reset_to(&cwd, &skill_dir, "g99") // the two-group fixture only yields g1 and g2
+        .failure()
+        .stderr(contains("unknown --group"));
+}
+
+#[test]
+fn reset_batch_on_single_group_run_explains_it_is_unneeded() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // A single-group run tags no task with a group, so reset-batch has nothing to do; it
+    // must fail with a "single group" explanation rather than silently wiping the env.
+    reset_to(&cwd, &skill_dir, "g1")
+        .failure()
+        .stderr(contains("single group"));
+}
diff --git a/tests/run/runbook.rs b/tests/run/runbook.rs
new file mode 100644
index 0000000..e4c4a62
--- /dev/null
+++ b/tests/run/runbook.rs
@@ -0,0 +1,134 @@
+//! `RUNBOOK.md` generation during `run`: the followable isolated-session handoff
+//! artifact, and the post-run pointer at it.
+
+use crate::helpers::*;
+use predicates::prelude::PredicateBooleanExt;
+use predicates::str::contains;
+
+#[test]
+fn run_writes_interactive_runbook_and_points_at_it() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    // A real run (not --dry-run) so the post-run "Next:" handoff prints; --dry-run
+    // stops before next steps by contract.
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review"])
+        .assert()
+        .success()
+        // The summary hands off to a fresh isolated session: cd into env/, then
+        // "Read and follow RUNBOOK.md". It must not re-print the dispatch loop —
+        // that lives only in RUNBOOK.md now (the session-juggling apparatus is gone).
+        // (The exact env path in the handoff is locked by the util.rs unit test;
+        // here we just confirm the handoff is wired into stdout.)
+        .stdout(contains("Read and follow RUNBOOK.md"))
+        .stdout(contains("1. cd "))
+        .stdout(contains("one batch at a time").not()); // dispatch-loop text stays out of stdout
+
+    // The runbook lives inside the isolated env — the session's cwd reads it.
+    assert!(!iteration_dir(&cwd).join("RUNBOOK.md").exists()); // no copy in the iteration dir
+    let book = read_str(&env_dir(&cwd).join("RUNBOOK.md"));
+    assert!(book.contains("mr-review"), "names the skill: {book}");
+    assert!(
+        book.contains("with_skill") && book.contains("without_skill"),
+        "names both conditions: {book}"
+    );
+    assert!(
+        book.contains("agent_description"),
+        "carries the in-session dispatch guidance: {book}"
+    );
+    // The per-condition batch loop: a switch-condition barrier between the two
+    // batches, carrying the absolute --workspace-dir so it resolves from env/.
+    assert!(
+        book.contains("eval-magic switch-condition --skill-dir")
+            && book.contains("--workspace-dir")
+            && book.contains("--condition without_skill"),
+        "carries the switch-condition barrier between batches: {book}"
+    );
+    assert!(
+        book.contains("eval-magic ingest --skill-dir"),
+        "carries the ingest command: {book}"
+    );
+    assert!(
+        book.contains("eval-magic finalize --skill-dir"),
+        "carries the finalize command: {book}"
+    );
+    assert!(
+        book.contains("benchmark.json"),
+        "points at the result: {book}"
+    );
+    assert!(!book.contains("{{"), "no unsubstituted tokens: {book}");
+}
+
+#[test]
+fn run_writes_headless_runbook_for_codex() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--harness", "codex", "--dry-run"])
+        .assert()
+        .success();
+
+    // Cli dispatches from per-(group, condition) envs, so the human-followed runbook
+    // lives in the iteration dir (there is no single env/).
+    assert!(!env_dir(&cwd).join("RUNBOOK.md").exists()); // nothing at the in-session location
+    let book = read_str(&iteration_dir(&cwd).join("RUNBOOK.md"));
+    assert!(
+        book.contains("human driving"),
+        "frames the run for a human at a terminal: {book}"
+    );
+    assert!(
+        book.contains("codex --ask-for-approval never exec"),
+        "carries the Codex CLI dispatch recipe: {book}"
+    );
+    assert!(
+        book.contains("--harness codex"),
+        "pipeline commands carry --harness codex: {book}"
+    );
+    assert!(!book.contains("{{"), "no unsubstituted tokens: {book}");
+}
+
+#[test]
+fn run_writes_headless_runbook_for_claude() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args([
+            "--skill",
+            "mr-review",
+            "--harness",
+            "claude-code",
+            "--run-mode",
+            "headless",
+            "--dry-run",
+        ])
+        .assert()
+        .success();
+
+    // Cli has no single env/, so the runbook lives in the iteration dir — not in env/.
+    assert!(!env_dir(&cwd).join("RUNBOOK.md").exists());
+    let book = read_str(&iteration_dir(&cwd).join("RUNBOOK.md"));
+    // A Claude Code Cli-mode run uses the shared human-followed template, NOT Claude's
+    // interactive (agent-followed) one: no switch-condition batch loop, but claude -p.
+    assert!(
+        book.contains("human driving"),
+        "frames the run for a human at a terminal: {book}"
+    );
+    assert!(
+        book.contains("claude -p"),
+        "carries the claude -p dispatch recipe: {book}"
+    );
+    assert!(
+        !book.contains("switch-condition"),
+        "headless does not use the in-session batch loop: {book}"
+    );
+    assert!(!book.contains("{{"), "no unsubstituted tokens: {book}");
+}
diff --git a/tests/run/staging.rs b/tests/run/staging.rs
index 7faef1c..0db4828 100644
--- a/tests/run/staging.rs
+++ b/tests/run/staging.rs
@@ -1,7 +1,6 @@
 //! Staging, plan-mode injection, `--stage-name`, and dispatch-prompt rendering.
 
 use crate::helpers::*;
-use predicates::prelude::PredicateBooleanExt;
 use predicates::str::contains;
 use serde_json::Value;
 use std::fs;
@@ -30,7 +29,7 @@ fn setup_direct_skill(root: &Path) -> (PathBuf, PathBuf, PathBuf) {
 }
 
 fn direct_iteration_dir(cwd: &Path) -> PathBuf {
-    cwd.join("skills-workspace")
+    cwd.join(".eval-magic")
         .join("mr-review")
         .join("iteration-1")
 }
@@ -49,7 +48,7 @@ fn stages_only_sut_and_writes_workspace_under_cwd() {
 
     assert!(iteration_dir(&cwd).join("dispatch.json").exists());
     assert_eq!(
-        staged_entries(&cwd.join(".claude/skills")),
+        env_staged_entries(&cwd),
         vec!["slow-powers-eval-1-with_skill__mr-review"]
     );
 }
@@ -65,8 +64,9 @@ fn run_from_skill_dir_defaults_to_new_skill_without_staging_siblings() {
         .assert()
         .success()
         .stdout(contains("Preparing mr-review iteration-1 (new-skill)"))
-        .stdout(contains("eval-magic ingest --skill-dir"))
-        .stdout(contains("--skill mr-review --iteration 1"));
+        // The run summary now hands off to the isolated session; the pipeline
+        // commands live in the RUNBOOK (asserted below), not the printed summary.
+        .stdout(contains("Read and follow RUNBOOK.md"));
 
     assert!(
         direct_iteration_dir(&skill_sub)
@@ -74,10 +74,21 @@ fn run_from_skill_dir_defaults_to_new_skill_without_staging_siblings() {
             .exists()
     );
     assert_eq!(
-        staged_entries(&skill_sub.join(".claude/skills")),
+        env_staged_entries(&skill_sub),
         vec!["slow-powers-eval-1-with_skill__mr-review"]
     );
 
+    // Run from inside the skill dir with no args: the auto-derived target selector
+    // (`command_target_args`) is threaded into the RUNBOOK's pipeline commands.
+    let runbook = read_str(
+        &direct_iteration_dir(&skill_sub)
+            .join("env")
+            .join("RUNBOOK.md"),
+    );
+    assert!(runbook.contains("eval-magic ingest --skill-dir"));
+    assert!(runbook.contains("--skill mr-review --workspace-dir"));
+    assert!(runbook.contains("--iteration 1"));
+
     let dispatch = read_json(&direct_iteration_dir(&skill_sub).join("dispatch.json"));
     let task = dispatch["tasks"]
         .as_array()
@@ -107,7 +118,7 @@ fn run_with_skill_path_defaults_to_single_skill_mode() {
 
     assert!(direct_iteration_dir(&cwd).join("dispatch.json").exists());
     assert_eq!(
-        staged_entries(&cwd.join(".claude/skills")),
+        env_staged_entries(&cwd),
         vec!["slow-powers-eval-1-with_skill__mr-review"]
     );
 }
@@ -181,7 +192,7 @@ fn stage_name_threads_verbatim_name_and_registers_cleanup() {
         .assert()
         .success();
 
-    let skills_dir = cwd.join(".claude/skills");
+    let skills_dir = env_dir(&cwd).join(".claude/skills");
     assert_eq!(staged_entries(&skills_dir), vec!["mr-review"]);
 
     let conditions = read_json(&iteration_dir(&cwd).join("conditions.json"));
@@ -218,7 +229,10 @@ fn stage_name_threads_verbatim_name_and_registers_cleanup() {
 fn stage_name_refuses_to_clobber_preexisting_dir() {
     let tmp = tempfile::TempDir::new().unwrap();
     let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
-    let preexisting = cwd.join(".claude/skills/my-real-skill");
+    // Staging now lands in env/.claude/skills, which is fresh per iteration.
+    // The clobber guard still matters on a re-run (--iteration 1) where the env
+    // already holds an untracked skill dir; pre-seed that and confirm it is preserved.
+    let preexisting = env_dir(&cwd).join(".claude/skills/my-real-skill");
     fs::create_dir_all(&preexisting).unwrap();
     fs::write(preexisting.join("SKILL.md"), "USER OWNED").unwrap();
 
@@ -231,6 +245,8 @@ fn stage_name_refuses_to_clobber_preexisting_dir() {
             "mr-review",
             "--mode",
             "new-skill",
+            "--iteration",
+            "1",
             "--stage-name",
             "my-real-skill",
             "--dry-run",
@@ -295,41 +311,3 @@ fn writes_each_prompt_to_file_and_drops_inline() {
         assert!(contents.contains("User request:"));
     }
 }
-
-#[test]
-fn discovery_warning_fires_when_claude_skills_dir_created_fresh() {
-    let tmp = tempfile::TempDir::new().unwrap();
-    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
-    // No .claude/skills/ in cwd when the session started: `run` creates it, so Claude Code's
-    // watcher won't pick it up until the session re-scans — the actionable warning should fire.
-    assert!(!cwd.join(".claude/skills").exists());
-    skill_eval()
-        .current_dir(&cwd)
-        .args(["run", "--skill-dir"])
-        .arg(&skill_dir)
-        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
-        .assert()
-        .success()
-        .stderr(contains("did not exist when your session started"))
-        .stderr(contains("--no-stage"))
-        .stderr(contains("live change detection").not());
-}
-
-#[test]
-fn discovery_note_when_claude_skills_dir_preexists() {
-    let tmp = tempfile::TempDir::new().unwrap();
-    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
-    // .claude/skills/ already exists when the session starts: it is watched, so live change
-    // detection surfaces the staged skill in-session — emit the confirmation note, not the
-    // fallback warning.
-    fs::create_dir_all(cwd.join(".claude/skills")).unwrap();
-    skill_eval()
-        .current_dir(&cwd)
-        .args(["run", "--skill-dir"])
-        .arg(&skill_dir)
-        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
-        .assert()
-        .success()
-        .stderr(contains("live change detection"))
-        .stderr(contains("did not exist when your session started").not());
-}
diff --git a/tests/run/switch_condition.rs b/tests/run/switch_condition.rs
new file mode 100644
index 0000000..cd39279
--- /dev/null
+++ b/tests/run/switch_condition.rs
@@ -0,0 +1,153 @@
+//! `switch-condition`: the per-condition read-isolation barrier for a
+//! single-session isolated run. It removes the off-condition's staged skill from
+//! `env/.claude/skills/` between dispatch batches, and must resolve the iteration
+//! tree while invoked from `cwd = env/`.
+
+use crate::helpers::*;
+use std::path::{Path, PathBuf};
+
+const WITH_SLUG: &str = "slow-powers-eval-1-with_skill__mr-review"; // staged with_skill dir name
+
+fn env_skills_dir(cwd: &Path) -> PathBuf {
+    env_dir(cwd).join(".claude/skills") // the `.claude/skills` dir under the env dir
+}
+
+/// Run `switch-condition` the way the runbook prescribes: from inside `env/`, carrying the
+/// `--skill-dir/--skill/--workspace-dir` selector, `--iteration 1`, and `--condition`.
+fn switch_to(cwd: &Path, skill_dir: &Path, condition: &str) -> assert_cmd::assert::Assert {
+    skill_eval()
+        .current_dir(env_dir(cwd))
+        .args(["switch-condition", "--skill-dir"])
+        .arg(skill_dir)
+        .args(["--skill", "mr-review", "--workspace-dir"])
+        .arg(cwd.join(".eval-magic"))
+        .args(["--iteration", "1", "--condition", condition])
+        .assert() // returned unchecked so each caller picks success() or failure()
+}
+
+#[test]
+fn switch_condition_removes_off_condition_slug_from_env_cwd() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    // Build the env (staging happens even under --dry-run).
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    let with_slug = env_skills_dir(&cwd).join(WITH_SLUG); // the staged with_skill dir
+    assert!(with_slug.is_dir(), "with_skill staged before switch");
+
+    // Move to the without_skill batch: the off-condition (with_skill) staged skill
+    // is removed so the control arm cannot read it.
+    switch_to(&cwd, &skill_dir, "without_skill").success();
+
+    assert!(!with_slug.exists(), "with_skill slug removed after switch");
+}
+
+#[test]
+fn switch_condition_is_idempotent() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    // Two switches in a row: the second is a no-op, not an error (a re-run after a
+    // fix, or an over-eager operator, must stay safe).
+    switch_to(&cwd, &skill_dir, "without_skill").success();
+    switch_to(&cwd, &skill_dir, "without_skill").success();
+    assert!(!env_skills_dir(&cwd).join(WITH_SLUG).exists()); // still removed after both calls
+}
+
+#[test]
+fn switch_condition_preserves_guard_marker() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    // A guarded run arms the write guard; --guard requires a real (non-dry) run.
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--guard"])
+        .assert()
+        .success();
+
+    // The guard marker is a sibling file of the slug subtree inside the skills dir.
+    let marker = env_skills_dir(&cwd).join(".slow-powers-eval-guard.json");
+    assert!(marker.exists(), "guard armed before switch");
+
+    switch_to(&cwd, &skill_dir, "without_skill").success(); // must drop the slug, not the marker
+
+    assert!(
+        !env_skills_dir(&cwd).join(WITH_SLUG).exists(),
+        "slug removed"
+    );
+    assert!(marker.exists(), "guard marker survives the switch");
+}
+
+#[test]
+fn switch_condition_rejects_unknown_condition() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "new-skill", "--dry-run"])
+        .assert()
+        .success();
+
+    switch_to(&cwd, &skill_dir, "bogus_condition")
+        .failure() // an unrecognised condition is an error, not a silent no-op
+        .stderr(predicates::str::contains(
+            "unknown --condition 'bogus_condition'",
+        ));
+    // A typo must not silently leave the staged skill in place under a false sense
+    // of isolation.
+    assert!(env_skills_dir(&cwd).join(WITH_SLUG).is_dir());
+}
+
+#[test]
+fn switch_condition_revision_removes_old_skill_keeps_new() {
+    let tmp = tempfile::TempDir::new().unwrap();
+    let (skill_dir, cwd) = setup(tmp.path(), DEFAULT_EVALS);
+    // Revision mode compares a baseline snapshot (old_skill) against the working SKILL.md
+    // (new_skill); both arms stage a skill. Seed it at <iteration parent>/snapshots/baseline.
+    let snapshot = iteration_dir(&cwd)
+        .parent()
+        .unwrap()
+        .join("snapshots")
+        .join("baseline");
+    std::fs::create_dir_all(&snapshot).unwrap();
+    std::fs::write(
+        snapshot.join("SKILL.md"),
+        "---\nname: mr-review\ndescription: review merge requests\n---\n\nold body\n",
+    )
+    .unwrap();
+
+    skill_eval()
+        .current_dir(&cwd)
+        .args(["run", "--skill-dir"])
+        .arg(&skill_dir)
+        .args(["--skill", "mr-review", "--mode", "revision", "--dry-run"])
+        .assert()
+        .success();
+
+    let old_slug = env_skills_dir(&cwd).join("slow-powers-eval-1-old_skill__mr-review");
+    let new_slug = env_skills_dir(&cwd).join("slow-powers-eval-1-new_skill__mr-review");
+    assert!(old_slug.is_dir() && new_slug.is_dir(), "both arms staged");
+
+    // Switch to the new_skill batch: only the old_skill slug is removed.
+    switch_to(&cwd, &skill_dir, "new_skill").success();
+
+    assert!(!old_slug.exists(), "old_skill slug removed");
+    assert!(new_slug.is_dir(), "new_skill slug kept");
+}