slowdini · slowdini · Jun 21, 2026 · Jun 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 /target
 .DS_Store
+
+# eval-magic run artifacts (workspace root + per-env outputs) — churn every run
+.eval-magic/
+.eval-magic-outputs/
diff --git a/README.md b/README.md
@@ -86,7 +86,7 @@ environment.
 
 ```bash
 # 1. Build the iteration's isolated env (arm --guard — see Cost & confirmation).
-#    run stages skills into skills-workspace/my-skill/iteration-1/env/, copies
+#    run stages skills into .eval-magic/my-skill/iteration-1/env/, copies
 #    fixtures in, and writes RUNBOOK.md. It does NOT dispatch — it prints a handoff.
 #    Add --runs <N> to dispatch every eval N times per condition for variance
 #    reduction (a per-eval "runs" field in evals.json overrides the flag).
@@ -112,7 +112,7 @@ eval-magic ingest
 #    armed, finalize reminds you to run teardown-guard before editing source.
 eval-magic finalize
 
-# 5. Read skills-workspace/my-skill/iteration-1/benchmark.json (the prep session
+# 5. Read .eval-magic/my-skill/iteration-1/benchmark.json (the prep session
 #    resumes here), then clean up:
 eval-magic teardown
 ```
@@ -201,7 +201,7 @@ Read `validity_warnings` **before** trusting any delta — a low skill-invocatio
 Per skill being evaluated, the runner produces this tree (everything but `evals/evals.json` is generated):
 
 ```
-skills-workspace/<skill>/                # outside the skill directory, gitignore it
+.eval-magic/<skill>/                     # outside the skill directory, gitignore it
   snapshots/                             # Mode B baselines, persist across iterations
     <label>/SKILL.md
   iteration-N/
@@ -228,7 +228,7 @@ independently and the benchmark's per-condition `mean`/`stddev`/`n` cover all of
         run-2/  outputs/  run.json  timing.json  grading.json
 ```
 
-The only source file you author for evals is `<skill>/evals/evals.json` (or create it with `eval-magic init`). Keep `skills-workspace/` out of version control — it churns on every run. Snapshot retention is manual: delete `<workspace>/<skill>/snapshots/<label>/` when no longer needed.
+The only source file you author for evals is `<skill>/evals/evals.json` (or create it with `eval-magic init`). Keep `.eval-magic/` out of version control — it churns on every run. Snapshot retention is manual: delete `<workspace>/<skill>/snapshots/<label>/` when no longer needed.
 
 ## Version-controlled baselines
 
@@ -294,7 +294,7 @@ Support today:
 
 ### Claude Code (fully wired)
 
-The run loop above *is* the Claude Code loop. By default this is the **fully-interactive** run mode (see [Run modes](#run-modes)) — subagents are dispatched in-session via the Task tool; the **hybrid** and **headless** (`claude -p`) modes are now wired too (pass `--run-mode hybrid` or `--run-mode headless`, see below). `eval-magic run` itself only *prepares* the isolated env (`skills-workspace/<skill>/iteration-N/env/`) and writes `RUNBOOK.md` into it, then prints a handoff: `cd` into `env/`, start a **fresh** Claude Code session there, and say *Read and follow RUNBOOK.md*. That fresh session — clean cwd, staged skills present at session start — drives the whole dispatch → switch-condition → ingest → finalize loop and writes `benchmark.json`, which the prep session resumes on. These are the Claude-Code-specific details:
+The run loop above *is* the Claude Code loop. By default this is the **fully-interactive** run mode (see [Run modes](#run-modes)) — subagents are dispatched in-session via the Task tool; the **hybrid** and **headless** (`claude -p`) modes are now wired too (pass `--run-mode hybrid` or `--run-mode headless`, see below). `eval-magic run` itself only *prepares* the isolated env (`.eval-magic/<skill>/iteration-N/env/`) and writes `RUNBOOK.md` into it, then prints a handoff: `cd` into `env/`, start a **fresh** Claude Code session there, and say *Read and follow RUNBOOK.md*. That fresh session — clean cwd, staged skills present at session start — drives the whole dispatch → switch-condition → ingest → finalize loop and writes `benchmark.json`, which the prep session resumes on. These are the Claude-Code-specific details:
 
 **Isolating from installed plugins.** Read this first if the skill you're evaluating shares a name with one an installed, enabled plugin provides. Subagents are dispatched via the **Task tool**, so they inherit *this session's* enabled plugins — the staging slug avoids an on-disk collision but does not stop the installed copy from being discoverable, contaminating both arms (the `without_skill` arm is then not truly skill-absent). Plugins load at session start and can't be unloaded mid-session, so the runner only *detects and warns* (the plugin-shadow banner). The isolated env gives a clean *cwd* but does not unload user/global plugins, so this still applies. To actually isolate, launch the **fresh session you start in `env/`** one of these ways — subagents inherit it:
 

diff --git a/src/cli/args.rs b/src/cli/args.rs
@@ -95,7 +95,7 @@ pub struct CommonArgs {
     /// commands already carry it.
     #[arg(long)]
     pub run_mode: Option<RunMode>,
-    /// Workspace directory (defaults to `<cwd>/skills-workspace`).
+    /// Workspace directory (defaults to `<cwd>/.eval-magic`).
     ///
     /// The artifact root. Pass the same value to every command of a run, including
     /// `teardown`.
@@ -456,7 +456,7 @@ pub(crate) enum Commands {
     /// Swap the active isolation batch in a single-session isolated run.
     ///
     /// Wipes the shared `env/` working tree (keeping `.claude/skills/` and the
-    /// `.eval-magic/` outputs tree) and re-seeds it with `--group`'s fixtures — the
+    /// `.eval-magic-outputs/` tree) and re-seeds it with `--group`'s fixtures — the
     /// per-batch isolation barrier between eval groups in an interactive isolated run
     /// (see `RUNBOOK.md`). `--group` names the group you are
     /// about to dispatch next. Run it only after every Task subagent of the prior

diff --git a/src/cli/commands/pipeline.rs b/src/cli/commands/pipeline.rs
@@ -238,7 +238,7 @@ pub(crate) fn run_switch_condition(args: SwitchConditionArgs) -> anyhow::Result<
 
 /// Swap the active isolation batch in a single-session (in-session) isolated run:
 /// wipe the shared `env/` working tree — keeping the staged skills and the
-/// `.eval-magic/` outputs tree — and re-seed it with `--group`'s fixtures, so the
+/// `.eval-magic-outputs/` tree — and re-seed it with `--group`'s fixtures, so the
 /// next batch starts from a clean tree the prior batch's fixtures and stray writes
 /// can't taint. A hard barrier: the runbook joins every Task subagent of the prior
 /// batch first. Resolves the iteration from `--workspace-dir`, so it runs from
@@ -300,7 +300,7 @@ pub(crate) fn run_reset_batch(args: ResetBatchArgs) -> anyhow::Result<()> {
         ".agents",
         ".codex",
         ".opencode",
-        ".eval-magic",
+        ".eval-magic-outputs",
         "RUNBOOK.md",
     ];
     for entry in std::fs::read_dir(&env_dir)? {

diff --git a/src/cli/commands/workspace.rs b/src/cli/commands/workspace.rs
@@ -134,9 +134,7 @@ pub(crate) fn run_teardown(args: CommonArgs) -> anyhow::Result<()> {
         eprintln!(
             "⚠ Kept {} workspace iteration(s) with results not yet committed:\n{lines}\n   Commit them, e.g.:\n     eval-magic promote-baseline{target_args} --iteration <N>\n   or delete {}/ manually to discard.",
             ws.kept_iterations.len(),
-            Path::new("skills-workspace")
-                .join(&ctx.skill_name)
-                .display()
+            Path::new(".eval-magic").join(&ctx.skill_name).display()
         );
     }
     Ok(())

diff --git a/src/cli/mod.rs b/src/cli/mod.rs
@@ -120,7 +120,7 @@ pub(crate) fn parse_id_list(v: Option<&str>) -> Option<Vec<String>> {
 /// "Next:" commands are copy-pasteable from any cwd — not just the one `run`
 /// happened to start in. The absolute `--workspace-dir` is what lets the isolated
 /// session run `ingest`/`finalize`/`switch-condition` from `cwd = iteration-N/env/`:
-/// without it, `workspace_root` would default to `<cwd>/skills-workspace`
+/// without it, `workspace_root` would default to `<cwd>/.eval-magic`
 /// (`detect_run_context`) and the iteration tree above the env would not resolve.
 pub(crate) fn command_target_args(ctx: &RunContext) -> String {
     format!(
@@ -314,7 +314,7 @@ mod tests {
 
     /// The isolated session runs `ingest`/`finalize`/`switch-condition` from
     /// `cwd = iteration-N/env/`. Without an explicit workspace root those commands
-    /// default `workspace_root` to `<cwd>/skills-workspace` and bail "not found",
+    /// default `workspace_root` to `<cwd>/.eval-magic` and bail "not found",
     /// so the selector must carry an absolute `--workspace-dir` pointing at the
     /// real workspace above the env.
     #[test]
@@ -342,7 +342,7 @@ mod tests {
 
         // Round-trip from an env-like cwd below the workspace: feeding the
         // selector's roots back resolves the SAME workspace, not
-        // `<cwd>/skills-workspace`.
+        // `<cwd>/.eval-magic`.
         let env_like = ctx
             .workspace_root
             .join("mr-review")

diff --git a/src/cli/run/orchestrate/build.rs b/src/cli/run/orchestrate/build.rs
@@ -184,10 +184,7 @@ pub(super) fn write_dispatch(
                         None => format!("eval-{}/{cond_name}", ev.id),
                         Some(k) => format!("eval-{}/{cond_name}/run-{k}", ev.id),
                     };
-                    let outputs_dir = env_root
-                        .join(".eval-magic")
-                        .join("outputs")
-                        .join(outputs_rel);
+                    let outputs_dir = env_root.join(".eval-magic-outputs").join(outputs_rel);
                     fs::create_dir_all(&outputs_dir)?;
 
                     let fixtures = fixtures_by_eval

diff --git a/src/cli/run/runbook.rs b/src/cli/run/runbook.rs
@@ -255,7 +255,7 @@ mod tests {
 
     #[test]
     fn interactive_runbook_carries_run_specifics_and_full_loop() {
-        let dir = PathBuf::from("/work/skills-workspace/widget-skill/iteration-5");
+        let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-5");
         let book = build_runbook(&claude_ctx(&dir));
 
         // Run-specific identity.
@@ -326,7 +326,7 @@ mod tests {
 
     #[test]
     fn interactive_runbook_with_multiple_groups_carries_reset_batch_barriers() {
-        let dir = PathBuf::from("/work/skills-workspace/widget-skill/iteration-5");
+        let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-5");
         let groups = ["g1".to_string(), "g2".to_string()];
         let book = build_runbook(&RunbookContext {
             groups: &groups,
@@ -362,7 +362,7 @@ mod tests {
 
     #[test]
     fn headless_runbook_is_human_followed_cli_recipe() {
-        let dir = PathBuf::from("/work/skills-workspace/widget-skill/iteration-2");
+        let dir = PathBuf::from("/work/.eval-magic/widget-skill/iteration-2");
         let ctx = RunbookContext {
             harness: Harness::Codex,
             run_mode: RunMode::Hybrid,

diff --git a/src/cli/run/staging/mod.rs b/src/cli/run/staging/mod.rs
@@ -257,7 +257,7 @@ pub fn stage_skill_for_harness(opts: &StageSkillOpts) -> Result<String, RunError
                 "SKILL.md"
                     | "evals"
                     | SNAPSHOT_META
-                    | "skills-workspace"
+                    | ".eval-magic"
                     | ".claude"
                     | ".agents"
                     | ".codex"

diff --git a/src/cli/run/util.rs b/src/cli/run/util.rs
@@ -287,10 +287,10 @@ mod tests {
 
     #[test]
     fn isolated_handoff_points_into_env_and_at_the_runbook() {
-        let env = Path::new("/work/skills-workspace/widget/iteration-3/env");
+        let env = Path::new("/work/.eval-magic/widget/iteration-3/env");
         let handoff = insession_isolated_handoff(env);
         assert!(
-            handoff.contains("/work/skills-workspace/widget/iteration-3/env"),
+            handoff.contains("/work/.eval-magic/widget/iteration-3/env"),
             "names the env to cd into: {handoff}"
         );
         assert!(handoff.contains("cd "), "spells out the cd step: {handoff}");

diff --git a/src/core/context.rs b/src/core/context.rs
@@ -207,7 +207,7 @@ pub fn detect_run_context(input: DetectInput) -> Result<RunContext, ContextError
 
     let workspace_root = match input.workspace_dir {
         Some(raw) => absolutize(&cwd, &raw)?,
-        None => cwd.join("skills-workspace"),
+        None => cwd.join(".eval-magic"),
     };
     let stage_root = cwd;
 
@@ -444,7 +444,7 @@ mod tests {
         let tmp = TempDir::new().unwrap();
         let skill_dir = make_skill_dir(tmp.path(), &["foo"]);
         let ctx = detect_run_context(input(&skill_dir, "foo")).unwrap();
-        let expected = std::env::current_dir().unwrap().join("skills-workspace");
+        let expected = std::env::current_dir().unwrap().join(".eval-magic");
         assert_eq!(ctx.workspace_root, expected);
     }
 

diff --git a/src/pipeline/detect_stray_writes.rs b/src/pipeline/detect_stray_writes.rs
@@ -338,7 +338,7 @@ pub fn detect_stray_writes_report(
                 invocations_inspected += run.tool_invocations.len();
                 // `dispatch.json` is the authoritative source of the outputs
                 // boundary: an absolute path into the isolated env
-                // (`env/.eval-magic/outputs/...`). Without it we cannot honor the
+                // (`env/.eval-magic-outputs/...`). Without it we cannot honor the
                 // outputs-only contract, so we skip out-of-bounds *write*
                 // classification for that run rather than guess a boundary — the old
                 // `<slot>/outputs` convention no longer matches where agents write and
@@ -747,7 +747,7 @@ mod tests {
         let f = detect_live_source_reads(
             &[
                 inv("Read", json!({"file_path": format!("{OUTPUTS}/x.md")}), 0),
-                inv("Bash", json!({"command": "ls skills-workspace"}), 1),
+                inv("Bash", json!({"command": "ls .eval-magic"}), 1),
                 // Write tools are detect_stray_writes' jurisdiction — reads only here.
                 inv(
                     "Write",

diff --git a/src/sandbox/decide.rs b/src/sandbox/decide.rs
@@ -139,7 +139,7 @@ mod tests {
     use crate::sandbox::now_ms;
     use serde_json::json;
 
-    const ROOTS: [&str; 2] = ["/work/skills-workspace", "/work/.claude/skills"];
+    const ROOTS: [&str; 2] = ["/work/.eval-magic", "/work/.claude/skills"];
 
     /// An RFC3339 timestamp `offset_ms` from now — `future`/`past` bracket the
     /// current wall clock used by `decide`.
@@ -209,7 +209,7 @@ mod tests {
     fn allows_a_write_under_an_allowed_root() {
         let d = decide_now(
             "Write",
-            json!({ "file_path": "/work/skills-workspace/x/outputs/a.md" }),
+            json!({ "file_path": "/work/.eval-magic/x/outputs/a.md" }),
             Some(&marker()),
         );
         assert!(d.allow);
@@ -241,7 +241,7 @@ mod tests {
     fn allows_a_bash_command_scoped_to_an_allowed_root() {
         let d = decide_now(
             "Bash",
-            json!({ "command": "echo hi > /work/skills-workspace/x/outputs/log" }),
+            json!({ "command": "echo hi > /work/.eval-magic/x/outputs/log" }),
             Some(&marker()),
         );
         assert!(d.allow);
@@ -286,7 +286,7 @@ mod tests {
     fn allows_apply_patch_inside_allowed_roots() {
         let d = decide_now(
             "apply_patch",
-            json!({ "files": ["/work/skills-workspace/eval/outputs/out.md"] }),
+            json!({ "files": ["/work/.eval-magic/eval/outputs/out.md"] }),
             Some(&marker()),
         );
         assert!(d.allow);
@@ -363,10 +363,13 @@ mod tests {
     }
 
     #[test]
-    fn does_not_flag_skills_workspace_as_a_bare_skills_write() {
+    fn does_not_flag_a_skills_prefixed_dir_as_a_bare_skills_write() {
+        // A `skills`-prefixed path that is NOT an allowed root: the bare-`skills/`
+        // heuristic only fires on a bare `skills` at a path boundary, so a
+        // `skills-`-prefixed dir must not be flagged and the write is allowed.
         let d = decide_now(
             "Bash",
-            json!({ "command": "mkdir -p /work/skills-workspace/x/outputs" }),
+            json!({ "command": "mkdir -p /work/skills-data/x/outputs" }),
             Some(&marker()),
         );
         assert!(d.allow);

diff --git a/src/sandbox/guard.rs b/src/sandbox/guard.rs
@@ -118,7 +118,7 @@ mod tests {
     fn marker() -> GuardMarker {
         GuardMarker {
             active: Some(true),
-            allowed_roots: Some(vec!["/work/skills-workspace".to_string()]),
+            allowed_roots: Some(vec!["/work/.eval-magic".to_string()]),
             expires_at: None,
         }
     }
@@ -170,7 +170,7 @@ mod tests {
 
     #[test]
     fn codex_apply_patch_inside_allowed_roots_allows() {
-        let payload = r#"{ "hook_event_name": "PreToolUse", "tool_name": "apply_patch", "tool_input": { "files": ["/work/skills-workspace/out.md"] } }"#;
+        let payload = r#"{ "hook_event_name": "PreToolUse", "tool_name": "apply_patch", "tool_input": { "files": ["/work/.eval-magic/out.md"] } }"#;
         assert_eq!(codex_guard_decision(payload, Some(marker())), None);
     }
 

diff --git a/src/sandbox/install.rs b/src/sandbox/install.rs
@@ -72,7 +72,7 @@ fn write_json(path: &Path, value: &Value) -> io::Result<()> {
 /// agent-under-test's cwd) and the OS temp dir. The staged skills dir
 /// (`stage_root/.claude/skills` or `.agents/skills`) and the per-task outputs dir
 /// both live *inside* `stage_root`, so a single env root covers every legitimate
-/// agent write. Scoping to the env — not the parent `skills-workspace/` — keeps the
+/// agent write. Scoping to the env — not the `.eval-magic/` workspace root above it — keeps the
 /// guard boundary identical to the isolation boundary: the agent can't reach a
 /// sibling iteration or the `iteration-N/` meta tree above its cwd. eval-magic's own
 /// above-env writes (e.g. `benchmark.json`) are not gated here: they run as
@@ -408,7 +408,7 @@ mod tests {
         let temp = absolutize(&std::env::temp_dir()).display().to_string();
         assert_eq!(roots, vec![env, temp]);
         assert!(
-            !roots.iter().any(|r| r.ends_with("skills-workspace")),
+            !roots.iter().any(|r| r.ends_with(".eval-magic")),
             "workspace_root must not be an allowed root: {roots:?}"
         );
     }