Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
0a436ba
docs(isolated-run): resolve design spike & add env/dispatch design note
slowdini Jun 19, 2026
48efa1f
Merge pull request #88 from slowdini/docs/isolated-run-design-spike
slowdini Jun 19, 2026
d27c492
feat(isolated-runs): runbook artifact
slowdini Jun 19, 2026
eb807f8
Merge pull request #89 from slowdini/feat/runbook-artifact
slowdini Jun 19, 2026
cbad44d
feat(isolated-runs): create isolated env for eval runs
slowdini Jun 19, 2026
078bfa1
Merge pull request #91 from slowdini/feat/isolated-run-env-builder
slowdini Jun 19, 2026
9482ba5
feat(isolated-runs): run the eval loop from inside env/
slowdini Jun 19, 2026
26674e3
feat(isolated-runs): add switch-condition barrier and batch-loop runbook
slowdini Jun 19, 2026
047a544
docs(isolated-runs): update isolated-run note for the full-loop handoff
slowdini Jun 19, 2026
660f999
chore(isolated-runs): tighten the per-run output dir comments
slowdini Jun 19, 2026
492b0d5
Merge pull request #92 from slowdini/feat/isolated-run-full-loop
slowdini Jun 19, 2026
e2a6733
feat(isolated-runs): retire the session-juggling apparatus
slowdini Jun 20, 2026
8313b9d
Merge pull request #93 from slowdini/feat/isolated-run-retire-session…
slowdini Jun 20, 2026
9d8b8b2
feat(isolated-runs): focus guard flag for new isolated envs
slowdini Jun 20, 2026
c5705b4
Merge pull request #94 from slowdini/feat/isolated-run-reeval-write-g…
slowdini Jun 20, 2026
d55bb72
feat(claude): hybrid run mode support
slowdini Jun 20, 2026
428ddde
Merge pull request #95 from slowdini/feat/claude-hybrid-dispatch
slowdini Jun 20, 2026
21ec5bb
feat(claude): headless run mode support
slowdini Jun 20, 2026
f7da791
Merge pull request #96 from slowdini/feat/headless-run-mode
slowdini Jun 20, 2026
1b6c278
docs(isolated-runs): encapsulation bug fixes and docs update
slowdini Jun 20, 2026
c47f07d
Merge pull request #97 from slowdini/docs/isolated-run-final
slowdini Jun 20, 2026
10aab40
feat(run): setup-time isolation grouping for multi-run batches (#90)
slowdini Jun 21, 2026
ec385fd
Merge pull request #98 from slowdini/feat/isolation-groups
slowdini Jun 21, 2026
de79661
fix(run): walk per-(group,condition) Cli envs for guard + plugin-shad…
slowdini Jun 21, 2026
9850c3a
Merge pull request #101 from slowdini/fix/cli-multienv-guard-shadow
slowdini Jun 21, 2026
11f27df
chore(docs): retire isolated-run.md (#100)
slowdini Jun 21, 2026
d6a4686
Merge pull request #103 from slowdini/chore/retire-isolated-run-doc
slowdini Jun 21, 2026
5cdcae0
fix(codex): fix flag order
slowdini Jun 21, 2026
767bcd1
Merge pull request #105 from slowdini/fix/codex-approval-flag-order
slowdini Jun 21, 2026
f636bb7
chore(cli): rename artifacts directories
slowdini Jun 21, 2026
5fc8cde
Merge pull request #106 from slowdini/refactor/workspace-dir-to-eval-…
slowdini Jun 21, 2026
8770e5e
chore: bump version to 0.4.0
github-actions[bot] Jun 21, 2026
aee65ea
Merge branch 'main' into dev
slowdini Jun 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
/target
.DS_Store

# eval-magic run artifacts (workspace root + per-env outputs) — churn every run
.eval-magic/
.eval-magic-outputs/
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "eval-magic"
version = "0.3.4"
version = "0.4.0"
edition = "2024"
description = "One-stop CLI for running skill evals — measure whether an agent skill actually shifts behavior."
license = "MIT"
Expand Down
104 changes: 72 additions & 32 deletions README.md

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions docs/harness-parity.md

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions profiles/claude-code/runbook.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}})

You are an agent in a **fresh, isolated** session. Follow this runbook top to bottom to run
the eval and produce `benchmark.json`. Everything you need is in this iteration directory —
you should not need anything from the surrounding repo.

- **Skill under test:** {{SKILL_NAME}}
- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}`
- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`)

The two conditions run as **separate batches** in this one session: dispatch every subagent of
one batch, wait for them **all** to return, then switch conditions before dispatching the next.
Never interleave the batches — `switch-condition` removes the off-condition's staged skill, and a
subagent still in flight could observe a half-removed skill or read the wrong one.

## 1. Dispatch the `{{COND_A}}` batch

{{DISPATCH_COND_A}}

Wait for **every** one of these subagents to return before continuing.

## 2. Switch to the `{{COND_B}}` condition

This removes the `{{COND_A}}` staged skill so the `{{COND_B}}` batch cannot read it:

```
{{SWITCH_CMD}}
```

## 3. Dispatch the `{{COND_B}}` batch

{{DISPATCH_COND_B}}

Wait for **every** one of these subagents to return before continuing.

## 4. Ingest

```
{{INGEST_CMD}}
```

`ingest` records each run, backfills transcripts, scans for stray writes, and grades every
mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself.

## 5. Dispatch the judge subagents, then finalize

Dispatch each judge task that `ingest` listed, as a subagent, the same way — pass its
`agent_description` verbatim — then merge the verdicts and aggregate:

```
{{FINALIZE_CMD}}
```

## 6. Read the result

`finalize` writes the cross-condition benchmark to:

```
{{BENCHMARK_PATH}}
```

Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas. This is
the artifact the prep session resumes on.

## 7. Tear down

When you are done, remove the staged skills (and the write guard, if armed):

```
{{TEARDOWN_CMD}}
```
40 changes: 40 additions & 0 deletions profiles/shared/runbook-headless.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Eval run — {{SKILL_NAME}} (iteration {{ITERATION}}, {{HARNESS}})

This runbook is for a human driving the run from a terminal. Work from this iteration directory
and copy-paste each step. The workspace is self-contained — you should not need the surrounding
repo.

- **Skill under test:** {{SKILL_NAME}}
- **Mode:** {{MODE}} — comparing `{{COND_A}}` vs `{{COND_B}}`
- **Dispatches:** {{NUM_TASKS}} (the `tasks[]` array in `{{DISPATCH_JSON}}`)

## 1. Dispatch the eval agents, then ingest
{{DISPATCH_RECIPE}}

`ingest` records each run, backfills transcripts, scans for stray writes, and grades every
mechanical assertion. It then prints any `llm_judge` tasks it could not grade itself.

## 2. Dispatch the judge agents, then finalize
{{JUDGE_RECIPE}}

Then merge the verdicts and aggregate:

```
{{FINALIZE_CMD}}
```

## 3. Read the result

`finalize` writes the cross-condition benchmark to:

```
{{BENCHMARK_PATH}}
```

Read it for the per-condition pass rates and the `{{COND_A}}` − `{{COND_B}}` deltas.

## 4. Tear down

```
{{TEARDOWN_CMD}}
```
6 changes: 6 additions & 0 deletions schema/evals.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
"minimum": 1,
"description": "Runs per condition for this eval, for variance reduction; overrides the --runs flag. Defaults to the flag's value (1 unless raised)."
},
"isolation": {
"type": "string",
"enum": ["shared", "isolated"],
"default": "shared",
"description": "Isolation hint for run batching. 'isolated' forces this eval into its own group so it never shares an env with another eval (for confounds the framework can't detect from fixture conflicts, e.g. the agent mutates a shared fixture). Defaults to 'shared'. Evals whose fixtures conflict are auto-isolated regardless."
},
"skill_should_trigger": {
"type": "boolean",
"default": true,
Expand Down
136 changes: 136 additions & 0 deletions src/adapters/claude_cli.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//! Claude Code `claude -p` command rendering for `DispatchMechanism::Cli`
//! guidance (hybrid / headless run modes).
//!
//! Differences from the Codex recipe, all forced by the `claude` CLI:
//! `--output-format stream-json` requires `--verbose` in `-p` mode; there is no
//! `--cd` flag, so the dispatch runs from the env dir (`cd <eval-root> &&`);
//! and there is no `--output-last-message`, so the final message is recovered
//! from the stream-json `result` event by the transcript adapter rather than
//! written to a file. `</dev/null` detaches stdin so a permission prompt cannot
//! block on a TTY and piped task data cannot become extra prompt context.

use super::cli_command::render_cli_model_arg;

/// Renders the single-task `claude -p` dispatch command as a copy/pasteable,
/// backslash-continued shell snippet.
///
/// The snippet changes into the `<eval-root>` placeholder directory, runs
/// `claude -p` with stream-json output, reads stdin from `/dev/null`, and
/// redirects stdout / stderr to `claude-events.jsonl` / `claude-stderr.log`
/// under the `<outputs_dir>` placeholder.
///
/// `model_flag` and `agent_model` are forwarded to `render_cli_model_arg`; its
/// result is spliced directly after `--permission-mode acceptEdits`.
pub(crate) fn claude_exec_command_template(
    model_flag: Option<&str>,
    agent_model: Option<&str>,
) -> String {
    let model_arg = render_cli_model_arg(model_flag, agent_model);
    // Only the invocation line is dynamic; every other line is a fixed literal.
    let invocation = format!(
        "cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits{model_arg} \\"
    );
    let lines: [&str; 5] = [
        invocation.as_str(),
        " \"Read the file at <dispatch_prompt_path> and follow its instructions exactly. When you finish, make your final response your closing summary.\" \\",
        " </dev/null \\",
        " > <outputs_dir>/claude-events.jsonl \\",
        " 2> <outputs_dir>/claude-stderr.log",
    ];
    lines.join("\n")
}

/// Renders the shell recipe that fans out one `claude -p` run per entry of the
/// `tasks[]` array in `dispatch.json`.
///
/// `jq` emits each task's `eval_root`, `dispatch_prompt_path` and `outputs_dir`
/// as one NUL-terminated TSV record; `xargs -0 -P "$JOBS"` hands each record to
/// an inline `sh -c` script that splits the fields with `cut`, creates the
/// outputs directory, and runs the dispatch from the task's eval root.
///
/// `model_flag` and `agent_model` are forwarded to `render_cli_model_arg`; its
/// result is spliced directly after `--permission-mode acceptEdits`.
pub(crate) fn claude_parallel_dispatch_recipe(
    model_flag: Option<&str>,
    agent_model: Option<&str>,
) -> String {
    let model_arg = render_cli_model_arg(model_flag, agent_model);
    // The only dynamic line: the per-task `claude -p` invocation.
    let claude_line = format!(
        " cd \"$eval_root\" && claude -p --output-format stream-json --verbose --permission-mode acceptEdits{model_arg} \\"
    );

    // Fixed lines that precede the invocation.
    let preamble = [
        "JOBS=${JOBS:-4}",
        "jq -j '.tasks[] | [.eval_root, .dispatch_prompt_path, .outputs_dir] | @tsv + \"\\u0000\"' dispatch.json | \\",
        " xargs -0 -P \"$JOBS\" -I{} sh -c '",
        " eval_root=\"$(printf \"%s\" \"$1\" | cut -f1)\"",
        " prompt_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"",
        " outputs_dir=\"$(printf \"%s\" \"$1\" | cut -f3)\"",
        " mkdir -p \"$outputs_dir\"",
    ];
    // Fixed lines that follow it: prompt, stdin detach, and output redirections.
    let epilogue = [
        " \"Read the file at $prompt_path and follow its instructions exactly. When you finish, make your final response your closing summary.\" \\",
        " </dev/null \\",
        " > \"$outputs_dir/claude-events.jsonl\" \\",
        " 2> \"$outputs_dir/claude-stderr.log\"",
        " ' sh {}",
    ];

    preamble
        .into_iter()
        .chain([claude_line.as_str()])
        .chain(epilogue)
        .collect::<Vec<&str>>()
        .join("\n")
}

/// Renders the fenced `bash` recipe that dispatches one `claude -p` judge
/// worker per entry of the `tasks[]` array in `judge-tasks.json`.
///
/// Each task's `dispatch_prompt_path`, `response_path` and optional `model`
/// are streamed as NUL-terminated TSV records to `xargs`, which runs an inline
/// `sh -c` script per task. The script only builds a model argument when the
/// task declares a non-empty `model`; event and stderr logs are written next
/// to the response file, named after `response_path` with its `.json` suffix
/// removed.
///
/// `model_flag` is the CLI flag used to pass the model; it defaults to
/// `--model` when `None`.
pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
    let flag = model_flag.unwrap_or("--model");
    // The only dynamic line: the shell snippet that conditionally sets `model_arg`.
    let model_arg_line =
        format!(" model_arg=\"\"; [ -n \"$model\" ] && model_arg=\"{flag} $model\"");

    let lines: [&str; 19] = [
        "Dispatch each judge task from judge-tasks.json with:",
        "",
        "```bash",
        "JOBS=${JOBS:-4}",
        "jq -j '.tasks[] | [.dispatch_prompt_path, .response_path, (.model // \"\")] | @tsv + \"\\u0000\"' judge-tasks.json | \\",
        " xargs -0 -P \"$JOBS\" -I{} sh -c '",
        " prompt_path=\"$(printf \"%s\" \"$1\" | cut -f1)\"",
        " response_path=\"$(printf \"%s\" \"$1\" | cut -f2)\"",
        " model=\"$(printf \"%s\" \"$1\" | cut -f3)\"",
        " response_base=\"${response_path%.json}\"",
        " mkdir -p \"$(dirname \"$response_path\")\"",
        model_arg_line.as_str(),
        " cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\",
        " \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\",
        " </dev/null \\",
        " > \"$response_base.claude-events.jsonl\" \\",
        " 2> \"$response_base.claude-stderr.log\"",
        " ' sh {}",
        "```",
    ];
    lines.join("\n")
}

#[cfg(test)]
mod tests {
    use super::{
        claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe,
    };

    /// Asserts that every fragment occurs in `rendered`, dumping the whole
    /// rendered text as the failure message.
    fn assert_contains_all(rendered: &str, fragments: &[&str]) {
        for fragment in fragments {
            assert!(rendered.contains(*fragment), "{rendered}");
        }
    }

    #[test]
    fn exec_template_carries_required_stream_json_flags() {
        let cmd = claude_exec_command_template(Some("--model"), None);
        assert_contains_all(
            &cmd,
            &[
                "claude -p",
                "--output-format stream-json",
                // stream-json requires --verbose in -p mode.
                "--verbose",
                "--permission-mode acceptEdits",
                "> <outputs_dir>/claude-events.jsonl",
                "2> <outputs_dir>/claude-stderr.log",
                "</dev/null",
                // claude has no --cd flag; the dispatch runs from the env dir.
                "cd <eval-root>",
                "<dispatch_prompt_path>",
            ],
        );
        // claude has no --output-last-message; final text comes from the result event.
        for forbidden in ["--output-last-message", "final-message.md"] {
            assert!(!cmd.contains(forbidden), "{cmd}");
        }
    }

    #[test]
    fn exec_template_includes_model_only_when_declared() {
        let declared = claude_exec_command_template(Some("--model"), Some("opus"));
        assert!(declared.contains("--model opus"), "{declared}");

        let undeclared = claude_exec_command_template(Some("--model"), None);
        assert!(!undeclared.contains("--model "), "{undeclared}");
    }

    #[test]
    fn parallel_recipe_drives_claude_p_per_task() {
        let recipe = claude_parallel_dispatch_recipe(Some("--model"), Some("sonnet"));
        assert_contains_all(
            &recipe,
            &[
                "claude -p",
                "claude-events.jsonl",
                "dispatch.json",
                "--model sonnet",
            ],
        );
    }

    #[test]
    fn judge_recipe_drives_claude_p() {
        let recipe = claude_judge_dispatch_recipe(Some("--model"));
        assert_contains_all(&recipe, &["claude -p", "judge-tasks.json", "response_path"]);
    }
}
51 changes: 32 additions & 19 deletions src/adapters/claude_code_transcript.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ use std::path::{Path, PathBuf};
use std::time::SystemTime;

#[derive(Debug, Deserialize)]
struct UsageRecord {
input_tokens: Option<i64>,
output_tokens: Option<i64>,
cache_creation_input_tokens: Option<i64>,
cache_read_input_tokens: Option<i64>,
pub(crate) struct UsageRecord {
pub(crate) input_tokens: Option<i64>,
pub(crate) output_tokens: Option<i64>,
pub(crate) cache_creation_input_tokens: Option<i64>,
pub(crate) cache_read_input_tokens: Option<i64>,
}

#[derive(Debug, Deserialize)]
Expand All @@ -32,9 +32,9 @@ struct Message {
}

#[derive(Debug, Deserialize)]
struct TranscriptRecord {
pub(crate) struct TranscriptRecord {
#[serde(rename = "type")]
record_type: Option<String>,
pub(crate) record_type: Option<String>,
timestamp: Option<String>,
message: Option<Message>,
}
Expand Down Expand Up @@ -83,7 +83,7 @@ fn stringify_result(content: Option<&Value>) -> String {
}
}

fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
pub(crate) fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
let raw = fs::read_to_string(jsonl_path)?;
let mut records = Vec::new();
for line in raw.split('\n') {
Expand All @@ -98,7 +98,7 @@ fn read_records(jsonl_path: &Path) -> io::Result<Vec<TranscriptRecord>> {
Ok(records)
}

fn extract_invocations(records: &[TranscriptRecord]) -> Vec<ToolInvocation> {
pub(crate) fn extract_invocations(records: &[TranscriptRecord]) -> Vec<ToolInvocation> {
let mut invocations: Vec<ToolInvocation> = Vec::new();
let mut index_by_id: HashMap<String, usize> = HashMap::new();

Expand Down Expand Up @@ -151,6 +151,27 @@ pub fn parse_transcript(jsonl_path: &Path) -> io::Result<Vec<ToolInvocation>> {
Ok(extract_invocations(&read_records(jsonl_path)?))
}

/// Returns the text of the final `assistant` record that carries at least one
/// `text` content block, with that record's text blocks joined by newlines, or
/// `None` when no assistant record carries text.
///
/// Shared with the `-p` stream-json parser, which uses it as the final-message
/// fallback when the terminal `result` event is absent or errored.
pub(crate) fn last_assistant_text(records: &[TranscriptRecord]) -> Option<String> {
    // Walk newest-to-oldest and stop at the first assistant record with text;
    // that is the same record a forward scan would have kept last.
    records
        .iter()
        .rev()
        .filter(|record| record.record_type.as_deref() == Some("assistant"))
        .find_map(|record| {
            let pieces: Vec<&str> = content_blocks(&record.message)
                .iter()
                .filter_map(|block| {
                    let is_text = block.get("type").and_then(Value::as_str) == Some("text");
                    if is_text {
                        block.get("text").and_then(Value::as_str)
                    } else {
                        None
                    }
                })
                .collect();
            if pieces.is_empty() {
                None
            } else {
                Some(pieces.join("\n"))
            }
        })
}

/// A transcript boiled down to the artifacts the pipeline needs.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TranscriptSummary {
Expand Down Expand Up @@ -179,7 +200,6 @@ pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result<TranscriptSummary>
let mut first_ts: Option<i64> = None;
let mut last_ts: Option<i64> = None;
let mut timestamp_count = 0usize;
let mut final_text: Option<String> = None;

for record in &records {
if let Some(ts_str) = &record.timestamp
Expand All @@ -201,17 +221,10 @@ pub fn parse_transcript_full(jsonl_path: &Path) -> io::Result<TranscriptSummary>
{
usage_by_id.insert(id, usage);
}

let texts: Vec<&str> = content_blocks(&record.message)
.iter()
.filter(|b| b.get("type").and_then(Value::as_str) == Some("text"))
.filter_map(|b| b.get("text").and_then(Value::as_str))
.collect();
if !texts.is_empty() {
final_text = Some(texts.join("\n"));
}
}

let final_text = last_assistant_text(&records);

let total_tokens = if usage_by_id.is_empty() {
None
} else {
Expand Down
Loading
Loading