Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "eval-magic"
version = "0.4.0"
version = "0.4.1"
edition = "2024"
description = "One-stop CLI for running skill evals — measure whether an agent skill actually shifts behavior."
license = "MIT"
Expand Down
19 changes: 15 additions & 4 deletions src/adapters/claude_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
//! block on a TTY and piped task data cannot become extra prompt context.

use super::cli_command::render_cli_model_arg;
use std::path::Path;

/// Copy/pasteable Claude Code dispatch command template.
pub(crate) fn claude_exec_command_template(
Expand Down Expand Up @@ -56,8 +57,13 @@ pub(crate) fn claude_parallel_dispatch_recipe(
}

/// Judge dispatch recipe over `judge-tasks.json`, one `claude -p` per task.
pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
///
/// Judges run from `judge_cwd` (the iteration directory), which is a common
/// ancestor of every judge prompt, verdict `response_path`, and agent
/// `outputs_dir`, and which has no write-guard hook.
pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>, judge_cwd: &Path) -> String {
let model_flag = model_flag.unwrap_or("--model");
let cwd = judge_cwd.display();
[
"Dispatch each judge task from judge-tasks.json with:".to_string(),
String::new(),
Expand All @@ -73,7 +79,9 @@ pub(crate) fn claude_judge_dispatch_recipe(model_flag: Option<&str>) -> String {
" model_arg=\"\"; [ -n \"$model\" ] && model_arg=\"".to_string()
+ model_flag
+ " $model\"",
" cd <eval-root> && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\".to_string(),
format!(
" cd \"{cwd}\" && claude -p --output-format stream-json --verbose --permission-mode acceptEdits $model_arg \\"
),
" \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
" </dev/null \\".to_string(),
" > \"$response_base.claude-events.jsonl\" \\".to_string(),
Expand All @@ -89,6 +97,7 @@ mod tests {
use super::{
claude_exec_command_template, claude_judge_dispatch_recipe, claude_parallel_dispatch_recipe,
};
use std::path::Path;

#[test]
fn exec_template_carries_required_stream_json_flags() {
Expand Down Expand Up @@ -127,10 +136,12 @@ mod tests {
}

#[test]
fn judge_recipe_drives_claude_p() {
let recipe = claude_judge_dispatch_recipe(Some("--model"));
fn judge_recipe_runs_from_iteration_dir() {
let recipe = claude_judge_dispatch_recipe(Some("--model"), Path::new("/work/iter-1"));
assert!(recipe.contains("claude -p"), "{recipe}");
assert!(recipe.contains("judge-tasks.json"), "{recipe}");
assert!(recipe.contains("response_path"), "{recipe}");
assert!(!recipe.contains("<eval-root>"), "{recipe}");
assert!(recipe.contains("cd \"/work/iter-1\" &&"), "{recipe}");
}
}
22 changes: 16 additions & 6 deletions src/adapters/codex_cli.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Codex CLI command rendering for `DispatchMechanism::Cli` guidance.

use super::cli_command::render_cli_model_arg;
use std::path::Path;

/// Copy/pasteable Codex dispatch command template. Stdin is detached so a
/// surrounding `xargs`/pipe cannot be treated as extra prompt context.
Expand Down Expand Up @@ -60,13 +61,20 @@ pub(crate) fn codex_parallel_dispatch_recipe(
.join("\n")
}

pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool) -> String {
/// Judge dispatch recipe over `judge-tasks.json`, one `codex exec` per task.
///
/// Judges run from `judge_cwd` (the iteration directory), which is a common
/// ancestor of every judge prompt, verdict `response_path`, and agent `outputs_dir`.
pub(crate) fn codex_judge_dispatch_recipe(
model_flag: Option<&str>,
guard: bool,
judge_cwd: &Path,
) -> String {
let hook_trust = if guard {
" --dangerously-bypass-hook-trust"
} else {
""
};
let model_flag = model_flag.unwrap_or("-m");
let cwd = judge_cwd.display();
[
"Dispatch each judge task from judge-tasks.json with:".to_string(),
String::new(),
Expand All @@ -81,15 +89,15 @@ pub(crate) fn codex_judge_dispatch_recipe(model_flag: Option<&str>, guard: bool)
" mkdir -p \"$(dirname \"$response_path\")\"".to_string(),
" if [ -n \"$model\" ]; then".to_string(),
format!(
" codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\"
" codex --ask-for-approval never exec --cd \"{cwd}\" --sandbox workspace-write{hook_trust} {model_flag} \"$model\" --json \\"
),
" \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
" </dev/null \\".to_string(),
" > \"$response_base.codex-events.jsonl\" \\".to_string(),
" 2> \"$response_base.codex-stderr.log\"".to_string(),
" else".to_string(),
format!(
" codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write{hook_trust} --json \\"
" codex --ask-for-approval never exec --cd \"{cwd}\" --sandbox workspace-write{hook_trust} --json \\"
),
" \"Read the file at $prompt_path and follow it exactly. You are a judge worker only: write the JSON verdict to $response_path, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.\" \\".to_string(),
" </dev/null \\".to_string(),
Expand All @@ -107,6 +115,7 @@ mod tests {
use super::{
codex_exec_command_template, codex_judge_dispatch_recipe, codex_parallel_dispatch_recipe,
};
use std::path::Path;

#[test]
fn exec_template_places_approval_policy_before_exec() {
Expand All @@ -133,19 +142,20 @@ mod tests {

#[test]
fn judge_recipe_places_approval_policy_before_exec() {
let recipe = codex_judge_dispatch_recipe(Some("-m"), true);
let recipe = codex_judge_dispatch_recipe(Some("-m"), true, Path::new("/work/iter-1"));

assert!(
recipe.contains(
" codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\"
" codex --ask-for-approval never exec --cd \"/work/iter-1\" --sandbox workspace-write --dangerously-bypass-hook-trust -m \"$model\" --json \\"
),
"{recipe}"
);
assert!(
recipe.contains(
" codex --ask-for-approval never exec --cd <eval-root> --sandbox workspace-write --dangerously-bypass-hook-trust --json \\"
" codex --ask-for-approval never exec --cd \"/work/iter-1\" --sandbox workspace-write --dangerously-bypass-hook-trust --json \\"
),
"{recipe}"
);
assert!(!recipe.contains("<eval-root>"), "{recipe}");
}
}
15 changes: 10 additions & 5 deletions src/adapters/harness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ pub trait HarnessAdapter {
/// The post-`grade` / post-`ingest` judge dispatch guidance for a
/// [`Cli`](crate::core::DispatchMechanism::Cli)-dispatch harness. `None`
/// leaves the generic in-session-style judge handoff in place.
fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option<String> {
fn cli_judge_next_steps(&self, _ctx: CliJudgeContext<'_>) -> Option<String> {
None
}

Expand Down Expand Up @@ -195,8 +195,9 @@ pub struct CliManifestContext<'a> {

/// Context for rendering a harness's one-shot CLI judge-dispatch guidance.
#[derive(Debug, Clone, Copy)]
pub struct CliJudgeContext {
pub struct CliJudgeContext<'a> {
pub guard: bool,
pub iteration_dir: &'a Path,
}

impl HarnessAdapter for ClaudeCodeAdapter {
Expand Down Expand Up @@ -261,8 +262,11 @@ impl HarnessAdapter for ClaudeCodeAdapter {
String::new(),
])
}
fn cli_judge_next_steps(&self, _ctx: CliJudgeContext) -> Option<String> {
Some(claude_judge_dispatch_recipe(self.cli_model_flag()))
fn cli_judge_next_steps(&self, ctx: CliJudgeContext<'_>) -> Option<String> {
Some(claude_judge_dispatch_recipe(
self.cli_model_flag(),
ctx.iteration_dir,
))
}
fn parse_transcript(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {
parse_transcript(path)
Expand Down Expand Up @@ -350,10 +354,11 @@ impl HarnessAdapter for CodexAdapter {
String::new(),
])
}
fn cli_judge_next_steps(&self, ctx: CliJudgeContext) -> Option<String> {
fn cli_judge_next_steps(&self, ctx: CliJudgeContext<'_>) -> Option<String> {
Some(codex_judge_dispatch_recipe(
self.cli_model_flag(),
ctx.guard,
ctx.iteration_dir,
))
}
fn parse_transcript(&self, path: &Path) -> io::Result<Vec<ToolInvocation>> {
Expand Down
17 changes: 13 additions & 4 deletions src/cli/commands/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,19 @@ use crate::validation;

/// Instruction text for a single judge worker, interpolated into the judge
/// dispatch guidance (at least the in-session branch).
///
/// `<dispatch_prompt_path>` and `<response_path>` are literal angle-bracket
/// placeholders in this string; presumably whoever dispatches the judge fills
/// them in per task — confirm against the consumers of this guidance.
const JUDGE_WORKER_PROMPT: &str = "Read the file at <dispatch_prompt_path> and follow it exactly. You are a judge worker only: write the JSON verdict to <response_path>, then reply with one sentence. Do not run eval-magic. Do not dispatch other judge tasks. Do not wait for other workers.";

fn judge_dispatch_guidance(ctx: &RunContext) -> String {
fn judge_dispatch_guidance(ctx: &RunContext, iteration: u32) -> String {
let iteration_dir = ctx
.workspace_root
.join(&ctx.skill_name)
.join(format!("iteration-{iteration}"));
match ctx.run_mode.mechanism() {
DispatchMechanism::InSession => {
format!("Dispatch each task as a judge subagent with:\n {JUDGE_WORKER_PROMPT}")
}
DispatchMechanism::Cli => adapter_for(ctx.harness)
.cli_judge_next_steps(CliJudgeContext {
guard: sandbox::guard_is_armed(&ctx.stage_root),
iteration_dir: &iteration_dir,
})
.unwrap_or_else(|| {
format!(
Expand Down Expand Up @@ -107,7 +112,7 @@ pub(crate) fn run_ingest(args: CommonArgs) -> anyhow::Result<()> {
.and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
.and_then(|v| v.get("total_tasks").and_then(serde_json::Value::as_u64));
let target_args = command_target_args(&ctx);
let judge_guidance = judge_dispatch_guidance(&ctx);
let judge_guidance = judge_dispatch_guidance(&ctx, iteration);
match total_tasks {
Some(0) => println!(
"\n✅ Ingest complete — no judge dispatches needed.\nNext: eval-magic finalize{target_args} --iteration {iteration}"
Expand Down Expand Up @@ -351,15 +356,19 @@ pub(crate) fn run_record_runs(args: CommonArgs) -> anyhow::Result<()> {
pipeline::record_runs(&dir, ctx.harness, mechanism, subagents_dir, args.overwrite)?;

println!(
"\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, missing transcript: {}",
"\nRecorded: {}, skipped (existing run.json): {}, skipped (no final message): {}, skipped (prompt unread): {}, missing transcript: {}",
result.recorded,
result.skipped_existing,
result.skipped_no_final_message,
result.skipped_prompt_unread,
result.missing_transcript
);
if let Some(warning) = result.transcript_warning(ctx.harness, mechanism) {
eprintln!("{warning}");
}
if let Some(warning) = result.prompt_unread_warning() {
eprintln!("{warning}");
}
Ok(())
}

Expand Down Expand Up @@ -502,7 +511,7 @@ pub(crate) fn run_grade(args: GradeArgs) -> anyhow::Result<()> {
);
}
let target_args = command_target_args(&ctx);
let judge_guidance = judge_dispatch_guidance(&ctx);
let judge_guidance = judge_dispatch_guidance(&ctx, iteration);
println!(
"\nNext: {judge_guidance}\nThen run: eval-magic grade{target_args} --iteration {iteration} --finalize"
);
Expand Down
8 changes: 5 additions & 3 deletions src/cli/run/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ pub fn build_dispatch_task(opts: &DispatchTaskOpts) -> Result<DispatchTask, RunE
run_record_path: cond_dir.join("run.json").to_string_lossy().into_owned(),
timing_path: cond_dir.join("timing.json").to_string_lossy().into_owned(),
agent_description,
dispatch_prompt_path: cond_dir
dispatch_prompt_path: Path::new(opts.outputs_dir)
.join("dispatch-prompt.txt")
.to_string_lossy()
.into_owned(),
Expand Down Expand Up @@ -681,9 +681,11 @@ mod tests {
}

#[test]
fn dispatch_prompt_path_under_cond_dir() {
fn dispatch_prompt_path_under_outputs_dir() {
let task = build_dispatch_task(&base_opts()).unwrap();
assert_eq!(task.dispatch_prompt_path, "/tmp/cond/dispatch-prompt.txt");
assert_eq!(task.dispatch_prompt_path, "/tmp/out/dispatch-prompt.txt");
assert_eq!(task.run_record_path, "/tmp/cond/run.json");
assert_eq!(task.timing_path, "/tmp/cond/timing.json");
}

/// Markdown fixture: an "Active Skills Directory" section listing two skills
/// (`test-driven-development` and `systematic-debugging`), each followed by a
/// `*Trigger:*` line. Which tests consume it is not visible in this excerpt.
const SAMPLE_DIRECTORY: &str = "## Active Skills Directory\n\n* **`test-driven-development`**\n * *Trigger:* Use whenever implementing code.\n* **`systematic-debugging`**\n * *Trigger:* Use when debugging.";
Expand Down
10 changes: 5 additions & 5 deletions src/cli/run/orchestrate/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,13 @@ pub(super) fn write_dispatch(
} else {
(cond_dir.join(format!("run-{run_idx}")), Some(run_idx))
};
// Create the per-run meta dir (run.json / timing.json /
// dispatch-prompt.txt), which lives above the env.
// Create the per-run meta dir (run.json / timing.json), which
// lives above the env.
fs::create_dir_all(&run_dir)?;
// The agent-under-test's cwd is its env, so its outputs land
// *inside* it — never above its sandbox.
// A hidden, per-(eval, condition, run) subtree keeps concurrent
// same-env subagents from colliding.
// *inside* it — never above its sandbox. A hidden,
// per-(eval, condition, run) subtree keeps concurrent same-env
// subagents from colliding.
let outputs_rel = match run_index {
None => format!("eval-{}/{cond_name}", ev.id),
Some(k) => format!("eval-{}/{cond_name}/run-{k}", ev.id),
Expand Down
5 changes: 4 additions & 1 deletion src/cli/run/runbook.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,10 @@ pub(crate) fn build_runbook(ctx: &RunbookContext) -> String {
agent_model: ctx.agent_model,
});
judge_recipe = adapter
.cli_judge_next_steps(CliJudgeContext { guard: ctx.guard })
.cli_judge_next_steps(CliJudgeContext {
guard: ctx.guard,
iteration_dir: ctx.iteration_dir,
})
.unwrap_or_else(|| {
"Dispatch each judge task `ingest` listed through the same harness CLI, \
capturing its transcript output, then finalize."
Expand Down
Loading
Loading