korabench · obendidi · Jun 10, 2026 · Jun 11, 2026
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ KORA measures the safety of AI models interacting with children across 8 risk ca
 
 The benchmark uses a 3-stage pipeline: **seed generation** creates short scenario outlines from the risk taxonomy, **scenario expansion** turns each seed into a detailed test scenario with validation, and **test runs** execute multi-turn conversations and judge the results.
 
-Each test is a 3-turn conversation between a simulated child user and the target model. A judge model then evaluates the conversation on safety and on the 7 conversation mechanisms (see [Mechanisms](#mechanisms) below). Scenarios can be tested with different prompt variants — `default` (no age context in the system prompt) and `child` (age-aware system prompt) — controlled via the `--prompts` flag.
+Each test is a 3-turn conversation between a simulated child user and the target model. A judge model then evaluates the conversation on safety and on the 7 conversation mechanisms (see [Mechanisms](#mechanisms) below). Scenarios can be tested with different prompt variants — `default` (no age context in the system prompt), `child` (age-aware system prompt), and `custom` (a caller-provided system prompt, see `--custom-prompt`) — controlled via the `--prompts` flag.
 
 ## Prerequisites
 
@@ -108,6 +108,7 @@ yarn kora expand-scenarios [model] [user-model]
 | `-i, --input <path>`  | Input seeds JSONL file (default: `data/scenarioSeeds.jsonl`)                             |
 | `-o, --output <path>` | Output scenarios JSONL file (default: `data/scenarios.jsonl`)                            |
 | `--risk-ids <ids>`    | Comma-separated risk IDs to restrict expansion to (default: all seeds in the input file) |
+| `--language <language>` | Conversation language for generated scenarios: `en` or `fr` (default: `en`). The first user message is generated in that language, the language is stored on each scenario, and the user-simulation and judge prompts of the downstream commands (`run`, `continue`, `reassess`) pick it up automatically. Scenario metadata (narrative, evaluation criteria, etc.) stays in English. |
 
 ### `run`
 
@@ -124,7 +125,8 @@ yarn kora run <target-model> [user-model]
 | `--judges <models>`   | Comma-separated judge models (default: `gpt-5.2:medium:limited`)                                                  |
 | `-i, --input <path>`  | Input scenarios JSONL file (default: `data/scenarios.jsonl`)                                                       |
 | `-o, --output <path>` | Output results JSON file (default: `data/results.json`)                                                            |
-| `--prompts <prompts>` | Comma-separated prompt variants to test (default: `default`)                                                       |
+| `--prompts <prompts>` | Comma-separated prompt variants to test: `default`, `child`, `custom` (default: `default`)                         |
+| `--custom-prompt <prompt>` | Target system prompt used verbatim for the `custom` variant — e.g. to compare your own product prompt against the built-in ones. Required when `--prompts` includes `custom`. The flag takes the prompt text itself; to load it from a file, use shell substitution: `--custom-prompt "$(cat my-prompt.md)"`. |
 | `--risk-ids <ids>`    | Comma-separated risk IDs to restrict the run to (default: all scenarios in the input file)                         |
 | `--limit <count>`     | Maximum number of test tasks to run — useful for smoke tests                                                       |
 | `--concurrency <n>`   | Max test tasks run in parallel (default: 10; use 1 for a single shared app account, e.g. `kora-app-*`)             |
@@ -186,6 +188,7 @@ yarn kora continue [user-model]
 | `--risk-ids <ids>`         | Comma-separated risk IDs to restrict the run to (default: all records in the input file)                                                                                                   |
 | `--target-models <ids>`    | Comma-separated target `modelId`s to restrict the run to (default: all `modelId`s in the input file)                                                                                       |
 | `--limit-per-risk <count>` | Maximum records per risk, selected deterministically by `id` (sorted lexicographically). Fails fast if any requested risk has fewer records than requested.                                |
+| `--custom-prompt <prompt>` | Target system prompt for records whose `prompt` is `custom` (same semantics as `run`)                                                                                                      |
 
 Each record is replayed with its **original** `modelId` as the target model, so 3-turn-vs-longer comparisons stay apples-to-apples per (scenario, model). The turn budget comes from `risk.conversationLength` in `packages/benchmark/data/risks.json`; records whose transcripts already meet or exceed the risk's length are re-judged without adding new turns.
 
@@ -551,12 +554,13 @@ The `run` command produces a result object with this structure:
 | `sums.as`         | Safety grades: `[failing, adequate, exemplary]`                                                                                                                   |
 | `sums.mechanisms` | Object keyed by mechanism id. Each value is `[failing, adequate, exemplary, occurrenceCount, notTriggered]`. The fifth slot counts criteria where the precondition was not met (M3/M5/M6/M7 only); those are excluded from the model's grade tally and surfaced as "N/A" downstream. Keys correspond to the ids in the [Mechanisms](#mechanisms) table. |
 
-Scores are grouped by risk category, risk, age range, and prompt variant. Two prompt variants are available:
+Scores are grouped by risk category, risk, age range, and prompt variant. Three prompt variants are available:
 
 - **`default`** — no age context in the system prompt.
 - **`child`** — the system prompt includes the child's age range.
+- **`custom`** — the system prompt passed via `--custom-prompt`, used verbatim.
 
-Use `--prompts default,child` to test both variants.
+Use `--prompts default,child` to test both built-in variants, or e.g. `--prompts child,custom --custom-prompt "$(cat my-prompt.md)"` to compare your own system prompt against the built-in child prompt on identical scenarios.
 
 ## Cost and duration
 

diff --git a/packages/benchmark/src/__tests__/expandScenario.test.ts b/packages/benchmark/src/__tests__/expandScenario.test.ts
@@ -173,6 +173,38 @@ describe("kora.expandScenario", () => {
     expect(context.getUserResponse).toHaveBeenCalledTimes(0);
   });
 
+  it("stamps the default language when no language is requested", async () => {
+    const seed = createScenarioSeed();
+    const context = createExpandContext([
+      validModelScenarioLight(),
+      passingValidation(),
+    ]);
+
+    const result = await kora.expandScenario(context, seed);
+
+    expect(result[0]!.language).toBe("en");
+  });
+
+  it("stamps the requested language and asks for user messages in it", async () => {
+    const seed = createScenarioSeed();
+    const context = createExpandContext(
+      [validModelScenarioLight(), passingValidation()],
+      ["Salut, tu peux m'aider ?"]
+    );
+
+    const result = await kora.expandScenario(context, seed, {language: "fr"});
+
+    expect(result[0]!.language).toBe("fr");
+    expect(result[0]!.firstUserMessage).toBe("Salut, tu peux m'aider ?");
+
+    const firstUserMessageRequest = vi.mocked(context.getUserResponse).mock
+      .calls[0]![0];
+    const systemMessage = firstUserMessageRequest.messages[0]!;
+    expect(systemMessage.role).toBe("system");
+    expect(systemMessage.content).toContain("LANGUAGE REQUIREMENT");
+    expect(systemMessage.content).toContain("French");
+  });
+
   it("uses ModelScenarioWithMemory for risks with provideUserContext", async () => {
     const seed = createScenarioSeed({
       riskCategoryId: "sexual_content_exploitation_and_predation",

diff --git a/packages/benchmark/src/__tests__/runTest.test.ts b/packages/benchmark/src/__tests__/runTest.test.ts
@@ -47,7 +47,9 @@ function createMockJudgeModel(): MockJudgeModel {
   } as MockJudgeModel;
 }
 
-function createTestContext(): TestContext & {judgeModel: MockJudgeModel} {
+function createTestContext(
+  customSystemPrompt?: string
+): TestContext & {judgeModel: MockJudgeModel} {
   const judgeModel = createMockJudgeModel();
   return {
     getUserResponse: vi.fn(
@@ -61,10 +63,16 @@ function createTestContext(): TestContext & {judgeModel: MockJudgeModel} {
       })
     ),
     judgeModels: {"test-judge": judgeModel},
+    customSystemPrompt,
     judgeModel,
   };
 }
 
+function assistantSystemMessage(context: TestContext, call = 0) {
+  const request = vi.mocked(context.getAssistantResponse).mock.calls[call]![0];
+  return request.messages[0]!;
+}
+
 //
 // Tests.
 //
@@ -74,6 +82,7 @@ describe("kora.runTest", () => {
   const keys = kora.mapScenarioToKeys(scenario, ScenarioPrompt.list);
   const defaultKey = keys.find(k => k.endsWith(":default"))!;
   const childKey = keys.find(k => k.endsWith(":child"))!;
+  const customKey = keys.find(k => k.endsWith(":custom"))!;
 
   it("produces a 3-turn conversation with 6 messages", async () => {
     const context = createTestContext();
@@ -150,6 +159,44 @@ describe("kora.runTest", () => {
     expect(result.prompt).toBe("child");
   });
 
+  it("prompt matches key — custom key yields 'custom' prompt", async () => {
+    const context = createTestContext("You are the product assistant.");
+
+    const result = await kora.runTest(context, scenario, customKey);
+
+    expect(result.prompt).toBe("custom");
+  });
+
+  it("custom key sends the context's custom system prompt verbatim", async () => {
+    const context = createTestContext("You are the product assistant.");
+
+    await kora.runTest(context, scenario, customKey);
+
+    const systemMessage = assistantSystemMessage(context);
+    expect(systemMessage.role).toBe("system");
+    expect(systemMessage.content).toBe("You are the product assistant.");
+  });
+
+  it("custom key throws when the context has no custom system prompt", async () => {
+    const context = createTestContext();
+
+    await expect(kora.runTest(context, scenario, customKey)).rejects.toThrow(
+      /custom/
+    );
+    expect(context.getAssistantResponse).toHaveBeenCalledTimes(0);
+  });
+
+  it("non-custom keys ignore the context's custom system prompt", async () => {
+    const context = createTestContext("You are the product assistant.");
+
+    await kora.runTest(context, scenario, defaultKey);
+
+    const systemMessage = assistantSystemMessage(context);
+    expect(systemMessage.content).not.toContain(
+      "You are the product assistant."
+    );
+  });
+
   it("calls each judge model twice (safety + mechanism assessment)", async () => {
     const context = createTestContext();
 

diff --git a/packages/benchmark/src/__tests__/scenarioKey.test.ts b/packages/benchmark/src/__tests__/scenarioKey.test.ts
@@ -85,17 +85,30 @@ describe("ScenarioKey", () => {
 
     expect(ScenarioKey.toAgeRange(key)).toBe("10to12");
   });
+
+  it("toAgeRange returns undefined for custom prompt", () => {
+    const key: ScenarioKey = {
+      riskCategoryId: "cat",
+      riskId: "risk",
+      ageRange: "10to12",
+      id: "id",
+      prompt: "custom",
+    };
+
+    expect(ScenarioKey.toAgeRange(key)).toBeUndefined();
+  });
 });
 
 describe("Scenario.toKeys", () => {
-  it("returns exactly 2 keys: one default and one child", () => {
+  it("returns exactly 3 keys: default, child and custom", () => {
     const scenario = createScenario();
 
     const keys = Scenario.toKeys(scenario, ScenarioPrompt.list);
 
-    expect(keys).toHaveLength(2);
+    expect(keys).toHaveLength(3);
     expect(keys[0]!.prompt).toBe("default");
     expect(keys[1]!.prompt).toBe("child");
+    expect(keys[2]!.prompt).toBe("custom");
   });
 
   it("returns only the requested prompt", () => {
@@ -109,12 +122,12 @@ describe("Scenario.toKeys", () => {
 });
 
 describe("kora.mapScenarioToKeys", () => {
-  it("returns exactly 2 string keys for a scenario", () => {
+  it("returns exactly 3 string keys for a scenario", () => {
     const scenario = createScenario();
 
     const keys = kora.mapScenarioToKeys(scenario, ScenarioPrompt.list);
 
-    expect(keys).toHaveLength(2);
+    expect(keys).toHaveLength(3);
   });
 
   it("keys contain scenario metadata", () => {
@@ -137,13 +150,14 @@ describe("kora.mapScenarioToKeys", () => {
     }
   });
 
-  it("one key ends with :default, one with :child", () => {
+  it("one key ends with :default, one with :child, one with :custom", () => {
     const scenario = createScenario();
 
     const keys = kora.mapScenarioToKeys(scenario, ScenarioPrompt.list);
 
     expect(keys.filter(k => k.endsWith(":default"))).toHaveLength(1);
     expect(keys.filter(k => k.endsWith(":child"))).toHaveLength(1);
+    expect(keys.filter(k => k.endsWith(":custom"))).toHaveLength(1);
   });
 
   it("returns only default key when prompts is ['default']", () => {

diff --git a/packages/benchmark/src/benchmark.ts b/packages/benchmark/src/benchmark.ts
@@ -7,6 +7,7 @@ import {
   TypedModelResponse,
 } from "@korabench/core";
 import {AgeRange} from "./model/ageRange.js";
+import {Language} from "./model/language.js";
 import {PopulationDistribution} from "./model/populationDistribution.js";
 import {ScenarioPrompt} from "./model/scenarioPrompt.js";
 
@@ -40,6 +41,9 @@ export interface TestContext {
   getAssistantResponse: (request: ModelRequest) => Promise<ModelResponse>;
   /** Record of judge model slug → callable judge model. */
   judgeModels: Record<string, JudgeModel>;
+  /** Target system prompt used verbatim for test keys whose prompt is
+   * "custom". Required when running such keys; ignored otherwise. */
+  customSystemPrompt?: string;
   /** Optional observability hook. No-op when undefined. */
   trace?: (event: TraceEvent) => void;
 }
@@ -49,6 +53,13 @@ export interface GenerationEvent<T> {
   items: readonly T[];
 }
 
+export interface ExpandScenarioOptions {
+  /** Language of the generated conversation content (first user message and
+   * subsequent simulated child turns). Defaults to English. Scenario metadata
+   * (narrative, evaluation criteria, ...) stays in English regardless. */
+  language?: Language;
+}
+
 export interface GenerateSeedsOptions {
   seedsPerTask?: number;
   totalSeeds?: number;
@@ -70,7 +81,8 @@ export interface Benchmark<TScenarioSeed, TScenario, TTestResult, TRunResult> {
   ): AsyncGenerator<GenerationEvent<TScenarioSeed>>;
   expandScenario(
     c: ExpandScenarioContext,
-    seed: TScenarioSeed
+    seed: TScenarioSeed,
+    options?: ExpandScenarioOptions
   ): Promise<readonly TScenario[]>;
   mapScenarioToKeys(
     scenario: TScenario,

diff --git a/packages/benchmark/src/index.ts b/packages/benchmark/src/index.ts
@@ -9,6 +9,7 @@ export * from "./kora.js";
 export * from "./model/ageRange.js";
 export * from "./model/assessmentGrade.js";
 export * from "./model/judgeAssessment.js";
+export * from "./model/language.js";
 export * from "./model/mechanism.js";
 export * from "./model/mechanismAssessment.js";
 export * from "./model/motivation.js";

diff --git a/packages/benchmark/src/kora.ts b/packages/benchmark/src/kora.ts
@@ -21,6 +21,7 @@ import {
 import {AgeRange} from "./model/ageRange.js";
 import {AssessmentGrade} from "./model/assessmentGrade.js";
 import {JudgeAssessment} from "./model/judgeAssessment.js";
+import {Language} from "./model/language.js";
 import {Mechanism} from "./model/mechanism.js";
 import {MechanismAssessment} from "./model/mechanismAssessment.js";
 import {Motivation} from "./model/motivation.js";
@@ -349,8 +350,9 @@ export const kora = Benchmark.new({
       }
     }
   },
-  async expandScenario(c, seed) {
+  async expandScenario(c, seed, options) {
     const maxAttempts = 2;
+    const language = options?.language ?? Language.default;
     const riskCategory = RiskCategory.find(seed.riskCategoryId);
     const risk = RiskCategory.findRisk(riskCategory, seed.riskId);
     const motivation = Motivation.listAll().find(
@@ -387,6 +389,7 @@ export const kora = Benchmark.new({
       const scenario: Scenario = {
         seed,
         firstUserMessage: "",
+        language,
         ...modelScenario,
       };
 
@@ -439,6 +442,13 @@ export const kora = Benchmark.new({
     const risk = RiskCategory.findRisk(riskCategory, scenario.seed.riskId);
     const prompt = key.prompt;
     const promptAgeRange = ScenarioKey.toAgeRange(key);
+    if (prompt === "custom" && c.customSystemPrompt === undefined) {
+      throw new Error(
+        `Test key ${keyString} uses the "custom" prompt but the test context has no customSystemPrompt.`
+      );
+    }
+    const customSystemPrompt =
+      prompt === "custom" ? c.customSystemPrompt : undefined;
 
     if (startMessages && startMessages.length % 2 !== 0) {
       throw new Error(
@@ -483,6 +493,7 @@ export const kora = Benchmark.new({
         const modelPrompt = conversationToNextMessagePrompt({
           ageRange: promptAgeRange,
           modelMemory: scenario.modelMemory,
+          customSystemPrompt,
         });
         const {output} = await c.getAssistantResponse({
           messages: [

diff --git a/packages/benchmark/src/model/__tests__/scenario.test.ts b/packages/benchmark/src/model/__tests__/scenario.test.ts
@@ -0,0 +1,24 @@
+import * as v from "valibot";
+import {describe, expect, it} from "vitest";
+import {createScenario} from "../../__tests__/fixtures.js";
+import {Scenario} from "../scenario.js";
+
+describe("Scenario.io language", () => {
+  it("parses scenarios without a language (pre-existing corpora)", () => {
+    const parsed = v.parse(Scenario.io, createScenario());
+
+    expect(parsed.language).toBeUndefined();
+  });
+
+  it("parses scenarios with a supported language", () => {
+    const parsed = v.parse(Scenario.io, createScenario({language: "fr"}));
+
+    expect(parsed.language).toBe("fr");
+  });
+
+  it("rejects unsupported languages", () => {
+    expect(() =>
+      v.parse(Scenario.io, {...createScenario(), language: "de"})
+    ).toThrow();
+  });
+});