microsoft · Jiawen-CS · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
@@ -37,7 +37,7 @@ runs:
         # Mask the password in GitHub Actions logs
         Write-Output "::add-mask::$password"
 
-        "BC_CONTAINER_NAME=bcbench-$("${{ inputs.instance-id }}".Split('-')[1])" | Out-File -FilePath $env:GITHUB_ENV -Append
+        "BC_CONTAINER_NAME=bcbench-$("${{ inputs.instance-id }}".Split('-')[1].Split('_')[0])" | Out-File -FilePath $env:GITHUB_ENV -Append
         "BC_CONTAINER_USERNAME=admin" | Out-File -FilePath $env:GITHUB_ENV -Append
         "BC_CONTAINER_PASSWORD=$password" | Out-File -FilePath $env:GITHUB_ENV -Append
       shell: pwsh

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -16,7 +16,7 @@ This is a benchmark for evaluating coding agents on real-world Business Central
 - Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.)
 
 ## Categories
-BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.
+BC-Bench is category-based and designed to grow over time. It currently has two execution-based categories that update the public leaderboard, `bug-fix` and `test-generation`, plus a single counterfactual category `cf` (entries from `counterfactual.jsonl`) that reuses the bug-fix pipeline but only saves raw results — its metrics are computed offline in notebooks. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.
 
 ## Coding Patterns and Guidelines
 

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -41,7 +41,7 @@ jobs:
         id: random
         shell: pwsh
         run: |
-          $categories = @("bug-fix", "test-generation")
+          $categories = @("bug-fix", "test-generation", "cf")
           $selected = $categories | Get-Random
           echo "category=$selected" >> $env:GITHUB_OUTPUT
 

diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -23,6 +23,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "cf"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false

diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -29,6 +29,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "cf"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -39,6 +40,11 @@ on:
         required: false
         default: false
         type: boolean
+      entries-override:
+        description: "Optional JSON array of entry IDs to run (overrides get-entries). Example: [\"microsoftInternal__NAV-213524__cf-4\"]"
+        required: false
+        default: ""
+        type: string
       repeat:
         description: "Number of times to run sequentially (ignored for test runs)"
         required: false
@@ -60,17 +66,33 @@ env:
 
 jobs:
   get-entries:
+    if: ${{ inputs.entries-override == '' }}
     uses: ./.github/workflows/get-entries.yml
     with:
       test-run: ${{ inputs.test-run }}
       category: ${{ inputs.category }}
 
+  resolve-entries:
+    runs-on: ubuntu-latest
+    needs: get-entries
+    if: always()
+    outputs:
+      entries: ${{ steps.pick.outputs.entries }}
+    steps:
+      - id: pick
+        # Pass values via env: inline ${{ }} in bash strips the JSON quotes and allows script injection.
+        env:
+          OVERRIDE: ${{ inputs.entries-override }}
+          FETCHED: ${{ needs.get-entries.outputs.entries }}
+        run: |
+          echo "entries=${OVERRIDE:-$FETCHED}" >> "$GITHUB_OUTPUT"
+
   evaluate-with-copilot-cli:
     runs-on: [ GitHub-BCBench ]
-    needs: get-entries
+    needs: resolve-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
-    if: needs.get-entries.outputs.entries != '[]'
+    if: needs.resolve-entries.outputs.entries != '[]' && needs.resolve-entries.outputs.entries != ''
     environment:
       name: ado-read
       deployment: false
@@ -82,7 +104,7 @@ jobs:
       fail-fast: false
       max-parallel: 32
       matrix:
-        entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
+        entry: ${{ fromJson(needs.resolve-entries.outputs.entries) }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5

diff --git a/.github/workflows/dataset-validation.yml b/.github/workflows/dataset-validation.yml
@@ -10,6 +10,19 @@ on:
         required: false
         default: true
         type: boolean
+      category:
+        description: "Dataset category to validate"
+        required: false
+        default: "bug-fix"
+        type: choice
+        options:
+          - "bug-fix"
+          - "cf"
+      entries-override:
+        description: "Optional JSON array of entry IDs to validate (overrides get-entries)"
+        required: false
+        default: ""
+        type: string
       modified-only:
         description: "Only verify modified entries"
         required: false
@@ -20,16 +33,32 @@ on:
 
 jobs:
   get-entries:
+    if: ${{ inputs.entries-override == '' || inputs.entries-override == null }}
     uses: ./.github/workflows/get-entries.yml
     with:
       modified-only: ${{ inputs.modified-only || false }}
       test-run: ${{ inputs.test-run || false }}
-      category: "bug-fix"
+      category: ${{ inputs.category || 'bug-fix' }}
+
+  resolve-entries:
+    runs-on: ubuntu-latest
+    needs: get-entries
+    if: always()
+    outputs:
+      entries: ${{ steps.pick.outputs.entries }}
+    steps:
+      - id: pick
+        # Pass values via env: inline ${{ }} in bash strips the JSON quotes and allows script injection.
+        env:
+          OVERRIDE: ${{ inputs.entries-override }}
+          FETCHED: ${{ needs.get-entries.outputs.entries }}
+        run: |
+          echo "entries=${OVERRIDE:-$FETCHED}" >> "$GITHUB_OUTPUT"
 
   verify-build-and-tests:
     runs-on: [GitHub-BCBench]
-    needs: get-entries
-    if: needs.get-entries.outputs.entries != '[]'
+    needs: resolve-entries
+    if: needs.resolve-entries.outputs.entries != '[]' && needs.resolve-entries.outputs.entries != ''
     environment:
       name: ado-read
       deployment: false
@@ -40,7 +69,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
+        entry: ${{ fromJson(needs.resolve-entries.outputs.entries) }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5

diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
@@ -64,13 +64,15 @@ jobs:
           retention-days: ${{ inputs.mock && 1 || 30 }}
 
       - name: Azure Login with OIDC for bceval package
+        if: ${{ inputs.category != 'cf' }}
         uses: azure/login@v3
         with:
           client-id: ${{ secrets.AZURE_CLIENT_ID }}
           tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           allow-no-subscriptions: true
 
       - name: Upload result with bceval to Braintrust
+        if: ${{ inputs.category != 'cf' }}
         env:
           BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
           BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
@@ -95,7 +97,10 @@ jobs:
             --tags "${{ inputs.agent }},${MODEL_TAG},${{ inputs.category }}" ${{ !inputs.mock && '--upload-results' || '' }}
 
       - name: Update leaderboard in a new branch
-        if: ${{ !inputs.mock }}
+        # CF runs only save raw results (per-instance result files + summary artifact);
+        # they do not contribute to the public leaderboard. Metrics for CF are
+        # computed offline from the raw results in notebooks.
+        if: ${{ !inputs.mock && inputs.category != 'cf' }}
         run: |
           git fetch origin main
 

diff --git a/COUNTERFACTUAL.md b/COUNTERFACTUAL.md
@@ -0,0 +1,120 @@
+# Counterfactual Evaluation
+
+This document describes the **counterfactual (CF)** category in BC-Bench.
+
+## What Are Counterfactual Entries?
+
+A counterfactual (CF) entry is a **variant** of an existing base bug-fix entry. It reuses the same repository state (`repo`, `base_commit`, `project_paths`) but provides a **different fix and test pair** — testing whether an agent can solve a related-but-different version of the same bug.
+
+Each CF entry lives in [`dataset/counterfactual.jsonl`](dataset/counterfactual.jsonl) and references a base entry from [`dataset/bcbench.jsonl`](dataset/bcbench.jsonl).
+
+### Validation Contract
+
+CF entries follow the **exact same** validation contract as bug-fix:
+
+- **Before** applying the patch → `FAIL_TO_PASS` tests **FAIL**
+- **After** applying the patch → `FAIL_TO_PASS` tests **PASS**
+
+This means the same `BugFixPipeline` is reused for evaluation.
+
+### Naming Convention
+
+```
+microsoftInternal__NAV-210528         ← base entry (in bcbench.jsonl)
+microsoftInternal__NAV-210528__cf-1   ← first counterfactual variant
+microsoftInternal__NAV-210528__cf-2   ← second variant
+```
+
+## The `cf` Category
+
+All CF entries are exposed under a single category named `cf`. The base bug-fix
+evaluation pipeline is reused as-is. CF runs in CI **only save raw results**
+(per-instance result JSONL files + an aggregated `evaluation_summary.json`
+artifact); they do not push to the public leaderboard or upload to Braintrust.
+Downstream metrics (resolution rate, family fragility, severity, stability,
+etc.) are computed offline from the raw results in the analysis notebooks.
+
+Variant numbering still lives in the instance ID (`<base_id>__cf-<N>`), so any
+per-variant analysis remains possible by grouping on the `__cf-N` suffix.
+
+All CF entries share:
+- **Dataset file**: `counterfactual.jsonl`
+- **Entry class**: `CounterfactualEntry`
+- **Pipeline**: `BugFixPipeline` (reused)
+- **Result class**: `BugFixResult` (reused)
+- **Prompt template**: `counterfactual-template`
+
+## CF Entry Schema
+
+Each line in `counterfactual.jsonl` contains:
+
+| Field                        | Description                                       |
+| ---------------------------- | ------------------------------------------------- |
+| `instance_id`                | `<base_id>__cf-<N>` — unique identifier           |
+| `base_instance_id`           | ID of the base entry this variant is derived from |
+| `variant_description`        | Human-readable description of the variant         |
+| `failure_layer`              | Optional failure layer classification             |
+| `problem_statement_override` | Path to the CF-specific problem statement         |
+| `patch`                      | The counterfactual fix patch                      |
+| `test_patch`                 | The counterfactual test patch                     |
+| `FAIL_TO_PASS`               | Tests that must fail before fix, pass after       |
+| `PASS_TO_PASS`               | Tests that must pass both before and after        |
+
+**Note:** Fields like `repo`, `base_commit`, `project_paths`, `environment_setup_version`, and `created_at` are **not stored** in CF entries. They are resolved at load time from the base entry in `bcbench.jsonl`.
+
+## Architecture
+
+The counterfactual category integrates into BC-Bench's polymorphic category system:
+
+| Extension Point | Value |
+|---|---|
+| `EvaluationCategory` | `CF = "cf"` |
+| `is_counterfactual` | `True` only for `CF` |
+| `prompt_template_key` | `"counterfactual"` |
+| `dataset_path` | `dataset/counterfactual.jsonl` |
+| `entry_class` | `CounterfactualEntry` (resolves base fields at load time) |
+| `pipeline` | `BugFixPipeline` (reused — same FAIL→PASS contract) |
+| `result_class` | `BugFixResult` (reused) |
+| `summary_class` | `ExecutionBasedEvaluationResultSummary` (reused) |
+
+### Key File Reference
+
+| File | Purpose |
+|---|---|
+| [`dataset/counterfactual.jsonl`](dataset/counterfactual.jsonl) | All CF entries (one JSON per line) |
+| [`dataset/problemstatement/<id>/`](dataset/problemstatement/) | Problem statement for each CF entry |
+| [`src/bcbench/dataset/counterfactual_entry.py`](src/bcbench/dataset/counterfactual_entry.py) | CF entry model with base resolution |
+| [`src/bcbench/types.py`](src/bcbench/types.py) | Category registration (`CF`) |
+
+## CLI Usage
+
+```bash
+# List all CF entries
+uv run bcbench dataset list --category cf
+
+# List a few random CF entries (test run)
+uv run bcbench dataset list --category cf --test-run
+
+# View a specific CF entry
+uv run bcbench dataset view microsoftInternal__NAV-210528__cf-1 --category cf
+
+# Run agent on a CF entry
+uv run bcbench run copilot microsoftInternal__NAV-210528__cf-1 \
+  --category cf \
+  --repo-path /path/to/NAV
+
+# Full evaluation (build + test)
+uv run bcbench evaluate copilot microsoftInternal__NAV-210528__cf-1 \
+  --category cf \
+  --repo-path /path/to/NAV
+```
+
+## Analysis Notebooks
+
+Experiment notebooks for analyzing CF results are in [`notebooks/counterfactual-evaluation/`](notebooks/counterfactual-evaluation/):
+
+| Notebook | Purpose |
+|---|---|
+| `experiment1-base-performance.ipynb` | Instance-level compile/pass rates per model |
+| `experiment2-cf-sensitivity.ipynb` | Family fragility rate, severity, pattern analysis |
+| `experiment3-layered-failure.ipynb` | L1-L5 failure distribution, layer-conditioned fragility |