microsoft · haoranpb · May 22, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -37,11 +37,17 @@ jobs:
     outputs:
       category: ${{ steps.random.outputs.category }}
     steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5  # local actions under ./.github/actions are only resolvable after checkout
+
+      - name: Setup Python with UV
+        uses: ./.github/actions/setup-python-uv  # presumably provides `uv` for the `uv run` call in the next step — confirm against the action
+
       - name: Select random category
         id: random
         shell: pwsh
         run: |
-          $categories = @("bug-fix", "test-generation")
+          $categories = (uv run bcbench category list) -split "`n" | Where-Object { $_ }
           $selected = $categories | Get-Random
           echo "category=$selected" >> $env:GITHUB_OUTPUT
 

diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
@@ -22,7 +22,7 @@ on:
         default: false
       category:
         description: "Evaluation category"
-        required: false
+        required: true
         type: string
 
 env:
@@ -70,7 +70,11 @@ jobs:
           tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           allow-no-subscriptions: true
 
-      - name: Upload result with bceval to Braintrust
+      - name: Resolve bceval evaluator config for category
+        id: bceval  # outputs are read later as steps.bceval.outputs.evaluators and steps.bceval.outputs.core_score
+        run: uv run bcbench category bceval-config --category "${{ inputs.category }}"  # NOTE(review): input is interpolated straight into the shell command — consider passing it through env
+
+      - name: Upload result using bceval
         env:
           BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
           BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
@@ -79,20 +83,23 @@ jobs:
           ADO_TOKEN=$(az account get-access-token --resource 499b84ac-1321-427f-aa17-267ca6975798 --query accessToken -o tsv)
           echo "::add-mask::$ADO_TOKEN"
 
-          # Install bc-eval from ADO feed using Python 3.11 (bc-eval's private feed only has cp311 wheels)
-          uv tool install bc-eval==0.1.3 --python 3.11 --index "https://anything:${ADO_TOKEN}@dynamicssmb2.pkgs.visualstudio.com/1fcb79e7-ab07-432a-a3c6-6cf5a88ba4a5/_packaging/PythonPackages%40Local/pypi/simple/"
+          uv tool install "bc-eval[capi]==0.3.6" --python 3.12 --index "https://anything:${ADO_TOKEN}@dynamicssmb2.pkgs.visualstudio.com/1fcb79e7-ab07-432a-a3c6-6cf5a88ba4a5/_packaging/BC_PythonFeed/pypi/simple/"
 
-          # Upload summary to Braintrust using bc-eval
+          # Upload summary using bc-eval
           MODEL_TAG="${{ inputs.model }}"
           MODEL_TAG="${MODEL_TAG//./-}"
           bceval metrics calculate \
+            --feature-name "BC-Bench" \
+            --eval-suite-name "${{ inputs.category }}" \
+            --eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \
+            --tags "${{ inputs.agent }},${MODEL_TAG}" \
+            --source "${{ github.sha }}" \
             --input-file "${{ inputs.results-dir }}/${{ github.run_id }}/${{ env.BCEVAL_RESULT_FILE }}" \
             --evaluator-definitions "${{ github.workspace }}/evaluator/scores.py" \
-            --evaluators "resolution_rate,build_rate${{ inputs.category == 'test-generation' && ',pre_patch_failed_rate,post_patch_passed_rate' || '' }}" \
+            --evaluators "${{ steps.bceval.outputs.evaluators }}" \
+            --core-score "${{ steps.bceval.outputs.core_score }}" \
             --metric-definitions "${{ github.workspace }}/evaluator/metrics.py" \
-            --metrics "bc_bench_metrics" \
-            --eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \
-            --tags "${{ inputs.agent }},${MODEL_TAG},${{ inputs.category }}" ${{ !inputs.mock && '--upload-results' || '' }}
+            --metrics "bc_bench_metrics" ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }}
 
       - name: Update leaderboard in a new branch
         if: ${{ !inputs.mock }}

diff --git a/src/bcbench/cli.py b/src/bcbench/cli.py
@@ -7,6 +7,7 @@
 from typing_extensions import Annotated
 
 from bcbench.commands import dataset_app, evaluate_app, run_app
+from bcbench.commands.category import category_app
 from bcbench.commands.collect import collect_app
 from bcbench.commands.result import result_app
 from bcbench.config import get_config
@@ -33,6 +34,7 @@
 app.add_typer(dataset_app, name="dataset")
 app.add_typer(evaluate_app, name="evaluate")
 app.add_typer(result_app, name="result")
+app.add_typer(category_app, name="category")
 
 
 @app.callback()

diff --git a/src/bcbench/commands/__init__.py b/src/bcbench/commands/__init__.py
@@ -1,7 +1,8 @@
 """CLI commands for bcbench."""
 
+from bcbench.commands.category import category_app
 from bcbench.commands.dataset import dataset_app
 from bcbench.commands.evaluate import evaluate_app
 from bcbench.commands.run import run_app
 
-__all__ = ["dataset_app", "evaluate_app", "run_app"]
+__all__ = ["category_app", "dataset_app", "evaluate_app", "run_app"]
diff --git a/src/bcbench/commands/category.py b/src/bcbench/commands/category.py
@@ -0,0 +1,50 @@
+import os
+import sys
+from pathlib import Path
+
+import typer
+from typing_extensions import Annotated
+
+from bcbench.cli_options import EvaluationCategoryOption
+from bcbench.types import EvaluationCategory
+
+category_app = typer.Typer(help="Category-specific configuration helpers")
+
+
+@category_app.command("list")
+def list_categories() -> None:
+    """Emit the value of every EvaluationCategory member on stdout, each newline-terminated."""
+    rows = (f"{member.value}\n" for member in EvaluationCategory)
+    sys.stdout.writelines(rows)
+
+
+@category_app.command("bceval-config")
+def bceval_config(
+    category: EvaluationCategoryOption,
+    github_output: Annotated[
+        Path | None,
+        typer.Option(envvar="GITHUB_OUTPUT", help="Append outputs to this file (typically $GITHUB_OUTPUT)"),
+    ] = None,
+) -> None:
+    """
+    Emit the bc-eval evaluator list and core score for a category as key=value lines.
+
+    If `github_output` is given (via the option or the GITHUB_OUTPUT env var), the
+    `evaluators=` and `core_score=` lines are appended to that file; otherwise they
+    go to stdout. When GITHUB_ACTIONS is set they are additionally written to stderr.
+    """
+    lines: list[str] = [
+        f"evaluators={','.join(category.evaluators)}",
+        f"core_score={category.core_score}",
+    ]
+    payload: str = "\n".join(lines) + "\n"
+
+    if github_output:
+        with open(github_output, "a", encoding="utf-8") as file:
+            file.write(payload)
+    else:
+        sys.stdout.write(payload)
+
+    # Mirror the payload to stderr only under GitHub Actions, so workflow logs show what was emitted.
+    if os.getenv("GITHUB_ACTIONS"):
+        sys.stderr.write(payload)
diff --git a/src/bcbench/dataset/__init__.py b/src/bcbench/dataset/__init__.py
@@ -1,10 +1,11 @@
 """Dataset module for querying, validating and analyze dataset entries."""
 
-from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, TestEntry, TestGenEntry
+from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, ExpectedOutput, TestEntry, TestGenEntry
 
 __all__ = [
     "BaseDatasetEntry",
     "BugFixEntry",
+    "ExpectedOutput",
     "TestEntry",
     "TestGenEntry",
 ]
diff --git a/src/bcbench/dataset/dataset_entry.py b/src/bcbench/dataset/dataset_entry.py
@@ -4,7 +4,7 @@
 import re
 from abc import abstractmethod
 from pathlib import Path
-from typing import Annotated, Self
+from typing import Annotated, Literal, Self, TypedDict
 
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
@@ -13,7 +13,24 @@
 
 _config = get_config()
 
-__all__ = ["BaseDatasetEntry", "BugFixEntry", "TestEntry", "TestGenEntry"]
+__all__ = ["BaseDatasetEntry", "BugFixEntry", "ExpectedOutput", "TestEntry", "TestGenEntry"]
+
+
+type ChecklistLevel = Literal["critical", "expected", "aspirational"]  # allowed values for ChecklistAssertion.level
+
+
+class ChecklistAssertion(TypedDict):
+    text: str  # the assertion statement
+    level: ChecklistLevel  # "critical", "expected" or "aspirational"
+
+
+class Checklist(TypedDict):
+    assertions: list[ChecklistAssertion]
+
+
+# Return type of get_expected_output(): a patch-style string for execution-based categories
+# (bug-fix, test-generation), or an lm_checklist payload (Checklist) for scorer-driven categories.
+type ExpectedOutput = str | Checklist
 
 
 class TestEntry(BaseModel):
@@ -89,7 +106,7 @@ def get_task(self) -> str:
         pass
 
     @abstractmethod
-    def get_expected_output(self) -> str:
+    def get_expected_output(self) -> ExpectedOutput:
         pass
 
     def extract_project_name(self) -> str:

diff --git a/src/bcbench/results/bceval_export.py b/src/bcbench/results/bceval_export.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 from typing import Any
 
-from bcbench.dataset import BaseDatasetEntry
+from bcbench.dataset import BaseDatasetEntry, ExpectedOutput
 from bcbench.logger import get_logger
 from bcbench.results.base import BaseEvaluationResult
 from bcbench.types import EvaluationCategory
@@ -29,7 +29,8 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run
                 continue
 
             matched_entry = matching_entries[0]
-            input, expected = matched_entry.get_task(), matched_entry.get_expected_output()
+            task_input: str = matched_entry.get_task()
+            expected: ExpectedOutput = matched_entry.get_expected_output()
 
             metadata: dict[str, Any] = {
                 "model": result.model,
@@ -47,7 +48,7 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run
 
             bceval_result = {
                 "id": result.instance_id,
-                "input": input,
+                "input": task_input,
                 "expected": expected,
                 "output": result.output,
                 "context": "",

diff --git a/src/bcbench/types.py b/src/bcbench/types.py
@@ -162,6 +162,30 @@ def pipeline(self) -> EvaluationPipeline:
 
         raise ValueError(f"Unknown evaluation category: {self}")
 
+    @property
+    def evaluators(self) -> list[str]:
+        """
+        Names of the bc-eval evaluators (defined in evaluator/scores.py) to run for this category.
+
+        Used when uploading evaluation results to long-term storage; raises ValueError for an unhandled category.
+        """
+        match self:
+            case EvaluationCategory.BUG_FIX:
+                return ["resolution_rate", "build_rate"]
+            case EvaluationCategory.TEST_GENERATION:
+                return ["resolution_rate", "build_rate", "pre_patch_failed_rate", "post_patch_passed_rate"]
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
+    @property
+    def core_score(self) -> str:
+        """Name of the evaluator whose value is used as the CoreScore required by bc-eval; raises ValueError for an unhandled category."""
+        match self:
+            case EvaluationCategory.BUG_FIX | EvaluationCategory.TEST_GENERATION:
+                return "ResolutionRate"  # NOTE(review): PascalCase, unlike the snake_case ids returned by `evaluators` — confirm bc-eval expects this form
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
 
 @dataclass(frozen=True)
 class ContainerConfig:

diff --git a/tests/test_category_command.py b/tests/test_category_command.py
@@ -0,0 +1,51 @@
+from typer.testing import CliRunner
+
+from bcbench.cli import app
+from bcbench.types import EvaluationCategory
+
+runner = CliRunner()
+
+
+def test_bceval_config_prints_evaluators_and_core_score_to_stdout_when_no_github_output(monkeypatch):
+    for variable in ("GITHUB_OUTPUT", "GITHUB_ACTIONS"):
+        monkeypatch.delenv(variable, raising=False)
+
+    outcome = runner.invoke(app, ["category", "bceval-config", "--category", "bug-fix"])
+
+    assert outcome.exit_code == 0
+    for expected_line in ("evaluators=resolution_rate,build_rate", "core_score=ResolutionRate"):
+        assert expected_line in outcome.stdout
+
+
+def test_bceval_config_appends_to_github_output_file_when_set(tmp_path, monkeypatch):
+    gh_output = tmp_path / "gh_output"
+    gh_output.write_text("pre_existing=keep\n", encoding="utf-8")
+    monkeypatch.setenv("GITHUB_OUTPUT", str(gh_output))
+    monkeypatch.delenv("GITHUB_ACTIONS", raising=False)
+
+    outcome = runner.invoke(app, ["category", "bceval-config", "--category", "test-generation"])
+
+    assert outcome.exit_code == 0
+    written = gh_output.read_text(encoding="utf-8")
+    wanted = ("pre_existing=keep", "evaluators=resolution_rate,build_rate,pre_patch_failed_rate,post_patch_passed_rate", "core_score=ResolutionRate")
+    for fragment in wanted:
+        assert fragment in written
+
+
+def test_bceval_config_supports_every_category(monkeypatch):
+    for variable in ("GITHUB_OUTPUT", "GITHUB_ACTIONS"):
+        monkeypatch.delenv(variable, raising=False)
+
+    for member in EvaluationCategory:
+        outcome = runner.invoke(app, ["category", "bceval-config", "--category", member.value])
+        assert outcome.exit_code == 0, f"{member}: {outcome.stdout}"
+        assert f"evaluators={','.join(member.evaluators)}" in outcome.stdout
+        assert f"core_score={member.core_score}" in outcome.stdout
+
+
+def test_list_prints_every_category_one_per_line():
+    outcome = runner.invoke(app, ["category", "list"])
+
+    assert outcome.exit_code == 0
+    printed = list(filter(None, outcome.stdout.splitlines()))
+    assert printed == [member.value for member in EvaluationCategory]
diff --git a/tests/test_result_writer.py b/tests/test_result_writer.py
@@ -1,7 +1,7 @@
 import json
 from unittest.mock import PropertyMock, patch
 
-from bcbench.dataset.dataset_entry import _BugFixTestGenBase
+from bcbench.dataset.dataset_entry import BugFixEntry, _BugFixTestGenBase
 from bcbench.results.bceval_export import write_bceval_results
 from bcbench.types import AgentMetrics, EvaluationCategory
 from tests.conftest import VALID_INSTANCE_ID, create_bugfix_result
@@ -180,3 +180,33 @@ def test_handles_partial_none_metrics(self, tmp_path, sample_dataset_file, probl
         assert data["metadata"]["prompt_tokens"] == 0
         assert data["metadata"]["completion_tokens"] == 1500
         assert data["metadata"]["latency"] == 100.0
+
+    def test_preserves_dict_expected_output_for_lm_checklist_style_categories(self, tmp_path, sample_dataset_file, sample_bugfix_result_with_metrics, problem_statement_dir):
+        """When `get_expected_output()` returns a dict of assertions, the exported JSONL row must carry it unchanged as a JSON object in `expected`."""
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        checklist_payload = {
+            "assertions": [
+                {"text": "The output identifies the root cause.", "level": "critical"},
+                {"text": "The output mentions the affected codeunit.", "level": "expected"},
+            ]
+        }
+
+        with (
+            patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+            patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file),
+            patch.object(BugFixEntry, "get_expected_output", lambda self: checklist_payload),  # make the bug-fix entry return the dict payload
+        ):
+            write_bceval_results(
+                results=[sample_bugfix_result_with_metrics],
+                out_dir=output_dir,
+                run_id="test_run_checklist",
+                output_filename="results.jsonl",
+                category=EvaluationCategory.BUG_FIX,
+            )
+
+        with open(output_dir / "results.jsonl") as f:
+            data = json.loads(f.readline())  # only the first JSONL row is inspected
+
+        assert data["expected"] == checklist_payload
diff --git a/tests/test_type_exhaustiveness.py b/tests/test_type_exhaustiveness.py
@@ -39,5 +39,24 @@ def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with
         input_text = entry.get_task()
         expected_output = entry.get_expected_output()
         assert isinstance(input_text, str)
-        assert isinstance(expected_output, str)
-        assert len(expected_output) > 0
+        # ExpectedOutput is `str | Checklist`: string for execution-based categories,
+        # `{"assertions": [...]}` for lm_checklist-driven ones.
+        if isinstance(expected_output, dict):
+            assert "assertions" in expected_output
+        else:
+            assert isinstance(expected_output, str)
+            assert expected_output
+
+
+def test_all_categories_have_evaluators():
+    for member in EvaluationCategory:
+        names = member.evaluators
+        assert isinstance(names, list)
+        assert len(names) > 0, f"{member} must declare at least one evaluator"
+        assert all(isinstance(name, str) and name != "" for name in names)
+
+
+def test_all_categories_have_core_score():
+    for member in EvaluationCategory:
+        assert isinstance(member.core_score, str)
+        assert member.core_score != "", f"{member} must declare a non-empty core_score"