diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 368e4a909..65f3600e3 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -37,11 +37,17 @@ jobs: outputs: category: ${{ steps.random.outputs.category }} steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Setup Python with UV + uses: ./.github/actions/setup-python-uv + - name: Select random category id: random shell: pwsh run: | - $categories = @("bug-fix", "test-generation") + $categories = (uv run bcbench category list) -split "`n" | Where-Object { $_ } $selected = $categories | Get-Random echo "category=$selected" >> $env:GITHUB_OUTPUT diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml index 54201a583..7e93a1562 100644 --- a/.github/workflows/summarize-results.yml +++ b/.github/workflows/summarize-results.yml @@ -22,7 +22,7 @@ on: default: false category: description: "Evaluation category" - required: false + required: true type: string env: @@ -70,7 +70,11 @@ jobs: tenant-id: ${{ secrets.AZURE_TENANT_ID }} allow-no-subscriptions: true - - name: Upload result with bceval to Braintrust + - name: Resolve bceval evaluator config for category + id: bceval + run: uv run bcbench category bceval-config --category "${{ inputs.category }}" + + - name: Upload result using bceval env: BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} @@ -79,20 +83,23 @@ jobs: ADO_TOKEN=$(az account get-access-token --resource 499b84ac-1321-427f-aa17-267ca6975798 --query accessToken -o tsv) echo "::add-mask::$ADO_TOKEN" - # Install bc-eval from ADO feed using Python 3.11 (bc-eval's private feed only has cp311 wheels) - uv tool install bc-eval==0.1.3 --python 3.11 --index "https://anything:${ADO_TOKEN}@dynamicssmb2.pkgs.visualstudio.com/1fcb79e7-ab07-432a-a3c6-6cf5a88ba4a5/_packaging/PythonPackages%40Local/pypi/simple/" + uv tool install bc-eval[capi]==0.3.6 --python 3.12 
--index "https://anything:${ADO_TOKEN}@dynamicssmb2.pkgs.visualstudio.com/1fcb79e7-ab07-432a-a3c6-6cf5a88ba4a5/_packaging/BC_PythonFeed/pypi/simple/" - # Upload summary to Braintrust using bc-eval + # Upload summary using bc-eval MODEL_TAG="${{ inputs.model }}" MODEL_TAG="${MODEL_TAG//./-}" bceval metrics calculate \ + --feature-name "BC-Bench" \ + --eval-suite-name "${{ inputs.category }}" \ + --eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \ + --tags "${{ inputs.agent }},${MODEL_TAG}" \ + --source "${{ github.sha }}" \ --input-file "${{ inputs.results-dir }}/${{ github.run_id }}/${{ env.BCEVAL_RESULT_FILE }}" \ --evaluator-definitions "${{ github.workspace }}/evaluator/scores.py" \ - --evaluators "resolution_rate,build_rate${{ inputs.category == 'test-generation' && ',pre_patch_failed_rate,post_patch_passed_rate' || '' }}" \ + --evaluators "${{ steps.bceval.outputs.evaluators }}" \ + --core-score "${{ steps.bceval.outputs.core_score }}" \ --metric-definitions "${{ github.workspace }}/evaluator/metrics.py" \ - --metrics "bc_bench_metrics" \ - --eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \ - --tags "${{ inputs.agent }},${MODEL_TAG},${{ inputs.category }}" ${{ !inputs.mock && '--upload-results' || '' }} + --metrics "bc_bench_metrics" ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }} - name: Update leaderboard in a new branch if: ${{ !inputs.mock }} diff --git a/src/bcbench/cli.py b/src/bcbench/cli.py index 3fe0e26f0..afc57cb42 100644 --- a/src/bcbench/cli.py +++ b/src/bcbench/cli.py @@ -7,6 +7,7 @@ from typing_extensions import Annotated from bcbench.commands import dataset_app, evaluate_app, run_app +from bcbench.commands.category import category_app from bcbench.commands.collect import collect_app from bcbench.commands.result import result_app from bcbench.config import get_config @@ -33,6 +34,7 @@ app.add_typer(dataset_app, name="dataset") app.add_typer(evaluate_app, 
@category_app.command("list")
def list_categories() -> None:
    """Print all evaluation category names, one per line."""
    # NOTE: Typer uses the docstring above as the command's help text, so it is kept verbatim.
    # One newline-terminated value per enum member, in the enum's iteration order.
    sys.stdout.writelines(f"{member.value}\n" for member in EvaluationCategory)


@category_app.command("bceval-config")
def bceval_config(
    category: EvaluationCategoryOption,
    github_output: Annotated[
        Path | None,
        typer.Option(envvar="GITHUB_OUTPUT", help="Append outputs to this file (typically $GITHUB_OUTPUT)"),
    ] = None,
) -> None:
    """
    Print the bc-eval evaluator list and core score for a category as key=value lines.

    When run inside a GitHub Actions step with $GITHUB_OUTPUT set, the lines are
    appended to that file so they become step outputs. Otherwise they're written
    to stdout.
    """
    # NOTE: Typer uses the docstring above as the command's help text, so it is kept verbatim.
    # Two newline-terminated `key=value` records: the comma-joined evaluator
    # names first, then the core score name.
    evaluator_csv: str = ",".join(category.evaluators)
    text: str = f"evaluators={evaluator_csv}\ncore_score={category.core_score}\n"

    if not github_output:
        # No output file supplied (neither the option nor $GITHUB_OUTPUT): use stdout.
        sys.stdout.write(text)
    else:
        # Append mode, so entries already present in the file are preserved.
        with open(github_output, "a", encoding="utf-8") as sink:
            sink.write(text)

    # Mirror the records to stderr only when the GITHUB_ACTIONS environment
    # variable is set, so the workflow log shows what was emitted.
    if os.getenv("GITHUB_ACTIONS"):
        sys.stderr.write(text)
or an lm_checklist payload for scorer-driven categories. +type ExpectedOutput = str | Checklist class TestEntry(BaseModel): @@ -89,7 +106,7 @@ def get_task(self) -> str: pass @abstractmethod - def get_expected_output(self) -> str: + def get_expected_output(self) -> ExpectedOutput: pass def extract_project_name(self) -> str: diff --git a/src/bcbench/results/bceval_export.py b/src/bcbench/results/bceval_export.py index ad28f4727..72d957a7f 100644 --- a/src/bcbench/results/bceval_export.py +++ b/src/bcbench/results/bceval_export.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any -from bcbench.dataset import BaseDatasetEntry +from bcbench.dataset import BaseDatasetEntry, ExpectedOutput from bcbench.logger import get_logger from bcbench.results.base import BaseEvaluationResult from bcbench.types import EvaluationCategory @@ -29,7 +29,8 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run continue matched_entry = matching_entries[0] - input, expected = matched_entry.get_task(), matched_entry.get_expected_output() + task_input: str = matched_entry.get_task() + expected: ExpectedOutput = matched_entry.get_expected_output() metadata: dict[str, Any] = { "model": result.model, @@ -47,7 +48,7 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run bceval_result = { "id": result.instance_id, - "input": input, + "input": task_input, "expected": expected, "output": result.output, "context": "", diff --git a/src/bcbench/types.py b/src/bcbench/types.py index 59021dbad..b2bb1eacc 100644 --- a/src/bcbench/types.py +++ b/src/bcbench/types.py @@ -162,6 +162,30 @@ def pipeline(self) -> EvaluationPipeline: raise ValueError(f"Unknown evaluation category: {self}") + @property + def evaluators(self) -> list[str]: + """ + Names of bc-eval evaluators (from evaluator/scores.py) to run for this category. + + Used for uploading evaluation results to long term storage. 
+ """ + match self: + case EvaluationCategory.BUG_FIX: + return ["resolution_rate", "build_rate"] + case EvaluationCategory.TEST_GENERATION: + return ["resolution_rate", "build_rate", "pre_patch_failed_rate", "post_patch_passed_rate"] + + raise ValueError(f"Unknown evaluation category: {self}") + + @property + def core_score(self) -> str: + """Name of the evaluator whose value is considered as CoreScore, required by bc-eval.""" + match self: + case EvaluationCategory.BUG_FIX | EvaluationCategory.TEST_GENERATION: + return "ResolutionRate" + + raise ValueError(f"Unknown evaluation category: {self}") + @dataclass(frozen=True) class ContainerConfig: diff --git a/tests/test_category_command.py b/tests/test_category_command.py new file mode 100644 index 000000000..81d684db6 --- /dev/null +++ b/tests/test_category_command.py @@ -0,0 +1,51 @@ +from typer.testing import CliRunner + +from bcbench.cli import app +from bcbench.types import EvaluationCategory + +runner = CliRunner() + + +def test_bceval_config_prints_evaluators_and_core_score_to_stdout_when_no_github_output(monkeypatch): + monkeypatch.delenv("GITHUB_OUTPUT", raising=False) + monkeypatch.delenv("GITHUB_ACTIONS", raising=False) + + result = runner.invoke(app, ["category", "bceval-config", "--category", "bug-fix"]) + + assert result.exit_code == 0 + assert "evaluators=resolution_rate,build_rate" in result.stdout + assert "core_score=ResolutionRate" in result.stdout + + +def test_bceval_config_appends_to_github_output_file_when_set(tmp_path, monkeypatch): + output_file = tmp_path / "gh_output" + output_file.write_text("pre_existing=keep\n", encoding="utf-8") + monkeypatch.setenv("GITHUB_OUTPUT", str(output_file)) + monkeypatch.delenv("GITHUB_ACTIONS", raising=False) + + result = runner.invoke(app, ["category", "bceval-config", "--category", "test-generation"]) + + assert result.exit_code == 0 + contents = output_file.read_text(encoding="utf-8") + assert "pre_existing=keep" in contents + assert 
"evaluators=resolution_rate,build_rate,pre_patch_failed_rate,post_patch_passed_rate" in contents + assert "core_score=ResolutionRate" in contents + + +def test_bceval_config_supports_every_category(monkeypatch): + monkeypatch.delenv("GITHUB_OUTPUT", raising=False) + monkeypatch.delenv("GITHUB_ACTIONS", raising=False) + + for category in EvaluationCategory: + result = runner.invoke(app, ["category", "bceval-config", "--category", category.value]) + assert result.exit_code == 0, f"{category}: {result.stdout}" + assert f"evaluators={','.join(category.evaluators)}" in result.stdout + assert f"core_score={category.core_score}" in result.stdout + + +def test_list_prints_every_category_one_per_line(): + result = runner.invoke(app, ["category", "list"]) + + assert result.exit_code == 0 + lines = [line for line in result.stdout.splitlines() if line] + assert lines == [c.value for c in EvaluationCategory] diff --git a/tests/test_result_writer.py b/tests/test_result_writer.py index 4133f8714..0f08ee573 100644 --- a/tests/test_result_writer.py +++ b/tests/test_result_writer.py @@ -1,7 +1,7 @@ import json from unittest.mock import PropertyMock, patch -from bcbench.dataset.dataset_entry import _BugFixTestGenBase +from bcbench.dataset.dataset_entry import BugFixEntry, _BugFixTestGenBase from bcbench.results.bceval_export import write_bceval_results from bcbench.types import AgentMetrics, EvaluationCategory from tests.conftest import VALID_INSTANCE_ID, create_bugfix_result @@ -180,3 +180,33 @@ def test_handles_partial_none_metrics(self, tmp_path, sample_dataset_file, probl assert data["metadata"]["prompt_tokens"] == 0 assert data["metadata"]["completion_tokens"] == 1500 assert data["metadata"]["latency"] == 100.0 + + def test_preserves_dict_expected_output_for_lm_checklist_style_categories(self, tmp_path, sample_dataset_file, sample_bugfix_result_with_metrics, problem_statement_dir): + """A category whose `get_expected_output()` returns assertions (dict) must surface them as a 
JSON object in `expected`.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + + checklist_payload = { + "assertions": [ + {"text": "The output identifies the root cause.", "level": "critical"}, + {"text": "The output mentions the affected codeunit.", "level": "expected"}, + ] + } + + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file), + patch.object(BugFixEntry, "get_expected_output", lambda self: checklist_payload), + ): + write_bceval_results( + results=[sample_bugfix_result_with_metrics], + out_dir=output_dir, + run_id="test_run_checklist", + output_filename="results.jsonl", + category=EvaluationCategory.BUG_FIX, + ) + + with open(output_dir / "results.jsonl") as f: + data = json.loads(f.readline()) + + assert data["expected"] == checklist_payload diff --git a/tests/test_type_exhaustiveness.py b/tests/test_type_exhaustiveness.py index 3da3844f7..6fed69368 100644 --- a/tests/test_type_exhaustiveness.py +++ b/tests/test_type_exhaustiveness.py @@ -39,5 +39,24 @@ def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with input_text = entry.get_task() expected_output = entry.get_expected_output() assert isinstance(input_text, str) - assert isinstance(expected_output, str) - assert len(expected_output) > 0 + # ExpectedOutput is `str | Checklist`: string for execution-based categories, + # `{"assertions": [...]}` for lm_checklist-driven ones. 
def test_all_categories_have_evaluators():
    """Every category exposes a non-empty list of non-empty evaluator name strings."""
    for member in EvaluationCategory:
        names = member.evaluators
        assert isinstance(names, list)
        assert names, f"{member} must declare at least one evaluator"
        # Each entry has to be a string and must not be empty.
        for name in names:
            assert isinstance(name, str) and name


def test_all_categories_have_core_score():
    """Every category exposes a non-empty core score name string."""
    for member in EvaluationCategory:
        score_name = member.core_score
        assert isinstance(score_name, str)
        assert score_name, f"{member} must declare a non-empty core_score"