Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,17 @@ jobs:
outputs:
category: ${{ steps.random.outputs.category }}
steps:
- name: Checkout repository
uses: actions/checkout@v5

- name: Setup Python with UV
uses: ./.github/actions/setup-python-uv

- name: Select random category
id: random
shell: pwsh
run: |
$categories = @("bug-fix", "test-generation")
$categories = (uv run bcbench category list) -split "`n" | Where-Object { $_ }
$selected = $categories | Get-Random
echo "category=$selected" >> $env:GITHUB_OUTPUT
Expand Down
25 changes: 16 additions & 9 deletions .github/workflows/summarize-results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ on:
default: false
category:
description: "Evaluation category"
required: false
required: true
type: string

env:
Expand Down Expand Up @@ -70,7 +70,11 @@ jobs:
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
allow-no-subscriptions: true

- name: Upload result with bceval to Braintrust
- name: Resolve bceval evaluator config for category
id: bceval
run: uv run bcbench category bceval-config --category "${{ inputs.category }}"

Comment thread
haoranpb marked this conversation as resolved.
- name: Upload result using bceval
env:
BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
Expand All @@ -79,20 +83,23 @@ jobs:
ADO_TOKEN=$(az account get-access-token --resource 499b84ac-1321-427f-aa17-267ca6975798 --query accessToken -o tsv)
echo "::add-mask::$ADO_TOKEN"

# Install bc-eval from ADO feed using Python 3.11 (bc-eval's private feed only has cp311 wheels)
uv tool install bc-eval==0.1.3 --python 3.11 --index "https://anything:${ADO_TOKEN}@dynamicssmb2.pkgs.visualstudio.com/1fcb79e7-ab07-432a-a3c6-6cf5a88ba4a5/_packaging/PythonPackages%40Local/pypi/simple/"
uv tool install bc-eval[capi]==0.3.6 --python 3.12 --index "https://anything:${ADO_TOKEN}@dynamicssmb2.pkgs.visualstudio.com/1fcb79e7-ab07-432a-a3c6-6cf5a88ba4a5/_packaging/BC_PythonFeed/pypi/simple/"

# Upload summary to Braintrust using bc-eval
# Upload summary using bc-eval
MODEL_TAG="${{ inputs.model }}"
MODEL_TAG="${MODEL_TAG//./-}"
bceval metrics calculate \
--feature-name "BC-Bench" \
--eval-suite-name "${{ inputs.category }}" \
--eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \
--tags "${{ inputs.agent }},${MODEL_TAG}" \
--source "${{ github.sha }}" \
--input-file "${{ inputs.results-dir }}/${{ github.run_id }}/${{ env.BCEVAL_RESULT_FILE }}" \
--evaluator-definitions "${{ github.workspace }}/evaluator/scores.py" \
--evaluators "resolution_rate,build_rate${{ inputs.category == 'test-generation' && ',pre_patch_failed_rate,post_patch_passed_rate' || '' }}" \
--evaluators "${{ steps.bceval.outputs.evaluators }}" \
--core-score "${{ steps.bceval.outputs.core_score }}" \
--metric-definitions "${{ github.workspace }}/evaluator/metrics.py" \
--metrics "bc_bench_metrics" \
--eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \
--tags "${{ inputs.agent }},${MODEL_TAG},${{ inputs.category }}" ${{ !inputs.mock && '--upload-results' || '' }}
--metrics "bc_bench_metrics" ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }}

- name: Update leaderboard in a new branch
if: ${{ !inputs.mock }}
Expand Down
2 changes: 2 additions & 0 deletions src/bcbench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing_extensions import Annotated

from bcbench.commands import dataset_app, evaluate_app, run_app
from bcbench.commands.category import category_app
from bcbench.commands.collect import collect_app
from bcbench.commands.result import result_app
from bcbench.config import get_config
Expand All @@ -33,6 +34,7 @@
app.add_typer(dataset_app, name="dataset")
app.add_typer(evaluate_app, name="evaluate")
app.add_typer(result_app, name="result")
app.add_typer(category_app, name="category")


@app.callback()
Expand Down
3 changes: 2 additions & 1 deletion src/bcbench/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""CLI commands for bcbench."""

from bcbench.commands.category import category_app
from bcbench.commands.dataset import dataset_app
from bcbench.commands.evaluate import evaluate_app
from bcbench.commands.run import run_app

__all__ = ["dataset_app", "evaluate_app", "run_app"]
__all__ = ["category_app", "dataset_app", "evaluate_app", "run_app"]
50 changes: 50 additions & 0 deletions src/bcbench/commands/category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import sys
from pathlib import Path

import typer
from typing_extensions import Annotated

from bcbench.cli_options import EvaluationCategoryOption
from bcbench.types import EvaluationCategory

category_app = typer.Typer(help="Category-specific configuration helpers")


@category_app.command("list")
def list_categories() -> None:
    """Write the value of every EvaluationCategory member to stdout, one per line."""
    sys.stdout.writelines(f"{member.value}\n" for member in EvaluationCategory)


@category_app.command("bceval-config")
def bceval_config(
    category: EvaluationCategoryOption,
    github_output: Annotated[
        Path | None,
        typer.Option(envvar="GITHUB_OUTPUT", help="Append outputs to this file (typically $GITHUB_OUTPUT)"),
    ] = None,
) -> None:
    """
    Emit the bc-eval evaluator names and core score of a category as key=value lines.

    Two lines are produced: ``evaluators=<comma-joined category.evaluators>`` and
    ``core_score=<category.core_score>``.

    If ``github_output`` is provided (it falls back to the GITHUB_OUTPUT environment
    variable), the lines are appended to that file so a GitHub Actions step can
    expose them as step outputs. Without it, the lines are written to stdout.
    """
    evaluator_csv: str = ",".join(category.evaluators)
    payload: str = f"evaluators={evaluator_csv}\ncore_score={category.core_score}\n"

    if not github_output:
        sys.stdout.write(payload)
    else:
        # Append rather than overwrite: the file may already hold other outputs.
        with github_output.open("a", encoding="utf-8") as sink:
            sink.write(payload)

    # When GITHUB_ACTIONS is set to a non-empty value, mirror the payload to
    # stderr so the workflow log shows what was emitted.
    running_in_actions = os.getenv("GITHUB_ACTIONS")
    if running_in_actions:
        sys.stderr.write(payload)
3 changes: 2 additions & 1 deletion src/bcbench/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""Dataset module for querying, validating, and analyzing dataset entries."""

from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, TestEntry, TestGenEntry
from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, ExpectedOutput, TestEntry, TestGenEntry

__all__ = [
"BaseDatasetEntry",
"BugFixEntry",
"ExpectedOutput",
"TestEntry",
"TestGenEntry",
]
23 changes: 20 additions & 3 deletions src/bcbench/dataset/dataset_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
from abc import abstractmethod
from pathlib import Path
from typing import Annotated, Self
from typing import Annotated, Literal, Self, TypedDict

from pydantic import BaseModel, ConfigDict, Field, model_validator

Expand All @@ -13,7 +13,24 @@

_config = get_config()

__all__ = ["BaseDatasetEntry", "BugFixEntry", "TestEntry", "TestGenEntry"]
__all__ = ["BaseDatasetEntry", "BugFixEntry", "ExpectedOutput", "TestEntry", "TestGenEntry"]
Comment thread
haoranpb marked this conversation as resolved.
Dismissed


type ChecklistLevel = Literal["critical", "expected", "aspirational"]


class ChecklistAssertion(TypedDict):
    """A single checklist assertion: its wording together with a ChecklistLevel."""

    # Wording of the assertion.
    text: str
    # One of the ChecklistLevel literals.
    level: ChecklistLevel


class Checklist(TypedDict):
    """Dict-shaped payload that holds a list of ChecklistAssertion items under "assertions"."""

    # The individual assertions making up the checklist.
    assertions: list[ChecklistAssertion]


# Patch-style string for execution-based categories (bug-fix, test-generation),
# or an lm_checklist payload for scorer-driven categories.
type ExpectedOutput = str | Checklist


class TestEntry(BaseModel):
Expand Down Expand Up @@ -89,7 +106,7 @@ def get_task(self) -> str:
pass

@abstractmethod
def get_expected_output(self) -> str:
def get_expected_output(self) -> ExpectedOutput:
pass

def extract_project_name(self) -> str:
Expand Down
7 changes: 4 additions & 3 deletions src/bcbench/results/bceval_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
from typing import Any

from bcbench.dataset import BaseDatasetEntry
from bcbench.dataset import BaseDatasetEntry, ExpectedOutput
from bcbench.logger import get_logger
from bcbench.results.base import BaseEvaluationResult
from bcbench.types import EvaluationCategory
Expand All @@ -29,7 +29,8 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run
continue

matched_entry = matching_entries[0]
input, expected = matched_entry.get_task(), matched_entry.get_expected_output()
task_input: str = matched_entry.get_task()
expected: ExpectedOutput = matched_entry.get_expected_output()
Comment thread
haoranpb marked this conversation as resolved.

metadata: dict[str, Any] = {
"model": result.model,
Expand All @@ -47,7 +48,7 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run

bceval_result = {
"id": result.instance_id,
"input": input,
"input": task_input,
"expected": expected,
"output": result.output,
"context": "",
Expand Down
24 changes: 24 additions & 0 deletions src/bcbench/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,30 @@ def pipeline(self) -> EvaluationPipeline:

raise ValueError(f"Unknown evaluation category: {self}")

@property
def evaluators(self) -> list[str]:
    """
    Return the bc-eval evaluator names (defined in evaluator/scores.py) for this category.

    The list is what gets passed to bc-eval when evaluation results are uploaded
    to long term storage. A new list is built on every access.

    Raises:
        ValueError: if the category has no evaluator list declared here.
    """
    if self == EvaluationCategory.BUG_FIX:
        return ["resolution_rate", "build_rate"]
    if self == EvaluationCategory.TEST_GENERATION:
        return ["resolution_rate", "build_rate", "pre_patch_failed_rate", "post_patch_passed_rate"]

    raise ValueError(f"Unknown evaluation category: {self}")

@property
def core_score(self) -> str:
    """
    Return the name of the evaluator that bc-eval should treat as the CoreScore.

    Raises:
        ValueError: if the category has no core score declared here.
    """
    resolution_rate_categories = (EvaluationCategory.BUG_FIX, EvaluationCategory.TEST_GENERATION)
    if self in resolution_rate_categories:
        return "ResolutionRate"

    raise ValueError(f"Unknown evaluation category: {self}")


@dataclass(frozen=True)
class ContainerConfig:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_category_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from typer.testing import CliRunner

from bcbench.cli import app
from bcbench.types import EvaluationCategory

runner = CliRunner()


def test_bceval_config_prints_evaluators_and_core_score_to_stdout_when_no_github_output(monkeypatch):
    """With GITHUB_OUTPUT and GITHUB_ACTIONS both unset, the key=value lines appear on stdout."""
    for env_name in ("GITHUB_OUTPUT", "GITHUB_ACTIONS"):
        monkeypatch.delenv(env_name, raising=False)

    outcome = runner.invoke(app, ["category", "bceval-config", "--category", "bug-fix"])

    assert outcome.exit_code == 0
    for expected_line in ("evaluators=resolution_rate,build_rate", "core_score=ResolutionRate"):
        assert expected_line in outcome.stdout


def test_bceval_config_appends_to_github_output_file_when_set(tmp_path, monkeypatch):
    """With GITHUB_OUTPUT pointing at a file, the lines are appended and existing content is kept."""
    gh_output_path = tmp_path / "gh_output"
    gh_output_path.write_text("pre_existing=keep\n", encoding="utf-8")
    monkeypatch.setenv("GITHUB_OUTPUT", str(gh_output_path))
    monkeypatch.delenv("GITHUB_ACTIONS", raising=False)

    cli_args = ["category", "bceval-config", "--category", "test-generation"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0
    written = gh_output_path.read_text(encoding="utf-8")
    expected_fragments = (
        "pre_existing=keep",
        "evaluators=resolution_rate,build_rate,pre_patch_failed_rate,post_patch_passed_rate",
        "core_score=ResolutionRate",
    )
    for fragment in expected_fragments:
        assert fragment in written


def test_bceval_config_supports_every_category(monkeypatch):
    """Every EvaluationCategory member can be passed to `category bceval-config` and is echoed on stdout."""
    for env_name in ("GITHUB_OUTPUT", "GITHUB_ACTIONS"):
        monkeypatch.delenv(env_name, raising=False)

    for category in EvaluationCategory:
        cli_args = ["category", "bceval-config", "--category", category.value]
        outcome = runner.invoke(app, cli_args)
        printed = outcome.stdout

        assert outcome.exit_code == 0, f"{category}: {outcome.stdout}"
        expected_evaluators_line = f"evaluators={','.join(category.evaluators)}"
        expected_core_score_line = f"core_score={category.core_score}"
        assert expected_evaluators_line in printed
        assert expected_core_score_line in printed


def test_list_prints_every_category_one_per_line():
    """`category list` prints exactly the EvaluationCategory values, in enum order, one per line."""
    outcome = runner.invoke(app, ["category", "list"])

    assert outcome.exit_code == 0
    expected_names = [member.value for member in EvaluationCategory]
    # Drop empty lines before comparing, as only the non-blank output matters.
    printed_names = list(filter(None, outcome.stdout.splitlines()))
    assert printed_names == expected_names
32 changes: 31 additions & 1 deletion tests/test_result_writer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
from unittest.mock import PropertyMock, patch

from bcbench.dataset.dataset_entry import _BugFixTestGenBase
from bcbench.dataset.dataset_entry import BugFixEntry, _BugFixTestGenBase
from bcbench.results.bceval_export import write_bceval_results
from bcbench.types import AgentMetrics, EvaluationCategory
from tests.conftest import VALID_INSTANCE_ID, create_bugfix_result
Expand Down Expand Up @@ -180,3 +180,33 @@ def test_handles_partial_none_metrics(self, tmp_path, sample_dataset_file, probl
assert data["metadata"]["prompt_tokens"] == 0
assert data["metadata"]["completion_tokens"] == 1500
assert data["metadata"]["latency"] == 100.0

def test_preserves_dict_expected_output_for_lm_checklist_style_categories(self, tmp_path, sample_dataset_file, sample_bugfix_result_with_metrics, problem_statement_dir):
    """When `get_expected_output()` yields a dict of assertions, the exported `expected` field must be that same JSON object."""
    export_dir = tmp_path / "output"
    export_dir.mkdir()

    root_cause_assertion = {"text": "The output identifies the root cause.", "level": "critical"}
    codeunit_assertion = {"text": "The output mentions the affected codeunit.", "level": "expected"}
    checklist_payload = {"assertions": [root_cause_assertion, codeunit_assertion]}

    # Point the dataset and problem-statement lookups at the fixtures, and force
    # BugFixEntry to report a checklist-shaped expected output.
    statement_dir_patch = patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir))
    dataset_path_patch = patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file)
    expected_output_patch = patch.object(BugFixEntry, "get_expected_output", lambda self: checklist_payload)

    with statement_dir_patch, dataset_path_patch, expected_output_patch:
        write_bceval_results(
            results=[sample_bugfix_result_with_metrics],
            out_dir=export_dir,
            run_id="test_run_checklist",
            output_filename="results.jsonl",
            category=EvaluationCategory.BUG_FIX,
        )

    # Only the first JSONL record is inspected.
    with open(export_dir / "results.jsonl") as results_file:
        exported = json.loads(results_file.readline())

    assert exported["expected"] == checklist_payload
23 changes: 21 additions & 2 deletions tests/test_type_exhaustiveness.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,24 @@ def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with
input_text = entry.get_task()
expected_output = entry.get_expected_output()
assert isinstance(input_text, str)
assert isinstance(expected_output, str)
assert len(expected_output) > 0
# ExpectedOutput is `str | Checklist`: string for execution-based categories,
# `{"assertions": [...]}` for lm_checklist-driven ones.
if isinstance(expected_output, dict):
assert "assertions" in expected_output
else:
assert isinstance(expected_output, str)
assert expected_output


def test_all_categories_have_evaluators():
    """Each category exposes a non-empty list of non-empty evaluator name strings."""
    for category in EvaluationCategory:
        declared = category.evaluators
        assert isinstance(declared, list)
        assert declared, f"{category} must declare at least one evaluator"
        for evaluator_name in declared:
            assert isinstance(evaluator_name, str) and evaluator_name


def test_all_categories_have_core_score():
    """Each category exposes a non-empty string as its core score."""
    for category in EvaluationCategory:
        score_name = category.core_score
        assert isinstance(score_name, str)
        assert score_name, f"{category} must declare a non-empty core_score"
Loading