microsoft · WaelAbuSeada · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
@@ -14,6 +14,10 @@ inputs:
   github-token:
     description: GitHub token for accessing public repositories
     required: true
+  # Forwarded to scripts/Setup-ContainerAndRepository.ps1 as -DatasetPath by the run step below.
+  dataset-path:
+    description: Path to the dataset file
+    required: false
+    default: "dataset/bcbench.jsonl"
   skip-container:
     description: Skip BC container setup (only clone repository)
     required: false
@@ -66,5 +70,5 @@ runs:
         $env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv
         Write-Output "::add-mask::$env:ADO_TOKEN"
 
-        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
+        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" -DatasetPath "${{ inputs.dataset-path }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
       shell: pwsh
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -41,7 +41,7 @@ jobs:
         id: random
         shell: pwsh
         run: |
-          $categories = @("bug-fix", "test-generation")
+          $categories = @("bug-fix", "test-generation", "code-review")
           $selected = $categories | Get-Random
           echo "category=$selected" >> $env:GITHUB_OUTPUT
 

diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -23,6 +23,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -33,6 +34,11 @@ on:
         required: false
         default: false
         type: boolean
+      # Manual override. The setup step's skip-container expression in this workflow
+      # also evaluates to true whenever category is 'code-review', regardless of this value.
+      skip-container-setup:
+        description: "Skip BC container setup (repository setup still runs)"
+        required: false
+        default: false
+        type: boolean
       repeat:
         description: "Number of times to run sequentially (ignored for test runs)"
         required: false
@@ -90,6 +96,8 @@ jobs:
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
           azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
+          dataset-path: ${{ inputs.category == 'code-review' && 'dataset/codereview.jsonl' || 'dataset/bcbench.jsonl' }}
+          skip-container: ${{ inputs.category == 'code-review' || inputs.skip-container-setup }}
 
       - name: Setup Python with UV
         uses: ./.github/actions/setup-python-uv
@@ -154,4 +162,4 @@ jobs:
       workflow-file: claude-evaluation.yml
       repeat: ${{ inputs.repeat }}
       workflow-inputs: |
-        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}
+        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}", "skip-container-setup": "${{ inputs.skip-container-setup }}"}
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -30,6 +30,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -40,6 +41,11 @@ on:
         required: false
         default: false
         type: boolean
+      # Manual override. The setup step's skip-container expression in this workflow
+      # also evaluates to true whenever category is 'code-review', regardless of this value.
+      skip-container-setup:
+        description: "Skip BC container setup (repository setup still runs)"
+        required: false
+        default: false
+        type: boolean
       repeat:
         description: "Number of times to run sequentially (ignored for test runs)"
         required: false
@@ -97,6 +103,8 @@ jobs:
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
           azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
+          dataset-path: ${{ inputs.category == 'code-review' && 'dataset/codereview.jsonl' || 'dataset/bcbench.jsonl' }}
+          skip-container: ${{ inputs.category == 'code-review' || inputs.skip-container-setup }}
 
       - name: Setup Python with UV
         uses: ./.github/actions/setup-python-uv
@@ -170,4 +178,4 @@ jobs:
       workflow-file: copilot-evaluation.yml
       repeat: ${{ inputs.repeat }}
       workflow-inputs: |
-        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}
+        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}", "skip-container-setup": "${{ inputs.skip-container-setup }}"}
diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
@@ -75,6 +75,14 @@ jobs:
           BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
           BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
         run: |
+          # Choose the evaluator list for this category. The expression can only expand to one of
+          # the three literals below, so the raw category input is never spliced into the shell
+          # script (avoids script injection through ${{ inputs.category }}).
+          # Falls back to the bug-fix evaluators for any category that is not listed.
+          EVALUATORS="${{ inputs.category == 'code-review' && 'precision_score,recall_score,f1_score,valid_review_output' || inputs.category == 'test-generation' && 'resolution_rate,build_rate,pre_patch_failed_rate,post_patch_passed_rate' || 'resolution_rate,build_rate' }}"
+
           # Get Azure DevOps access token from Azure CLI (uses the OIDC token from azure/login)
           ADO_TOKEN=$(az account get-access-token --resource 499b84ac-1321-427f-aa17-267ca6975798 --query accessToken -o tsv)
           echo "::add-mask::$ADO_TOKEN"
@@ -88,7 +96,7 @@ jobs:
           bceval metrics calculate \
             --input-file "${{ inputs.results-dir }}/${{ github.run_id }}/${{ env.BCEVAL_RESULT_FILE }}" \
             --evaluator-definitions "${{ github.workspace }}/evaluator/scores.py" \
-            --evaluators "resolution_rate,build_rate${{ inputs.category == 'test-generation' && ',pre_patch_failed_rate,post_patch_passed_rate' || '' }}" \
+            --evaluators "${EVALUATORS}" \
             --metric-definitions "${{ github.workspace }}/evaluator/metrics.py" \
             --metrics "bc_bench_metrics" \
             --eval-run-name "${{ inputs.agent }} (${{ inputs.model }}) - #${{ github.run_id }}" \

diff --git a/dataset/codereview.jsonl b/dataset/codereview.jsonl
diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json
@@ -0,0 +1,4 @@
+{
+    "runs": [],
+    "aggregate": []
+}
diff --git a/evaluator/scores.py b/evaluator/scores.py
@@ -19,3 +19,23 @@ def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
 class PostPatchPassedRate:
     def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
         return metadata.get("post_patch_passed", False)
+
+
+class PrecisionScore:
+    """Evaluator that reports the ``precision`` value stored in an entry's metadata."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        # A key that is present but null would make float() raise TypeError,
+        # so a None value is scored the same as a missing key.
+        value = metadata.get("precision")
+        return 0.0 if value is None else float(value)
+
+
+class RecallScore:
+    """Evaluator that reports the ``recall`` value stored in an entry's metadata."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        # A key that is present but null would make float() raise TypeError,
+        # so a None value is scored the same as a missing key.
+        value = metadata.get("recall")
+        return 0.0 if value is None else float(value)
+
+
+class F1Score:
+    """Evaluator that reports the ``f1`` value stored in an entry's metadata."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        # A key that is present but null would make float() raise TypeError,
+        # so a None value is scored the same as a missing key.
+        value = metadata.get("f1")
+        return 0.0 if value is None else float(value)
+
+
+class ValidReviewOutput:
+    """Evaluator that reports the truthiness of ``valid_review_output`` in an entry's metadata."""
+
+    def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
+        # A missing key counts as an invalid review output.
+        flag = metadata.get("valid_review_output", False)
+        return bool(flag)
diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
@@ -49,7 +49,9 @@ def run_copilot_agent(
     logger.info(f"Executing Copilot CLI in directory: {repo_path}")
     logger.debug(f"Using prompt:\n{prompt}")
 
-    copilot_cmd = shutil.which("copilot.cmd") or shutil.which("copilot")
+    # Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
+    # which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples).
+    copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot")
     if not copilot_cmd:
         raise AgentError("Copilot CLI not found in PATH. Please ensure it is installed and available.")
 

diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -54,6 +54,13 @@ prompt:
     {{task}}
     {% endif %}
 
+  # Prompt that invokes the al-code-review skill and asks the agent to write its
+  # findings as JSON to review.json in the repository root (empty list when none).
+  code-review-template: |
+    @al-code-review
+
+    Review the current branch changes and return findings using the al-code-review skill schema.
+    Save the findings JSON to a file named review.json in the repository root.
+    If there are no findings, write an empty findings list.
+
 # controls:
 # 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`
 #    - Copilot: copies to repo/.github/ and renames AGENTS.md to copilot-instructions.md
@@ -63,14 +70,14 @@ prompt:
 # NOTE: the canonical source file is AGENTS.md; it is automatically renamed
 #       to the agent-specific filename (AgentType.instruction_filename) during setup
 instructions:
+  # NOTE(review): default flips to true here and no per-category scoping is visible
+  # in this hunk — confirm bug-fix and test-generation runs should also get instructions.
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy skills from `src/bcbench/agent/shared/instructions/<sanitized-repo>/skills/`
 #    - Copilot: copies to repo/.github/skills/
 #    - Claude: copies to repo/.claude/skills/
 skills:
+  # NOTE(review): default flips to true here and no per-category scoping is visible
+  # in this hunk — confirm bug-fix and test-generation runs should also get skills copied.
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy custom agents from `src/bcbench/agent/shared/instructions/<sanitized-repo>/agents/`