From b39da5c2fc5d05311ad4f3ee9557d493924694f5 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Wed, 22 Apr 2026 14:23:33 +0200 Subject: [PATCH 01/17] Add macios-ci-postmortem skill for automated CI failure analysis New Copilot CLI skill that analyzes CI builds across recent PRs to identify failures unrelated to any specific PR: - Flaky tests (pass on rerun with same commit) - Shared regressions (same failure across multiple unrelated PRs) - Infrastructure issues (provisioning, timeouts, etc.) The skill operates in 4 phases: discovery, extraction, classification, and issue filing (with user confirmation before any GitHub issue changes). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 454 ++++++++++++++++++ .../references/azure-devops-cli.md | 95 ++++ 2 files changed, 549 insertions(+) create mode 100644 .agents/skills/macios-ci-postmortem/SKILL.md create mode 100644 .agents/skills/macios-ci-postmortem/references/azure-devops-cli.md diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md new file mode 100644 index 000000000000..2148c0de5e62 --- /dev/null +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -0,0 +1,454 @@ +--- +name: macios-ci-postmortem +description: Post-mortem analysis of CI failures across recent PRs in dotnet/macios. Identifies flaky tests, infrastructure issues, and shared regressions by analyzing builds from the last week. Files or updates GitHub issues for failures unrelated to any specific PR. Use when asked to "find flaky tests", "CI post-mortem", "what's been failing in CI", or "file issues for flaky failures". +--- + +# macios CI Post-Mortem + +Analyze CI failures across recent PRs to identify flaky tests, infrastructure issues, and shared regressions that are not caused by any specific PR. File or update GitHub issues for these. 
+ +## References + +Read these as needed during investigation: + +- `references/azure-devops-cli.md` — az CLI commands, artifact naming conventions, and JSON parsing caveats. + +## Overview + +This skill operates in four phases: + +1. **Discovery** — collect all recent PR-validation builds from AzDO +2. **Extraction** — for failed builds, extract normalized failure records +3. **Classification** — categorize failures as flaky, infrastructure, shared regression, or PR-specific +4. **Issue Actions** — propose GitHub issues, get user confirmation, then file/update + +## Phase 1: Discovery — Collect Recent Builds + +**Start from builds, not PRs.** This is faster, gives access to commit SHAs for rerun detection, and captures builds for PRs that may already be closed. + +### Step 1.1: List recent PR-validation builds + +Use the `az` CLI to get builds from the last 7 days. The macios CI runs on `devdiv.visualstudio.com/DevDiv`. + +```bash +# Get the date 7 days ago in ISO format +SINCE=$(python3 -c "from datetime import datetime, timedelta; print((datetime.utcnow() - timedelta(days=7)).strftime('%Y-%m-%dT%H:%M:%SZ'))") + +# List recent builds for the PR pipeline +az pipelines build list \ + --org https://devdiv.visualstudio.com \ + --project DevDiv \ + --reason pullRequest \ + --result failed \ + --top 200 \ + --query-order finishTimeDescending \ + -o json > /tmp/postmortem_builds.json +``` + +Also fetch partially succeeded builds (these contain test failures): + +```bash +az pipelines build list \ + --org https://devdiv.visualstudio.com \ + --project DevDiv \ + --reason pullRequest \ + --result partiallySucceeded \ + --top 200 \ + --query-order finishTimeDescending \ + -o json > /tmp/postmortem_builds_partial.json +``` + +### Step 1.2: Parse and filter builds + +```python +import json +from datetime import datetime, timedelta, timezone + +since = datetime.now(timezone.utc) - timedelta(days=7) + +def load_builds(path): + with open(path) as f: + content = f.read() + return 
json.JSONDecoder().raw_decode(content)[0] + +builds = load_builds('/tmp/postmortem_builds.json') + load_builds('/tmp/postmortem_builds_partial.json') + +# Filter to last 7 days and macios pipelines +recent = [] +for b in builds: + finish = b.get('finishTime', '') + if not finish: + continue + ft = datetime.fromisoformat(finish.replace('Z', '+00:00')) + if ft < since: + continue + # Only include macios pipelines + defn = b.get('definition', {}).get('name', '') + if 'macios' not in defn.lower() and 'xamarin-macios' not in defn.lower(): + continue + recent.append({ + 'id': b['id'], + 'result': b['result'], + 'pr': b.get('triggerInfo', {}).get('pr.number', ''), + 'sourceBranch': b.get('sourceBranch', ''), + 'sourceVersion': b.get('sourceVersion', ''), # commit SHA — critical for rerun detection + 'pipeline': defn, + 'finishTime': finish, + }) + +print(f"Found {len(recent)} builds from {len(set(b['pr'] for b in recent if b['pr']))} PRs") +``` + +### Step 1.3: Group builds for rerun detection + +Group by `(pr, pipeline, sourceVersion)`. Multiple builds with the same commit SHA for the same PR/pipeline are reruns. + +```python +from collections import defaultdict + +# Group: (pr, pipeline, commitSHA) -> [builds] +groups = defaultdict(list) +for b in recent: + key = (b['pr'], b['pipeline'], b['sourceVersion']) + groups[key].append(b) + +# Also group by just (pr, pipeline) to see if new commits fixed things +pr_pipeline = defaultdict(list) +for b in recent: + key = (b['pr'], b['pipeline']) + pr_pipeline[key].append(b) +``` + +## Phase 2: Extraction — Get Failure Details + +For each failed/partiallySucceeded build, extract failure information. Use a SQL database to track failures across builds. 
+ +### Step 2.1: Set up failure tracking + +```sql +CREATE TABLE IF NOT EXISTS ci_failures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + build_id INTEGER, + pr TEXT, + pipeline TEXT, + commit_sha TEXT, + finish_time TEXT, + job_name TEXT, + failure_type TEXT, -- 'TestFailure', 'BuildFailure', 'TimedOut', 'Crashed', 'Infrastructure' + test_fullname TEXT, -- e.g. 'MonoTouchFixtures.SomeTest.TestMethod' + platform TEXT, -- e.g. 'ios', 'tvos', 'macos', 'maccatalyst' + config TEXT, -- e.g. 'Debug (ARM64)', 'Release (x64)' + error_signature TEXT, -- normalized error message / top stack frame + raw_message TEXT +); +``` + +### Step 2.2: For each build, get the timeline and TestSummary artifacts + +Only process builds with failures. For efficiency, first check the timeline for failed jobs, then only download artifacts for those jobs. + +```bash +# Get timeline +az devops invoke --area build --resource timeline \ + --route-parameters project=DevDiv buildId= \ + --org https://devdiv.visualstudio.com -o json > /tmp/timeline_.json +``` + +Parse the timeline to find failed jobs: + +```python +import json + +with open(f'/tmp/timeline_{build_id}.json') as f: + data = json.JSONDecoder().raw_decode(f.read())[0] + +failed_jobs = [] +for r in data.get('records', []): + if r.get('type') == 'Job' and r.get('result') == 'failed': + failed_jobs.append({ + 'name': r['name'], + 'id': r['id'], + 'logId': r.get('log', {}).get('id'), + }) +``` + +### Step 2.3: Download and parse TestSummary artifacts + +For each failed job, download the TestSummary artifact: + +```bash +artifact="TestSummary-simulator_tests-1" +mkdir -p "/tmp/postmortem/${build_id}/${artifact}" +az pipelines runs artifact download \ + --artifact-name "$artifact" \ + --path "/tmp/postmortem/${build_id}/${artifact}" \ + --run-id \ + --org https://devdiv.visualstudio.com --project DevDiv +``` + +Parse the TestSummary.md for individual failures and insert into the SQL database. 
+ +### Step 2.4: For infrastructure/setup failures without TestSummary + +Check the timeline for failed tasks in setup/provisioning stages. Extract error info from task log lines: + +```bash +az devops invoke --area build --resource logs \ + --route-parameters project=DevDiv buildId= logId= \ + --org https://devdiv.visualstudio.com -o json > /tmp/log__.json +``` + +Search for infrastructure-related errors: +- "Provision" failures +- "Reserve bot" failures +- Network/timeout errors +- Xcode installation issues + +### Step 2.5: Normalize failure signatures + +Create a normalized signature for deduplication: + +```python +def normalize_signature(failure_type, test_fullname, error_msg, platform): + """Create a stable key for grouping the same logical failure.""" + if test_fullname: + # For test failures, the test name + platform is the key + return f"{failure_type}|{platform}|{test_fullname}" + elif error_msg: + # For build/infra failures, normalize the error message + # Strip file paths, line numbers, timestamps + import re + normalized = re.sub(r'/[^\s:]+/', '.../', error_msg) + normalized = re.sub(r'line \d+', 'line N', normalized) + normalized = re.sub(r'\d{4}-\d{2}-\d{2}T[\d:.]+Z?', 'TIMESTAMP', normalized) + return f"{failure_type}|{platform}|{normalized[:200]}" + return f"{failure_type}|{platform}|unknown" +``` + +## Phase 3: Classification + +Query the failure database to classify each unique failure. + +### Step 3.1: Identify flaky tests (same commit, different outcomes) + +A failure is **flaky** if the same PR + pipeline + commit SHA has both failing and succeeding builds, OR if a rerun of the exact same configuration passes. 
+ +```sql +-- Find failures where the same commit had a passing build too +-- (builds that aren't in our failure DB were successful) +SELECT DISTINCT error_signature, test_fullname, platform, + COUNT(DISTINCT build_id) as fail_count, + COUNT(DISTINCT pr) as pr_count, + GROUP_CONCAT(DISTINCT pr) as prs +FROM ci_failures +GROUP BY error_signature +HAVING COUNT(DISTINCT build_id) > 0; +``` + +Cross-reference with the build groups from Phase 1: if a `(pr, pipeline, commitSHA)` group has multiple builds and at least one succeeded (not in the failure DB), then failures in the failing builds for that group are flaky. + +### Step 3.2: Identify shared regressions (same failure across unrelated PRs) + +```sql +-- Failures appearing across 2+ unrelated PRs +SELECT error_signature, test_fullname, platform, failure_type, + COUNT(DISTINCT pr) as pr_count, + COUNT(DISTINCT build_id) as build_count, + GROUP_CONCAT(DISTINCT pr) as affected_prs +FROM ci_failures +WHERE pr != '' +GROUP BY error_signature +HAVING COUNT(DISTINCT pr) >= 2 +ORDER BY pr_count DESC; +``` + +If the failure is NOT also identified as flaky (i.e., it doesn't go away on rerun), classify it as a **shared regression**. 
+ +### Step 3.3: Identify infrastructure failures + +Look for patterns in failure_type and error messages: + +```sql +SELECT error_signature, failure_type, raw_message, + COUNT(DISTINCT build_id) as occurrences +FROM ci_failures +WHERE failure_type = 'Infrastructure' + OR raw_message LIKE '%provision%' + OR raw_message LIKE '%reserve bot%' + OR raw_message LIKE '%timeout%waiting%' + OR raw_message LIKE '%network%' + OR raw_message LIKE '%Could not find simulator%' +GROUP BY error_signature +ORDER BY occurrences DESC; +``` + +### Step 3.4: Exclude PR-specific failures + +A failure is PR-specific if: +- It appears in only 1 PR +- It persists across commits within that PR (not a rerun flake) +- It is consistent (never passes on rerun) + +These should be **excluded** from issue filing — they are the PR author's problem. + +### Step 3.5: Produce classification summary + +Create a summary table for user review: + +``` +| Category | Signature (truncated) | Test/Error | Platform | PRs Affected | Occurrences | +|--------------------|--------------------------------|---------------------|-------------|-------------- |-------------| +| Flaky | TestFailure|ios|Mono...Test | SomeTest.Method | ios | 5 | 8 | +| Shared Regression | BuildFailure|macos|error CS... | (build error) | macos | 3 | 3 | +| Infrastructure | Infrastructure|*|provision... 
| Bot provisioning | all | 4 | 4 | +``` + +## Phase 4: Issue Actions + +### Step 4.1: Search for existing issues + +For each classified failure, search for an existing GitHub issue: + +```bash +# Search by test name or error signature in issue title +gh issue list --repo dotnet/macios --state open \ + --search "" \ + --label "bug" --json number,title,labels,url +``` + +Also search closed issues (may need reopening): + +```bash +gh issue list --repo dotnet/macios --state closed \ + --search "" \ + --label "bug" --json number,title,labels,url +``` + +### Step 4.2: Propose actions to the user + +Present a list of proposed actions **before executing any**. Use `ask_user` to get confirmation. + +For each failure, propose one of: +- **Create new issue** — no existing issue found +- **Comment on existing issue** — matching open issue found, add recent occurrence data +- **Reopen issue** — matching closed issue found, failure has recurred +- **Skip** — user decides this isn't worth tracking + +Format the proposal clearly: + +``` +## Proposed Issue Actions + +### 1. Flaky: MonoTouchFixtures.NetworkTest.TestReachability (iOS) + - Seen in 5 PRs, 8 builds over the past week + - Disappears on rerun → flaky + - Existing issue: #12345 (open) — will add comment with recent data + - **Proposed action:** Comment on #12345 + +### 2. Shared Regression: error CS1234 in SomeFile.cs (macOS) + - Seen in 3 PRs, consistent (no rerun recovery) + - No existing issue found + - **Proposed action:** Create new issue + +### 3. Infrastructure: Bot provisioning timeout + - Seen in 4 builds across 4 PRs + - Existing issue: #11111 (closed) — last closed 2 months ago + - **Proposed action:** Reopen #11111 + +Proceed with these actions? 
[Confirm / Edit / Skip] +``` + +### Step 4.3: Execute confirmed actions + +#### Create new issue + +```bash +gh issue create --repo dotnet/macios \ + --title "[CI] Flaky: on " \ + --label "bug,CI,flaky-test" \ + --body "$(cat <<'EOF' +## Flaky Test Report (automated) + +**Test:** `` +**Platform:** +**Category:** Flaky / Shared Regression / Infrastructure +**Period:** to + +### Occurrence Summary + +| PR | Build | Commit | Date | Result | +|----|-------|--------|------|--------| +| # | []() | | | Failed | +| # | []() | | | Passed on rerun | + +**Total:** Failed in builds across PRs + +### Error Details + +``` + +``` + +### Classification + +This failure was identified as **flaky** because: +- It appeared across unrelated PRs +- It disappeared on rerun in cases + +--- +*This issue was automatically generated by CI post-mortem analysis.* +EOF +)" +``` + +Use the label `flaky-test` for flaky tests, `infrastructure` for infra issues, and `CI` for all. + +#### Comment on existing issue + +```bash +gh issue comment --repo dotnet/macios --body "$(cat <<'EOF' +## CI Post-Mortem Update () + +This failure was seen again in the past week: + +| PR | Build | Date | Outcome | +|----|-------|------|---------| +| # | | | Failed | +... + +Total: occurrences across PRs this week. +EOF +)" +``` + +#### Reopen closed issue + +```bash +gh issue reopen --repo dotnet/macios +gh issue comment --repo dotnet/macios --body "Reopening — this failure recurred in builds this week. See details below. +..." +``` + +## Important Notes + +### Efficiency + +- Process builds in batches. Don't download artifacts for every build — first check the timeline for failed jobs. +- Use the SQL database to accumulate results incrementally. You can query it between phases. +- Skip builds older than 7 days early in the pipeline. + +### Accuracy + +- **Rerun detection requires matching commit SHA.** A newer commit on the same PR that passes does NOT prove flakiness — the new commit may have fixed the issue. 
- **Verify the same job/config ran** before concluding a failure "went away." The test matrix can vary between runs.
- **Don't conflate platforms.** A test failing on iOS and macOS should be tracked separately unless the error signature is identical.

### Rate Limiting

- AzDO API calls are subject to rate limits. Add small delays between artifact downloads if processing many builds.
- `gh` CLI may also rate-limit. Batch issue searches where possible.

### Confirmation

- **Never file or modify issues without user confirmation.** Always present the classification summary and proposed actions first.
- Let the user edit the proposals (e.g., skip certain failures, change labels, adjust titles).
diff --git a/.agents/skills/macios-ci-postmortem/references/azure-devops-cli.md b/.agents/skills/macios-ci-postmortem/references/azure-devops-cli.md
new file mode 100644
index 000000000000..8468d08b5598
--- /dev/null
+++ b/.agents/skills/macios-ci-postmortem/references/azure-devops-cli.md
@@ -0,0 +1,95 @@
# Azure DevOps CLI Reference for macios CI

## Authentication

The `az devops` CLI must be authenticated. Typically this is done via:
```bash
az devops configure --defaults organization=https://devdiv.visualstudio.com project=DevDiv
```

Or by passing `--org` and `--project` on each command.

## Key Commands

### Build metadata
```bash
az pipelines build show --id <build_id> -o json
```
Returns: result, status, sourceBranch, definition, requestedFor, startTime, finishTime.

### Build timeline (jobs and tasks)
```bash
az devops invoke --area build --resource timeline \
  --route-parameters project=DevDiv buildId=<build_id> \
  --org https://devdiv.visualstudio.com -o json
```
Returns: records array with type (Stage/Job/Task), name, result, state, log.id, parentId.

**Important:** `az pipelines build log list` is NOT a valid command. Use the `az devops invoke` approach above. 
### Task logs
```bash
az devops invoke --area build --resource logs \
  --route-parameters project=DevDiv buildId=<build_id> logId=<log_id> \
  --org https://devdiv.visualstudio.com -o json
```
Returns: value array of log line strings.

### Artifact listing
```bash
az pipelines runs artifact list --run-id <build_id> -o json
```

### Artifact download
```bash
az pipelines runs artifact download \
  --artifact-name "<artifact name>" \
  --path /tmp/ci-artifacts/ \
  --run-id <build_id>
```

## Common Pipeline Names

- `xamarin-macios-sim-pr-tests` — PR validation with simulator tests
- Other pipeline names may vary; check `definition.name` from build show.

## Common Job Names in Timeline

- `T: monotouch_ios` — iOS monotouch tests
- `T: monotouch_tvos` — tvOS monotouch tests
- `macOS tests` — macOS and Mac Catalyst tests
- `Reserve macOS bot for tests` — bot provisioning
- Various build/packaging jobs

## JSON Parsing Caveat

`az devops invoke` output may include trailing non-JSON text. Always parse with:
```python
import json
with open('file.json', 'r') as f:
    content = f.read()
data = json.JSONDecoder().raw_decode(content)[0]
```

Do NOT use `json.loads(content)` directly — it will fail on the trailing text.

## Test Artifact Names

TestSummary and HtmlReport artifacts follow a naming convention:
- `TestSummary-simulator_tests-1` — Markdown summary with pass/fail counts and failure details
- `HtmlReport-simulator_tests-1` — ZIP containing HTML report and NUnit XML files

Common job names:
- `monotouch_ios`, `monotouch_tvos`, `monotouch_macos`, `monotouch_maccatalyst`
- `dotnettests_ios`, `dotnettests_tvos`, `dotnettests_macos`, `dotnettests_maccatalyst`
- `cecil`, `framework`, `xtro`, `msbuild`, `generator`, `sharpie`, `fsharp`, `linker`
- `introspection`, `xcframework`, `interdependent_binding_projects`

**Important:** Each artifact download overwrites `TestSummary.md` in the target directory. Always download to separate subdirectories named after the artifact. 
+ +## Key Investigation Strategy + +1. **Start with TestSummary artifacts** — they are the fastest way to identify what failed and why. Raw task logs are 40K+ lines and don't contain standard NUnit patterns inline. +2. **For test failures (not build failures)**, download HtmlReport artifacts and parse the NUnit XML files inside for exact test names, assertions, and stack traces. +3. **Only use raw task logs** when you need build error details (MSB/CS/NU errors) or infrastructure error context. +4. **Map timeline logIds to jobs** using the `parentId` field to trace task → job relationships. From 232ff7cb4f9a30a7f5d51c85049468edbf656b2b Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Wed, 22 Apr 2026 14:26:30 +0200 Subject: [PATCH 02/17] Use 'ci-postmortem' label for all issues filed by the skill Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 8 ++++---- external/Xamarin.MacDev | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index 2148c0de5e62..fd2fcf472afe 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -313,7 +313,7 @@ For each classified failure, search for an existing GitHub issue: # Search by test name or error signature in issue title gh issue list --repo dotnet/macios --state open \ --search "" \ - --label "bug" --json number,title,labels,url + --label "ci-postmortem" --json number,title,labels,url ``` Also search closed issues (may need reopening): @@ -321,7 +321,7 @@ Also search closed issues (may need reopening): ```bash gh issue list --repo dotnet/macios --state closed \ --search "" \ - --label "bug" --json number,title,labels,url + --label "ci-postmortem" --json number,title,labels,url ``` ### Step 4.2: Propose actions to the user @@ -365,7 +365,7 @@ Proceed with these actions? 
[Confirm / Edit / Skip] ```bash gh issue create --repo dotnet/macios \ --title "[CI] Flaky: on " \ - --label "bug,CI,flaky-test" \ + --label "bug,CI,ci-postmortem,flaky-test" \ --body "$(cat <<'EOF' ## Flaky Test Report (automated) @@ -401,7 +401,7 @@ EOF )" ``` -Use the label `flaky-test` for flaky tests, `infrastructure` for infra issues, and `CI` for all. +All issues **must** have the `ci-postmortem` label. Additionally use `flaky-test` for flaky tests and `infrastructure` for infra issues. #### Comment on existing issue diff --git a/external/Xamarin.MacDev b/external/Xamarin.MacDev index f1300986199f..5295c1f4fcad 160000 --- a/external/Xamarin.MacDev +++ b/external/Xamarin.MacDev @@ -1 +1 @@ -Subproject commit f1300986199f5489191d2c9712e57bf8a0a3d84a +Subproject commit 5295c1f4fcadb3216af0d903b3896dea15c2d37a From 72574d7569da332927b8335996d6830d5fafbc69 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Wed, 22 Apr 2026 22:19:08 +0200 Subject: [PATCH 03/17] Update CI postmortem skill with deep HTML report analysis Update SKILL.md to reflect lessons learned from running the skill: - Add steps for downloading and parsing HtmlReport artifacts - Add NUnit XML parsing for individual test failures - Add handling for crashes, build failures, and dotnettests - Fix --query-order flag (not supported by az pipelines build list) - Add HTML entity normalization for test name deduplication - Note performance concerns with large artifact downloads Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 91 +++++++++++++++++--- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index fd2fcf472afe..2a54964475d2 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -41,7 +41,6 @@ az pipelines build list \ --reason pullRequest \ --result failed \ --top 
200 \ - --query-order finishTimeDescending \ -o json > /tmp/postmortem_builds.json ``` @@ -54,7 +53,6 @@ az pipelines build list \ --reason pullRequest \ --result partiallySucceeded \ --top 200 \ - --query-order finishTimeDescending \ -o json > /tmp/postmortem_builds_partial.json ``` @@ -172,9 +170,9 @@ for r in data.get('records', []): }) ``` -### Step 2.3: Download and parse TestSummary artifacts +### Step 2.3: Download TestSummary artifacts (fast triage) -For each failed job, download the TestSummary artifact: +TestSummary artifacts are small and quick to download. Use them first to identify which jobs failed: ```bash artifact="TestSummary-simulator_tests-1" @@ -186,9 +184,78 @@ az pipelines runs artifact download \ --org https://devdiv.visualstudio.com --project DevDiv ``` -Parse the TestSummary.md for individual failures and insert into the SQL database. +Parse the TestSummary.md to determine which jobs have test failures. This is the first-pass filter. -### Step 2.4: For infrastructure/setup failures without TestSummary +### Step 2.4: Download HtmlReport artifacts (deep analysis) + +**This is the critical step.** Each test run produces an HtmlReport artifact containing: +- `tests/index.html` — Main report with all test configurations, pass/fail, inline failure details +- `tests///test--.xml` — NUnit XML with individual test-case results +- `tests///results-.xml` — NUnit results for dotnettests + +**Only download HtmlReport zips for jobs that failed** (use TestSummary for triage first): + +```bash +artifact="HtmlReport-simulator_tests-1" +az pipelines runs artifact download \ + --artifact-name "$artifact" \ + --path "/tmp/postmortem_deep/" \ + --run-id \ + --org https://devdiv.visualstudio.com --project DevDiv +``` + +**Warning:** HtmlReport zips are 60-140MB each. Downloading all of them is slow (1-2 min per artifact). Only download for jobs where TestSummary shows failures. 
+ +### Step 2.5: Parse NUnit XML for individual test failures + +Extract individual test failures from the NUnit XML files inside the HtmlReport zips: + +```python +import zipfile, xml.etree.ElementTree as ET, html + +def extract_failures_from_nunit_xml(xml_content): + """Parse NUnit XML to extract individual failing test cases.""" + root = ET.fromstring(xml_content) + failures = [] + for tc in root.iter('test-case'): + if tc.get('result') == 'Failed': + name = tc.get('fullname', 'Unknown') + msg_el = tc.find('.//failure/message') + stack_el = tc.find('.//failure/stack-trace') + failures.append({ + 'test': name, + 'message': msg_el.text if msg_el is not None else '', + 'stack': stack_el.text[:500] if stack_el is not None else '', + }) + return failures + +# Process a zip file +with zipfile.ZipFile('/tmp/postmortem_deep/html_BUILDID_JOB.zip') as zf: + for name in zf.namelist(): + if name.endswith('.xml') and 'test-' in name and '-clean' not in name: + xml_content = zf.read(name).decode('utf-8', errors='replace') + failures = extract_failures_from_nunit_xml(xml_content) +``` + +**Important**: Skip files ending in `-clean.xml` (these are filtered versions). The root XML tag is `TouchUnitTestRun` (not standard NUnit format, but `test-case` elements follow standard structure). + +For **dotnettests**, individual test failures are listed inline in `
<li>` tags in the HTML (not in separate XML). Parse these from `tests/index.html`:

```python
import re
# Pattern for inline test failures in dotnettests HTML
failures_in_html = re.findall(r'<li[^>]*>([^<]*(?:Failed|Error)[^<]*)</li>', html_content)
```

### Step 2.6: Handle crashes and build failures

When a test runner crashes (exit code 134, etc.) or a build fails before tests run, there will be **no NUnit XML results**. These appear in the HTML as:
- `Test run crashed (exit code: NNN)`
- `BuildFailure`

Capture these from the HTML and record them as separate failure types (CRASH, BUILD_FAILURE).

### Step 2.7: For infrastructure/setup failures without TestSummary

Check the timeline for failed tasks in setup/provisioning stages. Extract error info from task log lines:

```bash
az devops invoke --area build --resource logs \
  --route-parameters project=DevDiv buildId=<build_id> logId=<log_id> \
  --org https://devdiv.visualstudio.com -o json > /tmp/log_<build_id>_<log_id>.json
```

Search for infrastructure-related errors:
- "Provision" failures
- "Reserve bot" failures
- Network/timeout errors
- Xcode installation issues

-### Step 2.5: Normalize failure signatures
+### Step 2.8: Normalize failure signatures

-Create a normalized signature for deduplication:
+Create a normalized signature for deduplication. **Important:** HTML entities in test names (e.g., `&quot;` vs `"`) must be normalized to avoid duplicate entries:

```python
+import html as html_lib
+
 def normalize_signature(failure_type, test_fullname, error_msg, platform):
     """Create a stable key for grouping the same logical failure."""
+    # Normalize HTML entities
     if test_fullname:
-        # For test failures, the test name + platform is the key
+        test_fullname = html_lib.unescape(test_fullname)
         return f"{failure_type}|{platform}|{test_fullname}"
     elif error_msg:
-        # For build/infra failures, normalize the error message
-        # Strip file paths, line numbers, timestamps
+        error_msg = html_lib.unescape(error_msg)
         import re
         normalized = re.sub(r'/[^\s:]+/', '.../', error_msg)
         normalized = re.sub(r'line \d+', 'line N', normalized)
From 0282b6a09486c11d514f23e8300bb48b4975c8c7 Mon Sep 17 00:00:00 2001
From: Rolf Bjarne Kvinge
Date: Fri, 24 Apr 2026 10:26:12 +0200
Subject: [PATCH 04/17] Update CI postmortem skill: exclude AppSizeTest, one issue per test

- Exclude AppSizeTest from filing (expected to fail across PRs)
- Add rule: always file one issue per test, never group unrelated 
tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index 2a54964475d2..349deac2ce06 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -351,7 +351,10 @@ GROUP BY error_signature ORDER BY occurrences DESC; ``` -### Step 3.4: Exclude PR-specific failures +### Step 3.4: Exclude known-noisy and PR-specific failures + +**Always exclude these tests** — they are expected to fail across many PRs and are not actionable: +- `Xamarin.Tests.AppSizeTest.*` — sensitive to any API change, expected cross-PR failures A failure is PR-specific if: - It appears in only 1 PR @@ -360,6 +363,10 @@ A failure is PR-specific if: These should be **excluded** from issue filing — they are the PR author's problem. +### Step 3.5: File one issue per test + +**Always create separate issues for separate unit tests.** It is easier to merge issues than to split them up. Do not group multiple unrelated test failures into a single issue. + ### Step 3.5: Produce classification summary Create a summary table for user review: From ba7d6ddace68a170e22a7ef923f0d12bcc67361f Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Tue, 28 Apr 2026 08:05:29 +0200 Subject: [PATCH 05/17] Emphasize TestSummary triage before HtmlReport downloads The HtmlReport download step takes 96% of the total analysis time. Make it explicit that HtmlReports should only be downloaded for jobs where TestSummary confirms test failures, not for all failed jobs. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 21 +++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index 349deac2ce06..54b6662550a8 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -188,23 +188,34 @@ Parse the TestSummary.md to determine which jobs have test failures. This is the ### Step 2.4: Download HtmlReport artifacts (deep analysis) -**This is the critical step.** Each test run produces an HtmlReport artifact containing: +**This is the most time-consuming step — minimize downloads aggressively.** + +Each test run produces an HtmlReport artifact (60-140MB zip) containing: - `tests/index.html` — Main report with all test configurations, pass/fail, inline failure details - `tests///test--.xml` — NUnit XML with individual test-case results - `tests///results-.xml` — NUnit results for dotnettests -**Only download HtmlReport zips for jobs that failed** (use TestSummary for triage first): +**CRITICAL: Only download HtmlReport zips for jobs where TestSummary shows TEST failures (❌ markers).** Do NOT download HtmlReports for: +- Build failures (no test results exist — the build didn't get far enough to run tests) +- Infrastructure failures (bot provisioning, timeout, etc.) 
+- Jobs where TestSummary shows all tests passed (the job may have failed for other reasons) +To find exact artifact names, first list artifacts for the build: +```bash +az pipelines runs artifact list --run-id <build-id> \ + --org https://devdiv.visualstudio.com --project DevDiv -o json +``` + +Then download only matching HtmlReport artifacts for jobs with test failures: ```bash -artifact="HtmlReport-simulator_tests-1" az pipelines runs artifact download \ - --artifact-name "$artifact" \ + --artifact-name "HtmlReport-<job-name>-1" \ --path "/tmp/postmortem_deep/" \ --run-id <build-id> \ --org https://devdiv.visualstudio.com --project DevDiv ``` -**Warning:** HtmlReport zips are 60-140MB each. Downloading all of them is slow (1-2 min per artifact). Only download for jobs where TestSummary shows failures. +**Performance note:** Each download takes 1-3 minutes (sequential, no parallelism in az CLI). Downloading 500 artifacts would take 8-25 hours. By filtering with TestSummary first, you can typically reduce this to 50-100 artifacts. 
### Step 2.5: Parse NUnit XML for individual test failures From bbc298d8b5fb28aaf02857e136e2c08431a03623 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Tue, 28 Apr 2026 08:42:18 +0200 Subject: [PATCH 06/17] Add infrastructure and bot-specific failure analysis to CI postmortem skill - Extract workerName from timeline to correlate failures with bots - Identify bot-specific failures (disproportionate failure rates) - Detect cross-bot infrastructure patterns (timeouts, REST API, paths) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 39 +++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index 54b6662550a8..e79d8109aa3f 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -344,9 +344,38 @@ ORDER BY pr_count DESC; If the failure is NOT also identified as flaky (i.e., it doesn't go away on rerun), classify it as a **shared regression**. -### Step 3.3: Identify infrastructure failures +### Step 3.3: Identify infrastructure failures and bot-specific issues -Look for patterns in failure_type and error messages: +#### 3.3a: Extract worker/bot info from timelines + +The timeline records contain `workerName` for each Job. Extract this to correlate failures with specific bots: + +```python +for record in timeline['records']: + if record['type'] == 'Job': + worker = record.get('workerName', '') + # Windows bots: "VSM-XAM-126" (no dot suffix) + # macOS bots: "VSM-XAM-56.Sequoia.arm64", "VSCXSDKs-MINI-042.Tahoe.arm64" +``` + +#### 3.3b: Identify bot-specific failures + +Group failures by worker and compute failure rates. 
A bot is problematic if: +- It has a disproportionate failure rate compared to other bots running the same job type +- The same error message appears on the same bot across multiple unrelated PRs + +```python +# Example: if VSM-XAM-126 has 8/18 failed jobs (44%) while other bots average 5-10%, +# that bot has a specific problem worth filing an issue for. +``` + +#### 3.3c: Identify infrastructure failure patterns + +Also look for cross-bot patterns that affect many PRs: +- **Timeouts**: jobs that time out on multiple different bots across unrelated PRs +- **REST API failures**: `Intermittent failure attempting to call the restapis` across many PRs +- **Path errors**: `Path does not exist` (especially on Windows bots) +- **Provisioning failures**: `Reserve bot`, `provision` errors ```sql SELECT error_signature, failure_type, raw_message, @@ -355,9 +384,9 @@ FROM ci_failures WHERE failure_type = 'Infrastructure' OR raw_message LIKE '%provision%' OR raw_message LIKE '%reserve bot%' - OR raw_message LIKE '%timeout%waiting%' - OR raw_message LIKE '%network%' - OR raw_message LIKE '%Could not find simulator%' + OR raw_message LIKE '%timeout%' + OR raw_message LIKE '%Intermittent failure%' + OR raw_message LIKE '%Path does not exist%' GROUP BY error_signature ORDER BY occurrences DESC; ``` From 66a72d0aa81de6763ec5a91c9a4f981de606d88f Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 09:54:25 +0200 Subject: [PATCH 07/17] Add 'copilot' label requirement to CI postmortem issues Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index e79d8109aa3f..bcbc2146cd42 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -481,7 +481,7 @@ Proceed with these actions? 
[Confirm / Edit / Skip] ```bash gh issue create --repo dotnet/macios \ --title "[CI] Flaky: <test-name> on <platform>" \ - --label "bug,CI,ci-postmortem,flaky-test" \ + --label "bug,CI,ci-postmortem,copilot,flaky-test" \ --body "$(cat <<'EOF' ## Flaky Test Report (automated) @@ -517,7 +517,7 @@ EOF )" ``` -All issues **must** have the `ci-postmortem` label. Additionally use `flaky-test` for flaky tests and `infrastructure` for infra issues. +All issues **must** have the `ci-postmortem` and `copilot` labels. Additionally use `flaky-test` for flaky tests and `infrastructure` for infra issues. #### Comment on existing issue From fee85ef8f23bf1b35e1872ba40d4566f063ba4bb Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 09:59:09 +0200 Subject: [PATCH 08/17] Add weekly CI postmortem agentic workflow Runs every Sunday (fuzzy schedule) to analyze the past week's CI failures and file issues for flaky tests, infrastructure problems, and bot-specific issues. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/ci-postmortem.lock.yml | 1305 ++++++++++++++++++++++ .github/workflows/ci-postmortem.md | 57 + 2 files changed, 1362 insertions(+) create mode 100644 .github/workflows/ci-postmortem.lock.yml create mode 100644 .github/workflows/ci-postmortem.md diff --git a/.github/workflows/ci-postmortem.lock.yml b/.github/workflows/ci-postmortem.lock.yml new file mode 100644 index 000000000000..9d6b4a9a0a6e --- /dev/null +++ b/.github/workflows/ci-postmortem.lock.yml @@ -0,0 +1,1305 @@ +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"15354af11629eb0049ecb70f03b13ed2df90af330f1d6a40d7ce9a202538bb0b","compiler_version":"v0.71.1","strict":true,"agent_id":"copilot","agent_model":"claude-sonnet-4.5"} +# gh-aw-manifest: 
{"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/gh-aw-actions/setup","sha":"239aec45b78c8799417efdd5bc6d8cc036629ec1","version":"v0.71.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.28","digest":"sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.25.28@sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.28","digest":"sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.28@sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.28","digest":"sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.25.28@sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.0","digest":"sha256:9c2228324fb1f26f39dc9471612e530ae3efc3156dac05efb2e8d212878d454d","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.0@sha256:9c2228324fb1f26f39dc9471612e530ae3efc3156dac05efb2e8d212878d454d"},{"image":"ghcr.io/github/github-mcp-server:v1.0.2","digest":"sha256:26db03408086a99cf1916348dcc4f9614206658f9082a8060dc7c81ad787f4ba","pinned_image":"ghcr.io/github/github-mcp-se
rver:v1.0.2@sha256:26db03408086a99cf1916348dcc4f9614206658f9082a8060dc7c81ad787f4ba"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]} +# ___ _ _ +# / _ \ | | (_) +# | |_| | __ _ ___ _ __ | |_ _ ___ +# | _ |/ _` |/ _ \ '_ \| __| |/ __| +# | | | | (_| | __/ | | | |_| | (__ +# \_| |_/\__, |\___|_| |_|\__|_|\___| +# __/ | +# _ _ |___/ +# | | | | / _| | +# | | | | ___ _ __ _ __| |_| | _____ ____ +# | |/\| |/ _ \ '__| |/ /| _| |/ _ \ \ /\ / / ___| +# \ /\ / (_) | | | | ( | | | | (_) \ V V /\__ \ +# \/ \/ \___/|_| |_|\_\|_| |_|\___/ \_/\_/ |___/ +# +# This file was automatically generated by gh-aw (v0.71.1). DO NOT EDIT. +# +# To update this file, edit the corresponding .md file and run: +# gh aw compile +# Not all edits will cause changes to this file. +# +# For more information: https://github.github.com/gh-aw/introduction/overview/ +# +# +# Secrets used: +# - COPILOT_GITHUB_TOKEN +# - GH_AW_GITHUB_MCP_SERVER_TOKEN +# - GH_AW_GITHUB_TOKEN +# - GITHUB_TOKEN +# +# Custom actions used: +# - actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 +# - actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 +# - actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 +# - actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 +# - actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 +# - github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 +# +# Container images used: +# - ghcr.io/github/gh-aw-firewall/agent:0.25.28@sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a +# - ghcr.io/github/gh-aw-firewall/api-proxy:0.25.28@sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb +# - 
ghcr.io/github/gh-aw-firewall/squid:0.25.28@sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474 +# - ghcr.io/github/gh-aw-mcpg:v0.3.0@sha256:9c2228324fb1f26f39dc9471612e530ae3efc3156dac05efb2e8d212878d454d +# - ghcr.io/github/github-mcp-server:v1.0.2@sha256:26db03408086a99cf1916348dcc4f9614206658f9082a8060dc7c81ad787f4ba +# - node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f + +name: "CI Post-Mortem Analysis" +"on": + schedule: + - cron: "45 11 * * 0" + # Friendly format: weekly on sunday (scattered) + workflow_dispatch: + inputs: + aw_context: + default: "" + description: Agent caller context (used internally by Agentic Workflows). + required: false + type: string + +permissions: {} + +concurrency: + group: "gh-aw-${{ github.workflow }}" + +run-name: "CI Post-Mortem Analysis" + +jobs: + activation: + runs-on: ubuntu-slim + permissions: + actions: read + contents: read + outputs: + comment_id: "" + comment_repo: "" + engine_id: ${{ steps.generate_aw_info.outputs.engine_id }} + lockdown_check_failed: ${{ steps.generate_aw_info.outputs.lockdown_check_failed == 'true' }} + model: ${{ steps.generate_aw_info.outputs.model }} + secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }} + setup-trace-id: ${{ steps.setup.outputs.trace-id }} + stale_lock_file_failed: ${{ steps.check-lock-file.outputs.stale_lock_file_failed == 'true' }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + - name: Generate agentic run info + id: generate_aw_info + env: + GH_AW_INFO_ENGINE_ID: "copilot" + GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI" + GH_AW_INFO_MODEL: "claude-sonnet-4.5" + GH_AW_INFO_VERSION: "1.0.35" + GH_AW_INFO_AGENT_VERSION: "1.0.35" + GH_AW_INFO_CLI_VERSION: "v0.71.1" + GH_AW_INFO_WORKFLOW_NAME: "CI Post-Mortem Analysis" + 
GH_AW_INFO_EXPERIMENTAL: "false" + GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true" + GH_AW_INFO_STAGED: "false" + GH_AW_INFO_ALLOWED_DOMAINS: '["defaults","dotnet","github","aka.ms","dev.azure.com","devdiv.visualstudio.com","microsoft.com","vsassets.io"]' + GH_AW_INFO_FIREWALL_ENABLED: "true" + GH_AW_INFO_AWF_VERSION: "v0.25.28" + GH_AW_INFO_AWMG_VERSION: "" + GH_AW_INFO_FIREWALL_TYPE: "squid" + GH_AW_COMPILED_STRICT: "true" + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_aw_info.cjs'); + await main(core, context); + - name: Validate COPILOT_GITHUB_TOKEN secret + id: validate-secret + run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_multi_secret.sh" COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + - name: Checkout .github and .agents folders + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + sparse-checkout: | + .github + .agents + .claude + .codex + .crush + .gemini + .opencode + sparse-checkout-cone-mode: true + fetch-depth: 1 + - name: Save agent config folders for base branch restoration + env: + GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode" + GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md opencode.jsonc" + # poutine:ignore untrusted_checkout_exec + run: bash "${RUNNER_TEMP}/gh-aw/actions/save_base_github_folders.sh" + - name: Check workflow lock file + id: check-lock-file + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_WORKFLOW_FILE: "ci-postmortem.lock.yml" + GH_AW_CONTEXT_WORKFLOW_REF: "${{ github.workflow_ref }}" 
+ with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs'); + await main(); + - name: Check compile-agentic version + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_COMPILED_VERSION: "v0.71.1" + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/check_version_updates.cjs'); + await main(); + - name: Create prompt with built-in context + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + # poutine:ignore untrusted_checkout_exec + run: | + bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" + { + cat << 'GH_AW_PROMPT_956c986f1e45d6a0_EOF' + + GH_AW_PROMPT_956c986f1e45d6a0_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" + cat << 'GH_AW_PROMPT_956c986f1e45d6a0_EOF' + + Tools: add_comment(max:20), create_issue(max:20), update_issue(max:20), missing_tool, missing_data, noop + + + The following GitHub context information is 
available for this workflow: + {{#if __GH_AW_GITHUB_ACTOR__ }} + - **actor**: __GH_AW_GITHUB_ACTOR__ + {{/if}} + {{#if __GH_AW_GITHUB_REPOSITORY__ }} + - **repository**: __GH_AW_GITHUB_REPOSITORY__ + {{/if}} + {{#if __GH_AW_GITHUB_WORKSPACE__ }} + - **workspace**: __GH_AW_GITHUB_WORKSPACE__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }} + - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }} + - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }} + - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }} + - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__ + {{/if}} + {{#if __GH_AW_GITHUB_RUN_ID__ }} + - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__ + {{/if}} + + + GH_AW_PROMPT_956c986f1e45d6a0_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" + cat << 'GH_AW_PROMPT_956c986f1e45d6a0_EOF' + + {{#runtime-import .github/workflows/ci-postmortem.md}} + GH_AW_PROMPT_956c986f1e45d6a0_EOF + } > "$GH_AW_PROMPT" + - name: Interpolate variables and render templates + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs'); + await main(); + - name: Substitute placeholders + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + 
GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + + const substitutePlaceholders = require('${{ runner.temp }}/gh-aw/actions/substitute_placeholders.cjs'); + + // Call the substitution function + return await substitutePlaceholders({ + file: process.env.GH_AW_PROMPT, + substitutions: { + GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR, + GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID, + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER, + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER, + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER, + GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY, + GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID, + GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE + } + }); + - name: Validate prompt placeholders + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + # poutine:ignore untrusted_checkout_exec + run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_prompt_placeholders.sh" + - name: Print prompt + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + # poutine:ignore untrusted_checkout_exec + run: bash "${RUNNER_TEMP}/gh-aw/actions/print_prompt_summary.sh" + - name: Upload activation artifact + if: success() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: activation + path: | + /tmp/gh-aw/aw_info.json + /tmp/gh-aw/aw-prompts/prompt.txt + /tmp/gh-aw/github_rate_limits.jsonl + /tmp/gh-aw/base + 
if-no-files-found: ignore + retention-days: 1 + + agent: + needs: activation + runs-on: ubuntu-latest + permissions: + contents: read + issues: read + concurrency: + group: "gh-aw-copilot-${{ github.workflow }}" + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_AW_ASSETS_ALLOWED_EXTS: "" + GH_AW_ASSETS_BRANCH: "" + GH_AW_ASSETS_MAX_SIZE_KB: 0 + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + GH_AW_WORKFLOW_ID_SANITIZED: cipostmortem + outputs: + agentic_engine_timeout: ${{ steps.detect-copilot-errors.outputs.agentic_engine_timeout || 'false' }} + checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }} + effective_tokens: ${{ steps.parse-mcp-gateway.outputs.effective_tokens }} + has_patch: ${{ steps.collect_output.outputs.has_patch }} + inference_access_error: ${{ steps.detect-copilot-errors.outputs.inference_access_error || 'false' }} + mcp_policy_error: ${{ steps.detect-copilot-errors.outputs.mcp_policy_error || 'false' }} + model: ${{ needs.activation.outputs.model }} + model_not_supported_error: ${{ steps.detect-copilot-errors.outputs.model_not_supported_error || 'false' }} + output: ${{ steps.collect_output.outputs.output }} + output_types: ${{ steps.collect_output.outputs.output_types }} + setup-trace-id: ${{ steps.setup.outputs.trace-id }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + - name: Set runtime paths + id: set-runtime-paths + run: | + { + echo "GH_AW_SAFE_OUTPUTS=${RUNNER_TEMP}/gh-aw/safeoutputs/outputs.jsonl" + echo "GH_AW_SAFE_OUTPUTS_CONFIG_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" + echo "GH_AW_SAFE_OUTPUTS_TOOLS_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/tools.json" + } >> "$GITHUB_OUTPUT" + - name: Checkout repository + uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + - name: Create gh-aw temp directory + run: bash "${RUNNER_TEMP}/gh-aw/actions/create_gh_aw_tmp_dir.sh" + - name: Configure gh CLI for GitHub Enterprise + run: bash "${RUNNER_TEMP}/gh-aw/actions/configure_gh_for_ghe.sh" + env: + GH_TOKEN: ${{ github.token }} + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + GITHUB_TOKEN: ${{ github.token }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Checkout PR branch + id: checkout-pr + if: | + github.event.pull_request || github.event.issue.pull_request + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + with: + github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/checkout_pr_branch.cjs'); + await main(); + - name: Install GitHub Copilot CLI + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.35 + env: + GH_HOST: github.com + - name: Install AWF binary + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.28 + - name: Parse integrity filter lists + id: parse-guard-vars + env: + GH_AW_BLOCKED_USERS_VAR: 
${{ vars.GH_AW_GITHUB_BLOCKED_USERS || '' }} + GH_AW_TRUSTED_USERS_VAR: ${{ vars.GH_AW_GITHUB_TRUSTED_USERS || '' }} + GH_AW_APPROVAL_LABELS_VAR: ${{ vars.GH_AW_GITHUB_APPROVAL_LABELS || '' }} + run: bash "${RUNNER_TEMP}/gh-aw/actions/parse_guard_list.sh" + - name: Download activation artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: activation + path: /tmp/gh-aw + - name: Restore agent config folders from base branch + if: steps.checkout-pr.outcome == 'success' + env: + GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode" + GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md opencode.jsonc" + run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_base_github_folders.sh" + - name: Download container images + run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.28@sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a ghcr.io/github/gh-aw-firewall/api-proxy:0.25.28@sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb ghcr.io/github/gh-aw-firewall/squid:0.25.28@sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474 ghcr.io/github/gh-aw-mcpg:v0.3.0@sha256:9c2228324fb1f26f39dc9471612e530ae3efc3156dac05efb2e8d212878d454d ghcr.io/github/github-mcp-server:v1.0.2@sha256:26db03408086a99cf1916348dcc4f9614206658f9082a8060dc7c81ad787f4ba node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f + - name: Write Safe Outputs Config + run: | + mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" + mkdir -p /tmp/gh-aw/safeoutputs + mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_c2d474e65378a915_EOF' + 
{"add_comment":{"max":20},"create_issue":{"max":20},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{},"update_issue":{"allow_body":true,"max":20}} + GH_AW_SAFE_OUTPUTS_CONFIG_c2d474e65378a915_EOF + - name: Write Safe Outputs Tools + env: + GH_AW_TOOLS_META_JSON: | + { + "description_suffixes": { + "add_comment": " CONSTRAINTS: Maximum 20 comment(s) can be added. Supports reply_to_id for discussion threading.", + "create_issue": " CONSTRAINTS: Maximum 20 issue(s) can be created.", + "update_issue": " CONSTRAINTS: Maximum 20 issue(s) can be updated." + }, + "repo_params": {}, + "dynamic_tools": [] + } + GH_AW_VALIDATION_JSON: | + { + "add_comment": { + "defaultMax": 1, + "fields": { + "body": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "item_number": { + "issueOrPRNumber": true + }, + "reply_to_id": { + "type": "string", + "maxLength": 256 + }, + "repo": { + "type": "string", + "maxLength": 256 + } + } + }, + "create_issue": { + "defaultMax": 1, + "fields": { + "body": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "labels": { + "type": "array", + "itemType": "string", + "itemSanitize": true, + "itemMaxLength": 128 + }, + "parent": { + "issueOrPRNumber": true + }, + "repo": { + "type": "string", + "maxLength": 256 + }, + "temporary_id": { + "type": "string" + }, + "title": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "missing_data": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "context": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "data_type": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "reason": { + "type": "string", + "sanitize": true, + "maxLength": 256 + } + } + }, + "missing_tool": { + "defaultMax": 20, + "fields": { 
+ "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 512 + }, + "reason": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "tool": { + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "noop": { + "defaultMax": 1, + "fields": { + "message": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + } + } + }, + "report_incomplete": { + "defaultMax": 5, + "fields": { + "details": { + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "reason": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 1024 + } + } + }, + "update_issue": { + "defaultMax": 1, + "fields": { + "assignees": { + "type": "array", + "itemType": "string", + "itemSanitize": true, + "itemMaxLength": 39 + }, + "body": { + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "issue_number": { + "issueOrPRNumber": true + }, + "labels": { + "type": "array", + "itemType": "string", + "itemSanitize": true, + "itemMaxLength": 128 + }, + "milestone": { + "optionalPositiveInteger": true + }, + "operation": { + "type": "string", + "enum": [ + "replace", + "append", + "prepend", + "replace-island" + ] + }, + "repo": { + "type": "string", + "maxLength": 256 + }, + "status": { + "type": "string", + "enum": [ + "open", + "closed" + ] + }, + "title": { + "type": "string", + "sanitize": true, + "maxLength": 128 + } + }, + "customValidation": "requiresOneOf:status,title,body" + } + } + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_safe_outputs_tools.cjs'); + await main(); + - name: Generate Safe Outputs MCP Server Config + id: safe-outputs-config + run: | + # Generate a secure random API key 
(360 bits of entropy, 40+ chars) + # Mask immediately to prevent timing vulnerabilities + API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${API_KEY}" + + PORT=3001 + + # Set outputs for next steps + { + echo "safe_outputs_api_key=${API_KEY}" + echo "safe_outputs_port=${PORT}" + } >> "$GITHUB_OUTPUT" + + echo "Safe Outputs MCP server will run on port ${PORT}" + + - name: Start Safe Outputs MCP HTTP Server + id: safe-outputs-start + env: + DEBUG: '*' + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }} + GH_AW_SAFE_OUTPUTS_TOOLS_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/tools.json + GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + run: | + # Environment variables are set above to prevent template injection + export DEBUG + export GH_AW_SAFE_OUTPUTS + export GH_AW_SAFE_OUTPUTS_PORT + export GH_AW_SAFE_OUTPUTS_API_KEY + export GH_AW_SAFE_OUTPUTS_TOOLS_PATH + export GH_AW_SAFE_OUTPUTS_CONFIG_PATH + export GH_AW_MCP_LOG_DIR + + bash "${RUNNER_TEMP}/gh-aw/actions/start_safe_outputs_server.sh" + + - name: Start MCP Gateway + id: start-mcp-gateway + env: + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }} + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + run: | + set -eo pipefail + mkdir -p "${RUNNER_TEMP}/gh-aw/mcp-config" + + # Export gateway environment variables for MCP config and gateway script + export MCP_GATEWAY_PORT="8080" + export MCP_GATEWAY_DOMAIN="host.docker.internal" + MCP_GATEWAY_API_KEY=$(openssl 
rand -base64 45 | tr -d '/+=') + echo "::add-mask::${MCP_GATEWAY_API_KEY}" + export MCP_GATEWAY_API_KEY + export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads" + mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}" + export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288" + export DEBUG="*" + + export GH_AW_ENGINE="copilot" + MCP_GATEWAY_UID=$(id -u 2>/dev/null || echo '0') + MCP_GATEWAY_GID=$(id -g 2>/dev/null || echo '0') + DOCKER_SOCK_GID=$(stat -c '%g' /var/run/docker.sock 2>/dev/null || echo '0') + export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host --add-host host.docker.internal:127.0.0.1 --user '"${MCP_GATEWAY_UID}"':'"${MCP_GATEWAY_GID}"' --group-add '"${DOCKER_SOCK_GID}"' -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.3.0' + + mkdir -p /home/runner/.copilot + GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node) + cat << 
GH_AW_MCP_CONFIG_7bd5cd6513b45b21_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" + { + "mcpServers": { + "github": { + "type": "stdio", + "container": "ghcr.io/github/github-mcp-server:v1.0.2", + "env": { + "GITHUB_HOST": "\${GITHUB_SERVER_URL}", + "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}", + "GITHUB_READ_ONLY": "1", + "GITHUB_TOOLSETS": "issues,repos" + }, + "guard-policies": { + "allow-only": { + "approval-labels": ${{ steps.parse-guard-vars.outputs.approval_labels }}, + "blocked-users": ${{ steps.parse-guard-vars.outputs.blocked_users }}, + "min-integrity": "none", + "repos": "all", + "trusted-users": ${{ steps.parse-guard-vars.outputs.trusted_users }} + } + } + }, + "safeoutputs": { + "type": "http", + "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT", + "headers": { + "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}" + }, + "guard-policies": { + "write-sink": { + "accept": [ + "*" + ] + } + } + } + }, + "gateway": { + "port": $MCP_GATEWAY_PORT, + "domain": "${MCP_GATEWAY_DOMAIN}", + "apiKey": "${MCP_GATEWAY_API_KEY}", + "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" + } + } + GH_AW_MCP_CONFIG_7bd5cd6513b45b21_EOF + - name: Clean git credentials + continue-on-error: true + run: bash "${RUNNER_TEMP}/gh-aw/actions/clean_git_credentials.sh" + - name: Execute GitHub Copilot CLI + id: agentic_execution + # Copilot CLI tool arguments (sorted): + timeout-minutes: 20 + run: | + set -o pipefail + touch /tmp/gh-aw/agent-step-summary.md + GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true) + export GH_AW_NODE_BIN + (umask 177 && touch /tmp/gh-aw/agent-stdio.log) + # shellcheck disable=SC1003 + sudo -E awf --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --exclude-env GITHUB_MCP_SERVER_TOKEN --exclude-env MCP_GATEWAY_API_KEY --allow-domains 
'*.githubusercontent.com,*.vsblob.vsassets.io,aka.ms,api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.nuget.org,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,azuresearch-usnc.nuget.org,azuresearch-ussc.nuget.org,builds.dotnet.microsoft.com,ci.dot.net,codeload.github.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,dc.services.visualstudio.com,dev.azure.com,devdiv.visualstudio.com,dist.nuget.org,docs.github.com,dot.net,dotnet.microsoft.com,dotnetcli.blob.core.windows.net,github-cloud.githubusercontent.com,github-cloud.s3.amazonaws.com,github.blog,github.com,github.githubassets.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,lfs.github.com,microsoft.com,nuget.org,nuget.pkg.github.com,nugetregistryv2prod.blob.core.windows.net,objects.githubusercontent.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,oneocsp.microsoft.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,pkgs.dev.azure.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,vsassets.io,www.googleapis.com,www.microsoft.com' --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --image-tag 
0.25.28,squid=sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474,agent=sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a,api-proxy=sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb,cli-proxy=sha256:fdf310e4678ce58d248c466b89399e9680a3003038fd19322c388559016aaac7 --skip-pull --enable-api-proxy \ + -- /bin/bash -c 'GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_driver.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --allow-all-paths --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log + env: + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_API_KEY: dummy-byok-key-for-offline-mode + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: claude-sonnet-4.5 + GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json + GH_AW_PHASE: agent + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_VERSION: v0.71.1 + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_AW: true + GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md + GITHUB_WORKSPACE: ${{ github.workspace }} + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_AUTHOR_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: 
github-actions[bot] + XDG_CONFIG_HOME: /home/runner + - name: Detect Copilot errors + id: detect-copilot-errors + if: always() + continue-on-error: true + run: node "${RUNNER_TEMP}/gh-aw/actions/detect_copilot_errors.cjs" + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + GITHUB_TOKEN: ${{ github.token }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Copy Copilot session state files to logs + if: always() + continue-on-error: true + run: bash "${RUNNER_TEMP}/gh-aw/actions/copy_copilot_session_state.sh" + - name: Stop MCP Gateway + if: always() + continue-on-error: true + env: + MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }} + MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }} + GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }} + run: | + bash "${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh" "$GATEWAY_PID" + - name: Redact secrets in logs + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/redact_secrets.cjs'); + await main(); + env: + GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN' + SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ 
secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Append agent step summary + if: always() + run: bash "${RUNNER_TEMP}/gh-aw/actions/append_agent_step_summary.sh" + - name: Copy Safe Outputs + if: always() + env: + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + run: | + mkdir -p /tmp/gh-aw + cp "$GH_AW_SAFE_OUTPUTS" /tmp/gh-aw/safeoutputs.jsonl 2>/dev/null || true + - name: Ingest agent output + id: collect_output + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_ALLOWED_DOMAINS: "*.githubusercontent.com,*.vsblob.vsassets.io,aka.ms,api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.nuget.org,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,azuresearch-usnc.nuget.org,azuresearch-ussc.nuget.org,builds.dotnet.microsoft.com,ci.dot.net,codeload.github.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,dc.services.visualstudio.com,dev.azure.com,devdiv.visualstudio.com,dist.nuget.org,docs.github.com,dot.net,dotnet.microsoft.com,dotnetcli.blob.core.windows.net,github-cloud.githubusercontent.com,github-cloud.s3.amazonaws.com,github.blog,github.com,github.githubassets.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,lfs.github.com,microsoft.com,nuget.org,nuget.pkg.github.com,nugetregistryv2prod.blob.core.windows.net,objects.githubusercontent.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,oneocsp.microsoft.com,packagecloud.io,packages.
cloud.google.com,packages.microsoft.com,pkgs.dev.azure.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,vsassets.io,www.googleapis.com,www.microsoft.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/collect_ndjson_output.cjs'); + await main(); + - name: Parse agent logs for step summary + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/ + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_copilot_log.cjs'); + await main(); + - name: Parse MCP Gateway logs for step summary + if: always() + id: parse-mcp-gateway + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_mcp_gateway_log.cjs'); + await main(); + - name: Print firewall logs + if: always() + continue-on-error: true + env: + AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs + run: | + # Fix permissions on firewall logs/audit dirs so they can be uploaded as artifacts + # AWF runs with sudo, creating files owned by root + sudo chmod -R a+r /tmp/gh-aw/sandbox/firewall 2>/dev/null || true + # Only run awf logs summary if awf command exists (it may not be installed if 
workflow failed before install step) + if command -v awf &> /dev/null; then + awf logs summary | tee -a "$GITHUB_STEP_SUMMARY" + else + echo 'AWF binary not installed, skipping firewall log summary' + fi + - name: Parse token usage for step summary + if: always() + continue-on-error: true + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_token_usage.cjs'); + await main(); + - name: Write agent output placeholder if missing + if: always() + run: | + if [ ! -f /tmp/gh-aw/agent_output.json ]; then + echo '{"items":[]}' > /tmp/gh-aw/agent_output.json + fi + - name: Upload agent artifacts + if: always() + continue-on-error: true + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: agent + path: | + /tmp/gh-aw/aw-prompts/prompt.txt + /tmp/gh-aw/sandbox/agent/logs/ + /tmp/gh-aw/redacted-urls.log + /tmp/gh-aw/mcp-logs/ + /tmp/gh-aw/proxy-logs/ + !/tmp/gh-aw/proxy-logs/proxy-tls/ + /tmp/gh-aw/agent_usage.json + /tmp/gh-aw/agent-stdio.log + /tmp/gh-aw/agent/ + /tmp/gh-aw/github_rate_limits.jsonl + /tmp/gh-aw/safeoutputs.jsonl + /tmp/gh-aw/agent_output.json + /tmp/gh-aw/aw-*.patch + /tmp/gh-aw/aw-*.bundle + /tmp/gh-aw/sandbox/firewall/logs/ + /tmp/gh-aw/sandbox/firewall/audit/ + if-no-files-found: ignore + + conclusion: + needs: + - activation + - agent + - detection + - safe_outputs + if: > + always() && (needs.agent.result != 'skipped' || needs.activation.outputs.lockdown_check_failed == 'true' || + needs.activation.outputs.stale_lock_file_failed == 'true') + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + pull-requests: write + concurrency: + group: "gh-aw-conclusion-ci-postmortem" + cancel-in-progress: false + outputs: + 
incomplete_count: ${{ steps.report_incomplete.outputs.incomplete_count }} + noop_message: ${{ steps.noop.outputs.noop_message }} + tools_reported: ${{ steps.missing_tool.outputs.tools_reported }} + total_count: ${{ steps.missing_tool.outputs.total_count }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + id: setup-agent-output-env + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" + - name: Process no-op messages + id: noop + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_NOOP_MAX: "1" + GH_AW_WORKFLOW_NAME: "CI Post-Mortem Analysis" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_NOOP_REPORT_AS_ISSUE: "true" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_noop_message.cjs'); + await main(); + - name: Log detection run + id: detection_runs + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + 
GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "CI Post-Mortem Analysis" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }} + GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }} + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_detection_runs.cjs'); + await main(); + - name: Record missing tool + id: missing_tool + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_MISSING_TOOL_CREATE_ISSUE: "true" + GH_AW_WORKFLOW_NAME: "CI Post-Mortem Analysis" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/missing_tool.cjs'); + await main(); + - name: Record incomplete + id: report_incomplete + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_REPORT_INCOMPLETE_CREATE_ISSUE: "true" + GH_AW_WORKFLOW_NAME: "CI Post-Mortem Analysis" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp 
}}/gh-aw/actions/report_incomplete_handler.cjs'); + await main(); + - name: Handle agent failure + id: handle_agent_failure + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "CI Post-Mortem Analysis" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_WORKFLOW_ID: "ci-postmortem" + GH_AW_ACTION_FAILURE_ISSUE_EXPIRES_HOURS: "168" + GH_AW_ENGINE_ID: "copilot" + GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }} + GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }} + GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }} + GH_AW_MCP_POLICY_ERROR: ${{ needs.agent.outputs.mcp_policy_error }} + GH_AW_AGENTIC_ENGINE_TIMEOUT: ${{ needs.agent.outputs.agentic_engine_timeout }} + GH_AW_MODEL_NOT_SUPPORTED_ERROR: ${{ needs.agent.outputs.model_not_supported_error }} + GH_AW_LOCKDOWN_CHECK_FAILED: ${{ needs.activation.outputs.lockdown_check_failed }} + GH_AW_STALE_LOCK_FILE_FAILED: ${{ needs.activation.outputs.stale_lock_file_failed }} + GH_AW_GROUP_REPORTS: "false" + GH_AW_FAILURE_REPORT_AS_ISSUE: "true" + GH_AW_TIMEOUT_MINUTES: "20" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_agent_failure.cjs'); + await main(); + + detection: + needs: + - activation + - agent + if: > + always() && needs.agent.result != 'skipped' && (needs.agent.outputs.output_types != '' || needs.agent.outputs.has_patch == 'true') + runs-on: ubuntu-latest + permissions: + contents: read + outputs: 
+ detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }} + detection_reason: ${{ steps.detection_conclusion.outputs.reason }} + detection_success: ${{ steps.detection_conclusion.outputs.success }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + id: setup-agent-output-env + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" + - name: Checkout repository for patch context + if: needs.agent.outputs.has_patch == 'true' + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + # --- Threat Detection --- + - name: Clean stale firewall files from agent artifact + run: | + rm -rf /tmp/gh-aw/sandbox/firewall/logs + rm -rf /tmp/gh-aw/sandbox/firewall/audit + - name: Download container images + run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.28@sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a ghcr.io/github/gh-aw-firewall/api-proxy:0.25.28@sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb ghcr.io/github/gh-aw-firewall/squid:0.25.28@sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474 + - name: Check if detection needed + id: detection_guard + if: always() + env: + OUTPUT_TYPES: ${{ needs.agent.outputs.output_types }} + HAS_PATCH: ${{ 
needs.agent.outputs.has_patch }} + run: | + if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then + echo "run_detection=true" >> "$GITHUB_OUTPUT" + echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH" + else + echo "run_detection=false" >> "$GITHUB_OUTPUT" + echo "Detection skipped: no agent outputs or patches to analyze" + fi + - name: Clear MCP configuration for detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + rm -f "${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json" + rm -f /home/runner/.copilot/mcp-config.json + rm -f "$GITHUB_WORKSPACE/.gemini/settings.json" + - name: Prepare threat detection files + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection/aw-prompts + cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true + cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true + for f in /tmp/gh-aw/aw-*.patch; do + [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true + done + for f in /tmp/gh-aw/aw-*.bundle; do + [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true + done + echo "Prepared threat detection files:" + ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true + - name: Setup threat detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + WORKFLOW_NAME: "CI Post-Mortem Analysis" + WORKFLOW_DESCRIPTION: "No description provided" + HAS_PATCH: ${{ needs.agent.outputs.has_patch }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/setup_threat_detection.cjs'); + await main(); + - name: Ensure 
threat-detection directory and log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection + touch /tmp/gh-aw/threat-detection/detection.log + - name: Setup Node.js + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: '24' + package-manager-cache: false + - name: Install GitHub Copilot CLI + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.35 + env: + GH_HOST: github.com + - name: Install AWF binary + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.28 + - name: Execute GitHub Copilot CLI + if: always() && steps.detection_guard.outputs.run_detection == 'true' + id: detection_agentic_execution + # Copilot CLI tool arguments (sorted): + timeout-minutes: 20 + run: | + set -o pipefail + touch /tmp/gh-aw/agent-step-summary.md + GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true) + export GH_AW_NODE_BIN + (umask 177 && touch /tmp/gh-aw/threat-detection/detection.log) + # shellcheck disable=SC1003 + sudo -E awf --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --allow-domains api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,github.com,host.docker.internal,telemetry.enterprise.githubcopilot.com --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --image-tag 0.25.28,squid=sha256:844c18280f82cd1b06345eb2f4e91966b34185bfc51c9f237c3e022e848fb474,agent=sha256:a8834e285807654bf680154faa710d43fe4365a0868142f5c20e48c85e137a7a,api-proxy=sha256:93290f2393752252911bd7c39a047f776c0b53063575e7bde4e304962a9a61cb,cli-proxy=sha256:fdf310e4678ce58d248c466b89399e9680a3003038fd19322c388559016aaac7 
--skip-pull --enable-api-proxy \ + -- /bin/bash -c 'GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_driver.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log + env: + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_API_KEY: dummy-byok-key-for-offline-mode + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: claude-sonnet-4.5 + GH_AW_PHASE: detection + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_VERSION: v0.71.1 + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_AW: true + GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md + GITHUB_WORKSPACE: ${{ github.workspace }} + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_AUTHOR_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + XDG_CONFIG_HOME: /home/runner + - name: Upload threat detection log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: detection + path: /tmp/gh-aw/threat-detection/detection.log + if-no-files-found: ignore + - name: Parse and conclude threat detection + id: detection_conclusion + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }} + 
GH_AW_DETECTION_CONTINUE_ON_ERROR: "true" + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_threat_detection_results.cjs'); + await main(); + + safe_outputs: + needs: + - activation + - agent + - detection + if: (!cancelled()) && needs.agent.result != 'skipped' && needs.detection.result == 'success' + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + pull-requests: write + timeout-minutes: 15 + env: + GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/ci-postmortem" + GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }} + GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }} + GH_AW_EFFECTIVE_TOKENS: ${{ needs.agent.outputs.effective_tokens }} + GH_AW_ENGINE_ID: "copilot" + GH_AW_ENGINE_MODEL: "claude-sonnet-4.5" + GH_AW_ENGINE_VERSION: "1.0.35" + GH_AW_WORKFLOW_ID: "ci-postmortem" + GH_AW_WORKFLOW_NAME: "CI Post-Mortem Analysis" + outputs: + code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }} + code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }} + comment_id: ${{ steps.process_safe_outputs.outputs.comment_id }} + comment_url: ${{ steps.process_safe_outputs.outputs.comment_url }} + create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }} + create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }} + created_issue_number: ${{ steps.process_safe_outputs.outputs.created_issue_number }} + created_issue_url: ${{ steps.process_safe_outputs.outputs.created_issue_url }} + process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }} + process_safe_outputs_temporary_id_map: ${{ 
steps.process_safe_outputs.outputs.temporary_id_map }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@239aec45b78c8799417efdd5bc6d8cc036629ec1 # v0.71.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + id: setup-agent-output-env + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" + - name: Configure GH_HOST for enterprise compatibility + id: ghes-host-config + shell: bash + run: | + # Derive GH_HOST from GITHUB_SERVER_URL so the gh CLI targets the correct + # GitHub instance (GHES/GHEC). On github.com this is a harmless no-op. 
+ GH_HOST="${GITHUB_SERVER_URL#https://}" + GH_HOST="${GH_HOST#http://}" + echo "GH_HOST=${GH_HOST}" >> "$GITHUB_ENV" + - name: Process Safe Outputs + id: process_safe_outputs + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_ALLOWED_DOMAINS: "*.githubusercontent.com,*.vsblob.vsassets.io,aka.ms,api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.nuget.org,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,azuresearch-usnc.nuget.org,azuresearch-ussc.nuget.org,builds.dotnet.microsoft.com,ci.dot.net,codeload.github.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,dc.services.visualstudio.com,dev.azure.com,devdiv.visualstudio.com,dist.nuget.org,docs.github.com,dot.net,dotnet.microsoft.com,dotnetcli.blob.core.windows.net,github-cloud.githubusercontent.com,github-cloud.s3.amazonaws.com,github.blog,github.com,github.githubassets.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,lfs.github.com,microsoft.com,nuget.org,nuget.pkg.github.com,nugetregistryv2prod.blob.core.windows.net,objects.githubusercontent.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,oneocsp.microsoft.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,pkgs.dev.azure.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,vsassets.io,www.googleapis.com,www.microsoft.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + 
GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"add_comment\":{\"max\":20},\"create_issue\":{\"max\":20},\"create_report_incomplete_issue\":{},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"true\"},\"report_incomplete\":{},\"update_issue\":{\"allow_body\":true,\"max\":20}}" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/safe_output_handler_manager.cjs'); + await main(); + - name: Upload Safe Outputs Items + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: safe-outputs-items + path: | + /tmp/gh-aw/safe-output-items.jsonl + /tmp/gh-aw/temporary-id-map.json + if-no-files-found: ignore + diff --git a/.github/workflows/ci-postmortem.md b/.github/workflows/ci-postmortem.md new file mode 100644 index 000000000000..64065525ca6e --- /dev/null +++ b/.github/workflows/ci-postmortem.md @@ -0,0 +1,57 @@ +--- +on: + schedule: + - cron: "weekly on sunday" + workflow_dispatch: +permissions: + contents: read + issues: read +engine: + id: copilot + model: claude-sonnet-4.5 +network: + allowed: + - defaults + - dotnet + - github + - "aka.ms" + - "dev.azure.com" + - "devdiv.visualstudio.com" + - "microsoft.com" + - "vsassets.io" +tools: + github: + toolsets: [issues, repos] + min-integrity: none +safe-outputs: + create-issue: + max: 20 + add-comment: + max: 20 + update-issue: + max: 20 +--- + +# CI Post-Mortem Analysis + +Perform a weekly post-mortem analysis of CI failures across recent PRs in dotnet/macios to identify flaky tests, infrastructure issues, and shared regressions that are not caused by any specific PR. + +## Instructions + +1. 
Read the skill definition from `.agents/skills/macios-ci-postmortem/SKILL.md` — this contains the full 4-phase workflow. +2. Read the Azure DevOps CLI reference from `.agents/skills/macios-ci-postmortem/references/azure-devops-cli.md`. +3. Execute all four phases of the workflow: + - **Phase 1: Discovery** — collect all PR-validation builds from the last 7 days + - **Phase 2: Extraction** — download TestSummary artifacts for triage, then HtmlReport artifacts only for jobs with test failures, and parse NUnit XML for individual test-level failures + - **Phase 3: Classification** — categorize failures as flaky (cross-PR or rerun-recovered), infrastructure (bot-specific or cross-bot), or PR-specific (exclude these). Also exclude `AppSizeTest` failures. + - **Phase 4: Issue Actions** — search for existing `ci-postmortem` issues, then file new issues or comment on existing ones +4. All issues must have the `ci-postmortem` and `copilot` labels. +5. File one issue per distinct test failure — do not group unrelated test failures together. +6. For infrastructure issues, check if failures are concentrated on specific bots by extracting `workerName` from build timelines. + +## Constraints + +- Only file issues for failures that appear across 2+ unrelated PRs, or that are confirmed flaky by rerun recovery (same commit, different outcome). +- Never file issues for PR-specific failures — those are the PR author's responsibility. +- Always search for existing `ci-postmortem` issues before creating new ones. Comment on existing issues if the failure is already tracked. +- Always exclude `AppSizeTest` failures — they are expected to fail across PRs. 
From 3fbf470ee58c74aac91b1461d8e6e8e498584b79 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 10:11:56 +0200 Subject: [PATCH 09/17] Revert Xamarin.MacDev and dependency bumps to match main Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- eng/Version.Details.props | 10 ++-- eng/Version.Details.xml | 20 +++---- external/Xamarin.MacDev | 2 +- msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs | 57 +------------------ .../Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs | 11 +--- 5 files changed, 21 insertions(+), 79 deletions(-) diff --git a/eng/Version.Details.props b/eng/Version.Details.props index a447028e53ff..29cc2d2400dd 100644 --- a/eng/Version.Details.props +++ b/eng/Version.Details.props @@ -19,18 +19,18 @@ This file should be imported by eng/Versions.props 26.0.11017 18.5.9227 - 26.4.9015 + 26.4.9013 26.0.11017 18.5.9227 - 26.4.9015 + 26.4.9013 26.0.11017 15.5.9227 - 26.4.9015 + 26.4.9013 26.0.11017 18.5.9227 - 26.4.9015 + 26.4.9013 - 11.0.0-prerelease.26224.1 + 11.0.0-prerelease.26217.1 18.0.9617 18.0.9617 diff --git a/eng/Version.Details.xml b/eng/Version.Details.xml index 176e4e77385e..8c06ab691cbd 100644 --- a/eng/Version.Details.xml +++ b/eng/Version.Details.xml @@ -60,21 +60,21 @@ 797d30720e5e629d23eb146935da94cb1b61047e - + https://github.com/dotnet/macios - ac80159dab3bdd969c7e38fceb02499d3be92ac4 + 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 - + https://github.com/dotnet/macios - ac80159dab3bdd969c7e38fceb02499d3be92ac4 + 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 - + https://github.com/dotnet/macios - ac80159dab3bdd969c7e38fceb02499d3be92ac4 + 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 - + https://github.com/dotnet/macios - ac80159dab3bdd969c7e38fceb02499d3be92ac4 + 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 @@ -107,9 +107,9 @@ https://github.com/dotnet/dotnet e43cbe04901ea4cf359ed0883b0533abab224ba2 - + https://github.com/dotnet/xharness - 888ef3e553a0716745ecab689e13b816639b5a5a + 866707736d49c2323628744716cda2475b3af9ee 
https://github.com/dotnet/dotnet diff --git a/external/Xamarin.MacDev b/external/Xamarin.MacDev index 5295c1f4fcad..f1300986199f 160000 --- a/external/Xamarin.MacDev +++ b/external/Xamarin.MacDev @@ -1 +1 @@ -Subproject commit 5295c1f4fcadb3216af0d903b3896dea15c2d37a +Subproject commit f1300986199f5489191d2c9712e57bf8a0a3d84a diff --git a/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs b/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs index fec1b1aae419..2a3bb34e96bd 100644 --- a/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs +++ b/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs @@ -32,51 +32,10 @@ public class ILLink : global::ILLink.Tasks.ILLink, ITaskCallback, IHasSessionId [Output] public ITaskItem [] LinkedItems { get; set; } = Array.Empty (); - // if the linked output should be copied to windows (as opposed to only creating empty output files) - public bool CopyToWindows { get; set; } - - ITaskItem []? linkerCacheItemsToCopyToWindows; - ITaskItem [] LinkerCacheItemsToCopyToWindows { - get { - if (!CopyToWindows) - return []; - - // We might get called before LinkerCacheItems has been populated, in which case we don't want to cache any results. - if (LinkerCacheItems.Length == 0) - return []; - - if (linkerCacheItemsToCopyToWindows is null) { - linkerCacheItemsToCopyToWindows = LinkerCacheItems.Where (item => { - var extension = item.GetMetadata ("Extension"); - switch (extension.ToLowerInvariant ()) { - case ".h": - case ".m": - case ".mm": - return false; // we don't need any native code on Windows. 
- default: - return true; // copy the rest of the files to Windows - } - }).ToArray (); - } - return linkerCacheItemsToCopyToWindows; - } - } - public override bool Execute () { - if (this.ShouldExecuteRemotely (SessionId)) { - if (XamarinTask.ExecuteRemotely (this, out var taskRunner)) { - if (CopyToWindows) { - var filesToCopy = new List (); - filesToCopy.AddRange (LinkedItems); - filesToCopy.AddRange (LinkerCacheItemsToCopyToWindows); - XamarinTask.CopyFilesToWindowsAsync (this, taskRunner, filesToCopy).Wait (); - } - return true; - } - - return false; - } + if (this.ShouldExecuteRemotely (SessionId)) + return XamarinTask.ExecuteRemotely (this); // Capture execution start time for Mac-side detection var executionStartTime = DateTime.UtcNow; @@ -127,18 +86,6 @@ ITaskItem [] GetAllFilesWithMetadata (string directory, DateTime executionStartT public bool ShouldCreateOutputFile (ITaskItem item) { - if (CopyToWindows) { - if (Array.IndexOf (LinkedItems, item) >= 0) { - Log.LogMessage (MessageImportance.Low, "Not creating output file '{0}' because the entire file will be copied to Windows", item.ItemSpec); - return false; - } - - if (Array.IndexOf (LinkerCacheItemsToCopyToWindows, item) >= 0) { - Log.LogMessage (MessageImportance.Low, "Not creating output file '{0}' because the entire file will be copied to Windows (because it's not native code)", item.ItemSpec); - return false; - } - } - var modifiedMetadata = item.GetMetadata ("Modified"); var wasModified = bool.TryParse (modifiedMetadata, out var modified) && modified; diff --git a/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs b/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs index 603f85252b07..f25ef2fd388c 100644 --- a/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs +++ b/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs @@ -317,19 +317,14 @@ protected internal static IEnumerable CreateItemsForAllFilesRecursive return CreateItemsForAllFilesRecursively (directories?.Select (v => v.ItemSpec)); } - internal 
static async global::System.Threading.Tasks.Task CopyFilesToWindowsAsync (Task task, TaskRunner runner, IEnumerable items) + internal async global::System.Threading.Tasks.Task CopyFilesToWindowsAsync (TaskRunner runner, IEnumerable items) { foreach (var item in items) { - task.Log.LogMessage (MessageImportance.Low, $"Copying {item.ItemSpec} from the remote Mac to Windows"); - await runner.GetFileAsync (task, item.ItemSpec).ConfigureAwait (false); + Log.LogMessage (MessageImportance.Low, $"Copying {item.ItemSpec} from the remote Mac to Windows"); + await runner.GetFileAsync (this, item.ItemSpec).ConfigureAwait (false); } } - internal global::System.Threading.Tasks.Task CopyFilesToWindowsAsync (TaskRunner runner, IEnumerable items) - { - return CopyFilesToWindowsAsync (this, runner, items); - } - /// /// Computes the executable to launch given the specified tool in the Xcode's toolchain: /// * If is specified, return that. From aa9e596eb0b24aa113be060d666f4700029e7b08 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 10:12:38 +0200 Subject: [PATCH 10/17] Fix: align dependency files with origin/main Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- eng/Version.Details.props | 10 ++-- eng/Version.Details.xml | 20 +++---- msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs | 57 ++++++++++++++++++- .../Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs | 11 +++- 4 files changed, 78 insertions(+), 20 deletions(-) diff --git a/eng/Version.Details.props b/eng/Version.Details.props index 29cc2d2400dd..a447028e53ff 100644 --- a/eng/Version.Details.props +++ b/eng/Version.Details.props @@ -19,18 +19,18 @@ This file should be imported by eng/Versions.props 26.0.11017 18.5.9227 - 26.4.9013 + 26.4.9015 26.0.11017 18.5.9227 - 26.4.9013 + 26.4.9015 26.0.11017 15.5.9227 - 26.4.9013 + 26.4.9015 26.0.11017 18.5.9227 - 26.4.9013 + 26.4.9015 - 11.0.0-prerelease.26217.1 + 11.0.0-prerelease.26224.1 18.0.9617 18.0.9617 diff --git a/eng/Version.Details.xml 
b/eng/Version.Details.xml index 8c06ab691cbd..176e4e77385e 100644 --- a/eng/Version.Details.xml +++ b/eng/Version.Details.xml @@ -60,21 +60,21 @@ 797d30720e5e629d23eb146935da94cb1b61047e - + https://github.com/dotnet/macios - 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 + ac80159dab3bdd969c7e38fceb02499d3be92ac4 - + https://github.com/dotnet/macios - 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 + ac80159dab3bdd969c7e38fceb02499d3be92ac4 - + https://github.com/dotnet/macios - 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 + ac80159dab3bdd969c7e38fceb02499d3be92ac4 - + https://github.com/dotnet/macios - 996ec2eecca8f9b580e6f5e0d5c9dee82d40eb11 + ac80159dab3bdd969c7e38fceb02499d3be92ac4 @@ -107,9 +107,9 @@ https://github.com/dotnet/dotnet e43cbe04901ea4cf359ed0883b0533abab224ba2 - + https://github.com/dotnet/xharness - 866707736d49c2323628744716cda2475b3af9ee + 888ef3e553a0716745ecab689e13b816639b5a5a https://github.com/dotnet/dotnet diff --git a/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs b/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs index 2a3bb34e96bd..fec1b1aae419 100644 --- a/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs +++ b/msbuild/Xamarin.MacDev.Tasks/Tasks/ILLink.cs @@ -32,10 +32,51 @@ public class ILLink : global::ILLink.Tasks.ILLink, ITaskCallback, IHasSessionId [Output] public ITaskItem [] LinkedItems { get; set; } = Array.Empty (); + // if the linked output should be copied to windows (as opposed to only creating empty output files) + public bool CopyToWindows { get; set; } + + ITaskItem []? linkerCacheItemsToCopyToWindows; + ITaskItem [] LinkerCacheItemsToCopyToWindows { + get { + if (!CopyToWindows) + return []; + + // We might get called before LinkerCacheItems has been populated, in which case we don't want to cache any results. 
+ if (LinkerCacheItems.Length == 0) + return []; + + if (linkerCacheItemsToCopyToWindows is null) { + linkerCacheItemsToCopyToWindows = LinkerCacheItems.Where (item => { + var extension = item.GetMetadata ("Extension"); + switch (extension.ToLowerInvariant ()) { + case ".h": + case ".m": + case ".mm": + return false; // we don't need any native code on Windows. + default: + return true; // copy the rest of the files to Windows + } + }).ToArray (); + } + return linkerCacheItemsToCopyToWindows; + } + } + public override bool Execute () { - if (this.ShouldExecuteRemotely (SessionId)) - return XamarinTask.ExecuteRemotely (this); + if (this.ShouldExecuteRemotely (SessionId)) { + if (XamarinTask.ExecuteRemotely (this, out var taskRunner)) { + if (CopyToWindows) { + var filesToCopy = new List (); + filesToCopy.AddRange (LinkedItems); + filesToCopy.AddRange (LinkerCacheItemsToCopyToWindows); + XamarinTask.CopyFilesToWindowsAsync (this, taskRunner, filesToCopy).Wait (); + } + return true; + } + + return false; + } // Capture execution start time for Mac-side detection var executionStartTime = DateTime.UtcNow; @@ -86,6 +127,18 @@ ITaskItem [] GetAllFilesWithMetadata (string directory, DateTime executionStartT public bool ShouldCreateOutputFile (ITaskItem item) { + if (CopyToWindows) { + if (Array.IndexOf (LinkedItems, item) >= 0) { + Log.LogMessage (MessageImportance.Low, "Not creating output file '{0}' because the entire file will be copied to Windows", item.ItemSpec); + return false; + } + + if (Array.IndexOf (LinkerCacheItemsToCopyToWindows, item) >= 0) { + Log.LogMessage (MessageImportance.Low, "Not creating output file '{0}' because the entire file will be copied to Windows (because it's not native code)", item.ItemSpec); + return false; + } + } + var modifiedMetadata = item.GetMetadata ("Modified"); var wasModified = bool.TryParse (modifiedMetadata, out var modified) && modified; diff --git a/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs 
b/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs index f25ef2fd388c..603f85252b07 100644 --- a/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs +++ b/msbuild/Xamarin.MacDev.Tasks/Tasks/XamarinTask.cs @@ -317,14 +317,19 @@ protected internal static IEnumerable CreateItemsForAllFilesRecursive return CreateItemsForAllFilesRecursively (directories?.Select (v => v.ItemSpec)); } - internal async global::System.Threading.Tasks.Task CopyFilesToWindowsAsync (TaskRunner runner, IEnumerable items) + internal static async global::System.Threading.Tasks.Task CopyFilesToWindowsAsync (Task task, TaskRunner runner, IEnumerable items) { foreach (var item in items) { - Log.LogMessage (MessageImportance.Low, $"Copying {item.ItemSpec} from the remote Mac to Windows"); - await runner.GetFileAsync (this, item.ItemSpec).ConfigureAwait (false); + task.Log.LogMessage (MessageImportance.Low, $"Copying {item.ItemSpec} from the remote Mac to Windows"); + await runner.GetFileAsync (task, item.ItemSpec).ConfigureAwait (false); } } + internal global::System.Threading.Tasks.Task CopyFilesToWindowsAsync (TaskRunner runner, IEnumerable items) + { + return CopyFilesToWindowsAsync (this, runner, items); + } + /// /// Computes the executable to launch given the specified tool in the Xcode's toolchain: /// * If is specified, return that. 
From bee0a3cc3d5aa1a3aa3f813e7e5e9eac4af9cf60 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 16:20:29 +0200 Subject: [PATCH 11/17] Add reopen policy rules for closed postmortem issues - Don't reopen fix-closed issues less than 2 weeks old - Require failing builds from main branch - Allow reopen for lack-of-info or debug-instrumentation closures - Always allow commenting on closed issues with explanation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 25 ++++++++++++++++++-- external/Xamarin.MacDev | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index bcbc2146cd42..3792d8611d68 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -440,14 +440,35 @@ gh issue list --repo dotnet/macios --state closed \ --label "ci-postmortem" --json number,title,labels,url ``` -### Step 4.2: Propose actions to the user +### Step 4.2: Decide whether to reopen closed issues + +When a matching **closed** issue is found, apply these rules to decide whether to reopen it: + +1. **Check the close reason.** Read the issue body/comments to determine *why* it was closed: + - **Fix merged** — a code change was merged to fix the problem. + - **Lack of information** — closed because there wasn't enough data to act on. + - **Debug instrumentation merged** — a PR was merged to gather more diagnostic info. + +2. **If closed because a fix was merged:** + - **Do NOT reopen if the issue was closed less than 2 weeks ago.** The failing builds in the analysis window likely predate the fix. Comment on the closed issue with the analysis results and note why it's not being reopened. + - **Do NOT reopen unless the new failing build is from the `main` branch** (or targets `main` via a PR that incorporates the fix commit). 
Builds from older branches or PRs that branched before the fix don't count. + - **After 2 weeks**, if the failure is still appearing in builds that incorporate the fix, reopen the issue. + +3. **If closed for lack of information:** reopen if the new analysis provides that missing information. + +4. **If closed because debug instrumentation was merged:** reopen if any of the failing builds provide the additional diagnostic data that was being collected. + +5. **Always OK to comment** on a closed issue with analysis data, even if not reopening. Include a note explaining why the issue is not being reopened (e.g., "Not reopening — the fix in #NNNN was merged on DATE, and all failing builds predate that fix."). + +### Step 4.3: Propose actions to the user Present a list of proposed actions **before executing any**. Use `ask_user` to get confirmation. For each failure, propose one of: - **Create new issue** — no existing issue found - **Comment on existing issue** — matching open issue found, add recent occurrence data -- **Reopen issue** — matching closed issue found, failure has recurred +- **Reopen issue** — matching closed issue found, failure confirmed post-fix (see Step 4.2) +- **Comment on closed issue (no reopen)** — matching closed issue found, but reopen criteria not met - **Skip** — user decides this isn't worth tracking Format the proposal clearly: diff --git a/external/Xamarin.MacDev b/external/Xamarin.MacDev index f1300986199f..5295c1f4fcad 160000 --- a/external/Xamarin.MacDev +++ b/external/Xamarin.MacDev @@ -1 +1 @@ -Subproject commit f1300986199f5489191d2c9712e57bf8a0a3d84a +Subproject commit 5295c1f4fcadb3216af0d903b3896dea15c2d37a From bb28093fe287104e5bab95cb11d76f24b1c7956a Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 16:23:27 +0200 Subject: [PATCH 12/17] Require specific error messages in postmortem issues Include actual compiler/linker/assertion errors from NUnit XML. 
Flag when different PRs show different errors for the same test (likely different root causes). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index 3792d8611d68..ae9f05a0a15f 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -522,10 +522,28 @@ gh issue create --repo dotnet/macios \ ### Error Details +Include the **specific error messages** from the NUnit XML failure messages. If the failure is a build error, include the actual compiler/linker error codes and messages. If different PRs/builds show different error messages for the same test, list them separately — they may be different root causes. + +``` + + + +``` + +If different builds have different errors for the same test, show each variant: + +**Variant A** (builds ): ``` - + ``` +**Variant B** (builds ): +``` + +``` + +**Important:** If different PRs show different error messages for the "same" test failure, they are likely **different root causes** and should be investigated separately. Consider splitting into separate issues or noting that the grouping may be incorrect. + ### Classification This failure was identified as **flaky** because: From 03ae5f6e1fb0d37b570d330a360a62dc91fbf605 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 16:27:03 +0200 Subject: [PATCH 13/17] Require deep links to specific job/step in postmortem issues Use AzDO URL format with j= and t= parameters from timeline record IDs to link directly to the failing log, not just the build. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index ae9f05a0a15f..f678f7e41630 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -513,13 +513,17 @@ gh issue create --repo dotnet/macios \ ### Occurrence Summary -| PR | Build | Commit | Date | Result | -|----|-------|--------|------|--------| -| # | []() | | | Failed | -| # | []() | | | Passed on rerun | +| PR | Build | Bot | Direct Link | +|----|-------|-----|-------------| +| # | | | []() | **Total:** Failed in builds across PRs +**Deep links:** Always link to the specific job and step/task, not just the build. Use the AzDO URL format: +`https://devdiv.visualstudio.com/DevDiv/_build/results?buildId=BUILD_ID&view=logs&j=JOB_RECORD_ID&t=TASK_RECORD_ID` + +The `j=` (job) and `t=` (task) parameters are the `id` fields from the timeline records. This takes the reader directly to the failing log rather than requiring them to click through multiple jobs. + ### Error Details Include the **specific error messages** from the NUnit XML failure messages. If the failure is a build error, include the actual compiler/linker error codes and messages. If different PRs/builds show different error messages for the same test, list them separately — they may be different root causes. From 7302eb5fd9f0a5dd3bbc1c286d378350ae842358 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 16:31:22 +0200 Subject: [PATCH 14/17] Distinguish symptoms from root causes in infra analysis 'Path does not exist' on artifact publish is a downstream symptom of earlier failures (Install dotnet workloads, azdev-secrets, etc). Always trace back to the first failed task in the timeline. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index f678f7e41630..c390889b48ed 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -374,8 +374,16 @@ Group failures by worker and compute failure rates. A bot is problematic if: Also look for cross-bot patterns that affect many PRs: - **Timeouts**: jobs that time out on multiple different bots across unrelated PRs - **REST API failures**: `Intermittent failure attempting to call the restapis` across many PRs -- **Path errors**: `Path does not exist` (especially on Windows bots) - **Provisioning failures**: `Reserve bot`, `provision` errors +- **Workload install failures**: `Install dotnet workloads` failing + +**IMPORTANT: Distinguish symptoms from root causes.** Some pipeline steps have `continueOnError: true`, which means their errors are logged as warnings but do not cause the job to fail. For example, `Publish Artifact: TestSummary` and `Publish Artifact: HtmlReport` often report `Path does not exist` — but this is because an **earlier step** failed before the tests could run and produce those artifacts. Always trace back to the **first failing task** in the job's timeline to find the actual root cause. + +To find the actual root cause in a failed job: +1. List all Task records under the job sorted by execution order +2. Find tasks with `result == 'failed'` (not `succeededWithIssues`) +3. The earliest `failed` task is typically the root cause +4. 
Tasks with `succeededWithIssues` that report `Path does not exist` are downstream symptoms ```sql SELECT error_signature, failure_type, raw_message, From 7876e144ba8d148bc0e1e1bd77030f5c0a1576d1 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 17:17:32 +0200 Subject: [PATCH 15/17] Strengthen cascading failure guidance in postmortem skill Emphasize that only the first failed step (without continueOnError) is the root cause. All subsequent failures are cascading and must not be reported as separate issues. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index c390889b48ed..ce6cd8c61503 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -377,13 +377,17 @@ Also look for cross-bot patterns that affect many PRs: - **Provisioning failures**: `Reserve bot`, `provision` errors - **Workload install failures**: `Install dotnet workloads` failing -**IMPORTANT: Distinguish symptoms from root causes.** Some pipeline steps have `continueOnError: true`, which means their errors are logged as warnings but do not cause the job to fail. For example, `Publish Artifact: TestSummary` and `Publish Artifact: HtmlReport` often report `Path does not exist` — but this is because an **earlier step** failed before the tests could run and produce those artifacts. Always trace back to the **first failing task** in the job's timeline to find the actual root cause. +**CRITICAL: Always identify the FIRST failed step as the root cause.** In any failed job, only the first step with `result == 'failed'` (and without `continueOnError: true`) is the root cause. All subsequent failures in the same job are cascading effects and must NOT be reported as separate issues. 
Common cascading patterns: +- `Publish Artifact: TestSummary/HtmlReport` → reports `Path does not exist` because tests never ran +- `Prepare tests results and Html Report` → fails because earlier steps didn't produce results +- Any step after a failed `Checkout`, `Verify ssh connection`, `Download secrets`, or `Install dotnet workloads` To find the actual root cause in a failed job: 1. List all Task records under the job sorted by execution order -2. Find tasks with `result == 'failed'` (not `succeededWithIssues`) -3. The earliest `failed` task is typically the root cause -4. Tasks with `succeededWithIssues` that report `Path does not exist` are downstream symptoms +2. Find the **first** task with `result == 'failed'` +3. Verify this task does NOT have `continueOnError: true` — if it does, skip it and check the next failed task +4. That task is the root cause; all later failures in the same job are cascading +5. **Never file an issue for a cascading failure** — always file for the root cause step ```sql SELECT error_signature, failure_type, raw_message, From 4d15697caf67cc555cf5eb420087eb5d03035e64 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 4 May 2026 17:35:27 +0200 Subject: [PATCH 16/17] Revert submodule bump. 
--- external/Xamarin.MacDev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/Xamarin.MacDev b/external/Xamarin.MacDev index 5295c1f4fcad..f1300986199f 160000 --- a/external/Xamarin.MacDev +++ b/external/Xamarin.MacDev @@ -1 +1 @@ -Subproject commit 5295c1f4fcadb3216af0d903b3896dea15c2d37a +Subproject commit f1300986199f5489191d2c9712e57bf8a0a3d84a From 0ffa72ff671669a0c3ac525fe8f0fe13cb1c2ba1 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Thu, 7 May 2026 22:36:37 +0200 Subject: [PATCH 17/17] Add Windows integration stage macOS bot tracking rule For failures in the Windows integration stage, always identify the macOS bot from the 'Reserve macOS bot for tests' job, even when the failure is on a Windows bot (e.g. ssh connection failures). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .agents/skills/macios-ci-postmortem/SKILL.md | 26 +++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/.agents/skills/macios-ci-postmortem/SKILL.md b/.agents/skills/macios-ci-postmortem/SKILL.md index ce6cd8c61503..8d1b18e71c68 100644 --- a/.agents/skills/macios-ci-postmortem/SKILL.md +++ b/.agents/skills/macios-ci-postmortem/SKILL.md @@ -369,7 +369,31 @@ Group failures by worker and compute failure rates. A bot is problematic if: # that bot has a specific problem worth filing an issue for. ``` -#### 3.3c: Identify infrastructure failure patterns +#### 3.3c: Windows integration stage — identify the macOS bot + +The 'Windows integration' stage has three jobs that work together: +1. **Reserve macOS bot for tests** — reserves a macOS bot and records its name +2. **Dotnet tests** — runs on a Windows bot, connecting to the reserved macOS bot via ssh +3. **Re-enable macOS bot for tests** — releases the macOS bot + +If **any** job in this stage fails, always extract the macOS bot name from the 'Reserve macOS bot for tests' job's `workerName` and include it in the issue. 
This is critical because: +- A 'Verify ssh connection' failure on the Windows bot is really a problem with the **macOS bot** it's trying to reach +- A 'Download secrets' failure on the macOS bot is specific to that bot +- Correlating the macOS bot name across issues reveals patterns (e.g., VSM-XAM-13 having persistent problems) + +```python +# For any failure in the Windows integration stage: +# 1. Find the 'Reserve macOS bot for tests' job in the timeline +# 2. Extract its workerName — this is the macOS bot +# 3. Include "macOS bot: " in the issue, even if the +# failure is in the 'Dotnet tests' job running on a Windows bot +for record in timeline['records']: + if record['type'] == 'Job' and 'Reserve' in record.get('name', '') and 'macOS' in record.get('name', ''): + macos_bot = record.get('workerName', 'unknown') + break +``` + +#### 3.3d: Identify infrastructure failure patterns Also look for cross-bot patterns that affect many PRs: - **Timeouts**: jobs that time out on multiple different bots across unrelated PRs