Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/workflows/rocm-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,13 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
rm -f FAIL_*
rm -rf test-results && mkdir -p test-results

docker exec \
-e TEST_SGPU=1 \
-e TEST_LEVEL=${{ env.TEST_LEVEL }} \
-e JUNITXML_PREFIX=/workspace/test-results/ \
-e JUNITXML_SUFFIX=.xml \
-e HF_TOKEN="$HF_TOKEN" \
te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
Expand Down Expand Up @@ -270,6 +273,13 @@ jobs:
EOF
)"

- name: Generate test report
if: always()
run: |
command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
python3 ci/junit_report.py test-results \
--title "sGPU Tests (${{ matrix.arch_label }})"

- name: Check suite failure status
if: always()
run: |
Expand Down Expand Up @@ -299,6 +309,7 @@ jobs:
name: logs-sgpu-${{ matrix.arch_label }}
path: |
*.log
test-results/*.xml
if-no-files-found: ignore
retention-days: 5

Expand Down Expand Up @@ -387,13 +398,17 @@ jobs:
*) echo "::error::Unknown framework: ${{ matrix.framework }}"; exit 1 ;;
esac

rm -rf test-results && mkdir -p test-results

docker exec \
-e TEST_MGPU=1 \
-e TEST_LEVEL=${{ env.TEST_LEVEL }} \
-e TEST_SCRIPT=$TEST_SCRIPT \
-e LOG_FILE=$LOG_FILE \
-e SUITE_NAME=$SUITE_NAME \
-e NVTE_FRAMEWORK=${{ matrix.framework }} \
-e JUNITXML_PREFIX=/workspace/test-results/ \
-e JUNITXML_SUFFIX=.xml \
-e HF_TOKEN="$HF_TOKEN" \
te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
Expand All @@ -414,13 +429,21 @@ jobs:
EOF
)"

- name: Generate test report
if: always()
run: |
command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
python3 ci/junit_report.py test-results \
--title "mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }})"

- name: Upload logs
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }}
path: |
*.log
test-results/*.xml
if-no-files-found: ignore
retention-days: 5

Expand Down
24 changes: 23 additions & 1 deletion ci/_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,17 @@ TEST_START_TS=`date +%s`
#To disable some logs trimming
export CI=1

# Crash/hang visibility and bounding:
#  - PYTHONFAULTHANDLER dumps a Python traceback on fatal signals (segfaults).
#    It is always forced on here (exported unconditionally).
#  - PYTEST_TIMEOUT bounds every individual test item so a single hang cannot
#    stall the whole CI job until the job-level timeout hours later.
#  - NOTE(review): per the pytest-timeout documentation the 'thread' method
#    dumps all thread stacks and then terminates the whole pytest process, so
#    the JUnit XML of the file being run may not be written in that case --
#    confirm before relying on the timed-out test appearing in the XML.
# The *_TIMEOUT* settings below keep any value already set in the environment.
export PYTHONFAULTHANDLER=1
# Quoted so the expansion result is never word-split or globbed (SC2223).
: "${PYTEST_TIMEOUT:=1200}"              # per-test (per-parametrization) timeout, seconds
: "${PYTEST_TIMEOUT_METHOD:=thread}"     # 'thread' reliably unsticks GPU/collective hangs
: "${CTEST_TIMEOUT:=1200}"               # per-cpp-test timeout, seconds

_script_error_count=0
_run_error_count=0
_ignored_error_count=0
Expand Down Expand Up @@ -213,6 +224,12 @@ get_pytest_junitxml() {
fi
}

# Print the ctest option that writes a JUnit XML report for the suite named $1.
# Globals:   JUNITXML_PREFIX, JUNITXML_SUFFIX (read)
# Arguments: $1 - report name placed between prefix and suffix
# Outputs:   "--output-junit <prefix><name><suffix>" on stdout, or nothing when
#            both JUNITXML_PREFIX and JUNITXML_SUFFIX are empty/unset.
# Callers expand the output unquoted so it splits into option + path.
get_ctest_junitxml() {
    # Guard clause: JUnit output not requested.
    [ -z "$JUNITXML_PREFIX$JUNITXML_SUFFIX" ] && return 0
    echo "--output-junit ${JUNITXML_PREFIX}$1${JUNITXML_SUFFIX}"
}

check_test_filter() {
test -z "$TEST_FILTER" && return 0
for _tf in $TEST_FILTER; do
Expand Down Expand Up @@ -266,7 +283,12 @@ pytest_run() {
check_test_filter $_test_name_tag || return
_start_ts=`date +%s`
echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`"
python3 -m pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
# A per-test timeout is applied to every item. Callers may still append their
# own --timeout/--timeout-method (e.g. distributed tests); since argparse
# takes the last value, a caller-supplied override wins over these defaults.
python3 -m pytest -v -rfEs \
--timeout=$PYTEST_TIMEOUT --timeout-method=$PYTEST_TIMEOUT_METHOD \
`get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
test $? -eq 0 || test_run_error "[$_test_variant_tag] $1"
echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`"
}
6 changes: 4 additions & 2 deletions ci/core.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,16 @@ fi
check_test_filter "nongemm"
if [ $? -eq 0 ]; then
echo ===== Run non GEMM tests =====
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "GEMMTestSuite"
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
--timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.nongemm) -E "GEMMTestSuite"
test $? -eq 0 || test_run_error "non-GEMM"
fi

check_test_filter "gemm"
if [ $? -eq 0 ]; then
echo ===== Run GEMM tests =====
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "GEMMTestSuite"
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
--timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.gemm) -R "GEMMTestSuite"
test $? -eq 0 || test_run_error "GEMM"
fi

Expand Down
201 changes: 201 additions & 0 deletions ci/junit_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.

"""Summarize pytest/ctest JUnit XML results into a GitHub Actions report.

Reads every ``*.xml`` file in the given directory -- each produced by a single
pytest file invocation (see ``get_pytest_junitxml`` in ``_utils.sh``) or a
ctest run -- aggregates pass/fail/error/skip/timeout counts, writes a Markdown
digest to ``$GITHUB_STEP_SUMMARY`` (or stdout when run locally), and emits
``::error::`` workflow annotations for the failing tests.

Design notes:
* Standard library only, so it runs on any runner without provisioning.
* Purely informational -- it always exits 0 and never gates the job. The
pass/fail gate stays with the existing ``FAIL_*`` markers / suite exit
codes. This keeps the change strictly additive.
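* A run that is cut off mid-way (hang/crash/job-timeout) still produces a
digest for every test file that finished. Files whose XML is truncated or
otherwise unparseable are surfaced explicitly as "incomplete" rather than
silently dropped. A file that never wrote any XML cannot be detected
individually, because only the ``*.xml`` files present in the directory are
read; that case is reported only when no XML file exists at all.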
"""

import glob
import os
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict

# UI shows at most ~10 annotations of each level; cap to keep the log readable.
ANNOTATION_CAP = 20


def iter_testsuites(root):
    """Iterate over the <testsuite> elements reachable from ``root``.

    A root that is itself a ``<testsuite>`` is yielded alone (its nested
    suites, if any, are not visited). For any other root, e.g.
    ``<testsuites>``, every descendant ``<testsuite>`` is yielded in
    document order.
    """
    if root.tag != "testsuite":
        for suite in root.iter("testsuite"):
            yield suite
        return
    yield root


def classify(testcase):
    """Return ``(status, message)`` for a <testcase> element.

    ``status`` is one of ``passed``, ``failed``, ``error`` or ``skipped``.
    The child elements are checked in the order ``<failure>``, ``<error>``,
    ``<skipped>``; the first one present decides the status. ``message`` is
    that child's ``message`` attribute, or its text when the attribute is
    missing or empty, with surrounding whitespace stripped (it may span
    several lines). A testcase with none of those children is ``passed``
    with an empty message.
    """
    outcome_by_tag = {"failure": "failed", "error": "error", "skipped": "skipped"}
    for child_tag in ("failure", "error", "skipped"):
        node = testcase.find(child_tag)
        if node is None:
            continue
        text = node.get("message") or node.text or ""
        return outcome_by_tag[child_tag], text.strip()
    return "passed", ""


def is_timeout(message):
    """Return True when ``message`` contains "timeout" or "timed out".

    The match is a case-insensitive substring search.
    """
    lowered = message.lower()
    return any(marker in lowered for marker in ("timeout", "timed out"))


def emit(lines):
    """Write the report to the step summary if available, else to stdout.

    ``lines`` are joined with newlines and terminated by a single trailing
    newline. When ``$GITHUB_STEP_SUMMARY`` is set and non-empty the text is
    appended to that file; otherwise it is written to stdout.

    If the summary file cannot be opened or written (``OSError``), a note is
    printed to stderr and the text is written to stdout instead of raising,
    so that a reporting problem never turns into a failing step.
    """
    text = "\n".join(lines) + "\n"
    summary = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary:
        try:
            with open(summary, "a", encoding="utf-8") as fh:
                fh.write(text)
            return
        except OSError as exc:
            # Best-effort: fall through to stdout so the digest is not lost.
            print(f"could not write step summary {summary!r}: {exc}", file=sys.stderr)
    sys.stdout.write(text)


def _escape_data(value):
    """Escape the message part of a workflow command (``%``, CR, LF)."""
    return value.replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A")


def _escape_property(value):
    """Escape a workflow-command property value (additionally ``:`` and ``,``)."""
    return _escape_data(value).replace(":", "%3A").replace(",", "%2C")


def main():
    """Aggregate JUnit XML results from a directory into a Markdown report.

    Command line: ``junit_report.py <results-dir> [--title TITLE]``.

    Every ``*.xml`` file directly inside ``<results-dir>`` is parsed. Per-file
    and total pass/fail/error/skip counts plus the list of failing tests are
    handed to ``emit``; afterwards one ``::error`` workflow command per
    failure (at most ``ANNOTATION_CAP``) is printed to stdout. A file that
    cannot be read or parsed is counted as "incomplete" instead of aborting
    the report.

    Returns:
        0 on every path, including a missing ``<results-dir>`` argument.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("usage: junit_report.py <results-dir> [--title TITLE]", file=sys.stderr)
        return 0
    results_dir = argv[1]
    title = "Test Results"
    if "--title" in argv:
        title_pos = argv.index("--title") + 1
        # A trailing "--title" without a value keeps the default title
        # instead of raising IndexError.
        if title_pos < len(argv):
            title = argv[title_pos]

    xml_files = sorted(glob.glob(os.path.join(results_dir, "*.xml")))

    lines = []
    lines.append(f"## {title}\n")

    if not xml_files:
        lines.append(
            "> :warning: **No JUnit XML files were produced.** No test file "
            "completed far enough to write results -- the run likely crashed or "
            "hung before any suite finished. Inspect the uploaded `*.log` "
            "artifacts to see where it stopped.\n"
        )
        emit(lines)
        return 0

    totals = defaultdict(float)  # passed/failed/error/skipped/timeout/incomplete/time
    per_file = []                # (name, counts, time)
    failures = []                # (file, testid, label, message)

    for xf in xml_files:
        name = os.path.basename(xf)[: -len(".xml")]
        counts = defaultdict(int)
        suite_time = 0.0

        try:
            root = ET.parse(xf).getroot()
        except (ET.ParseError, OSError) as exc:
            # Truncated/unreadable XML => the pytest process was killed while
            # writing it (hard timeout, segfault, or job cancellation).
            counts["incomplete"] += 1
            totals["incomplete"] += 1
            per_file.append((name, counts, 0.0))
            failures.append((name, "(whole file)", "incomplete",
                             f"unparseable/truncated XML: {exc}"))
            continue

        for ts in iter_testsuites(root):
            try:
                suite_time += float(ts.get("time") or 0.0)
            except ValueError:
                # A non-numeric time attribute just contributes no time.
                pass
            for tc in ts.findall("testcase"):
                status, msg = classify(tc)
                counts[status] += 1
                totals[status] += 1
                if status in ("failed", "error"):
                    label = status
                    if is_timeout(msg):
                        label = "timeout"
                        totals["timeout"] += 1
                    cls = tc.get("classname", "")
                    tcname = tc.get("name", "")
                    testid = f"{cls}::{tcname}" if cls else tcname
                    failures.append((name, testid, label,
                                     msg.splitlines()[0] if msg else ""))

        totals["time"] += suite_time
        per_file.append((name, counts, suite_time))

    n_pass = int(totals["passed"])
    n_fail = int(totals["failed"])
    n_err = int(totals["error"])
    n_skip = int(totals["skipped"])
    n_to = int(totals["timeout"])
    n_incomplete = int(totals["incomplete"])
    total_tests = n_pass + n_fail + n_err + n_skip

    ok = (n_fail + n_err + n_incomplete) == 0
    headline = ":white_check_mark:" if ok else ":x:"
    summary_line = (
        f"{headline} **{total_tests} tests** -- {n_pass} passed, {n_fail} failed, "
        f"{n_err} errored, {n_skip} skipped"
    )
    if n_to:
        summary_line += f" ({n_to} timed out)"
    if n_incomplete:
        summary_line += f"; **{n_incomplete} file(s) incomplete**"
    summary_line += f" -- {totals['time']:.0f}s across {len(xml_files)} files\n"
    lines.append(summary_line)

    # Per-file breakdown (collapsed to keep the summary scannable).
    lines.append("<details><summary>Per-file breakdown</summary>\n")
    lines.append("| Test file (backend.label) | Pass | Fail | Error | Skip | Time (s) |")
    lines.append("|---|---:|---:|---:|---:|---:|")
    for name, counts, t in per_file:
        bad = counts["failed"] + counts["error"] + counts["incomplete"]
        mark = "" if bad == 0 else " :warning:"
        lines.append(
            f"| {name}{mark} | {counts['passed']} | {counts['failed']} | "
            f"{counts['error']} | {counts['skipped']} | {t:.0f} |"
        )
    lines.append("\n</details>\n")

    # Failure / error / timeout detail (always expanded -- this is the payload).
    if failures:
        lines.append("### Failures / errors / timeouts\n")
        for fname, testid, label, msg in failures:
            entry = f"- **[{label}]** `{testid}` _(in {fname})_"
            if msg:
                entry += f" -- {msg}"
            lines.append(entry)
        lines.append("")

    emit(lines)

    # Inline workflow annotations.
    for i, (_fname, testid, label, msg) in enumerate(failures):
        if i >= ANNOTATION_CAP:
            print(
                f"::warning::{len(failures) - ANNOTATION_CAP} more failures "
                "omitted from annotations; see the job summary for the full list."
            )
            break
        # Test ids contain "::" and may contain "," (parametrized ids); both
        # are delimiters in the workflow-command syntax, so the title property
        # and the message must be escaped or the annotation is cut/mangled.
        annotation_title = _escape_property(f"{label}: {testid}")
        annotation_body = _escape_data(msg or label)
        print(f"::error title={annotation_title}::{annotation_body}")

    return 0


# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
2 changes: 1 addition & 1 deletion ci/pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ TEST_DIR=${TE_PATH}tests/pytorch
#: ${TEST_WORKERS:=4}

install_prerequisites() {
pip install 'numpy>=1.22.4' pandas safetensors
pip install 'numpy>=1.22.4' pandas safetensors pytest-timeout
rc=$?
if [ $rc -ne 0 ]; then
script_error "Failed to install test prerequisites"
Expand Down
Loading