From 33d8fe2aaf2f75d9ea1e21f9faf83abbe0c4f286 Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Tue, 2 Jun 2026 20:54:53 +0000
Subject: [PATCH] Updated test logging and timeouts

---
 .github/workflows/rocm-ci.yml |  23 ++++
 ci/_utils.sh                  |  24 +++-
 ci/core.sh                    |   6 +-
 ci/junit_report.py            | 224 ++++++++++++++++++++++++++++++++++
 ci/pytorch.sh                 |   2 +-
 5 files changed, 275 insertions(+), 4 deletions(-)
 create mode 100644 ci/junit_report.py

diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
index e2fb09c15..80ccc080e 100644
--- a/.github/workflows/rocm-ci.yml
+++ b/.github/workflows/rocm-ci.yml
@@ -183,10 +183,13 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           rm -f FAIL_*
+          rm -rf test-results && mkdir -p test-results
 
           docker exec \
             -e TEST_SGPU=1 \
             -e TEST_LEVEL=${{ env.TEST_LEVEL }} \
+            -e JUNITXML_PREFIX=/workspace/test-results/ \
+            -e JUNITXML_SUFFIX=.xml \
             -e HF_TOKEN="$HF_TOKEN" \
             te-runner bash -c "$(cat <<'EOF'
           #!/usr/bin/bash
@@ -270,6 +273,13 @@ jobs:
           EOF
           )"
 
+      - name: Generate test report
+        if: always()
+        run: |
+          command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
+          python3 ci/junit_report.py test-results \
+            --title "sGPU Tests (${{ matrix.arch_label }})"
+
       - name: Check suite failure status
         if: always()
         run: |
@@ -299,6 +309,7 @@ jobs:
           name: logs-sgpu-${{ matrix.arch_label }}
           path: |
             *.log
+            test-results/*.xml
           if-no-files-found: ignore
           retention-days: 5
 
@@ -387,6 +398,8 @@ jobs:
             *) echo "::error::Unknown framework: ${{ matrix.framework }}"; exit 1 ;;
           esac
 
+          rm -rf test-results && mkdir -p test-results
+
           docker exec \
             -e TEST_MGPU=1 \
             -e TEST_LEVEL=${{ env.TEST_LEVEL }} \
@@ -394,6 +407,8 @@ jobs:
             -e LOG_FILE=$LOG_FILE \
             -e SUITE_NAME=$SUITE_NAME \
             -e NVTE_FRAMEWORK=${{ matrix.framework }} \
+            -e JUNITXML_PREFIX=/workspace/test-results/ \
+            -e JUNITXML_SUFFIX=.xml \
             -e HF_TOKEN="$HF_TOKEN" \
             te-runner bash -c "$(cat <<'EOF'
           #!/usr/bin/bash
@@ -414,6 +429,13 @@ jobs:
           EOF
           )"
 
+      - name: Generate test report
+        if: always()
+        run: |
+          command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
+          python3 ci/junit_report.py test-results \
+            --title "mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }})"
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@v4
@@ -421,6 +443,7 @@ jobs:
           name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }}
           path: |
             *.log
+            test-results/*.xml
           if-no-files-found: ignore
           retention-days: 5
 
diff --git a/ci/_utils.sh b/ci/_utils.sh
index 9c0f4a847..87659477c 100644
--- a/ci/_utils.sh
+++ b/ci/_utils.sh
@@ -23,6 +23,17 @@ TEST_START_TS=`date +%s`
 #To disable some logs trimming
 export CI=1
 
+# Crash/hang visibility and bounding:
+# - PYTHONFAULTHANDLER dumps a Python traceback on fatal signals (segfaults).
+# - PYTEST_TIMEOUT bounds every individual test item so a single hang cannot
+#   stall the whole CI job. NOTE: with the 'thread' method pytest-timeout dumps
+#   all stacks and kills that pytest process, so its JUnit XML is not written.
+# All are overridable from the environment.
+export PYTHONFAULTHANDLER=1
+: ${PYTEST_TIMEOUT:=1200}           # per-test (per-parametrization) timeout, seconds
+: ${PYTEST_TIMEOUT_METHOD:=thread}  # 'thread' reliably unsticks GPU/collective hangs
+: ${CTEST_TIMEOUT:=1200}            # per-cpp-test timeout, seconds
+
 _script_error_count=0
 _run_error_count=0
 _ignored_error_count=0
@@ -213,6 +224,12 @@ get_pytest_junitxml() {
     fi
 }
 
+get_ctest_junitxml() {
+    if [ -n "$JUNITXML_PREFIX$JUNITXML_SUFFIX" ]; then
+        echo "--output-junit ${JUNITXML_PREFIX}$1${JUNITXML_SUFFIX}"
+    fi
+}
+
 check_test_filter() {
     test -z "$TEST_FILTER" && return 0
     for _tf in $TEST_FILTER; do
@@ -266,7 +283,12 @@ pytest_run() {
     check_test_filter $_test_name_tag || return
     _start_ts=`date +%s`
     echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`"
-    python3 -m pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
+    # A per-test timeout is applied to every item. Callers may still append their
+    # own --timeout/--timeout-method (e.g. distributed tests); since argparse
+    # takes the last value, a caller-supplied override wins over these defaults.
+    python3 -m pytest -v -rfEs \
+        --timeout=$PYTEST_TIMEOUT --timeout-method=$PYTEST_TIMEOUT_METHOD \
+        `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
     test $? -eq 0 || test_run_error "[$_test_variant_tag] $1"
     echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`"
 }
diff --git a/ci/core.sh b/ci/core.sh
index e940b12ff..7fbc40d95 100755
--- a/ci/core.sh
+++ b/ci/core.sh
@@ -31,14 +31,16 @@ fi
 check_test_filter "nongemm"
 if [ $? -eq 0 ]; then
     echo ===== Run non GEMM tests =====
-    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "GEMMTestSuite"
+    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
+        --timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.nongemm) -E "GEMMTestSuite"
     test $? -eq 0 || test_run_error "non-GEMM"
 fi
 
 check_test_filter "gemm"
 if [ $? -eq 0 ]; then
     echo ===== Run GEMM tests =====
-    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "GEMMTestSuite"
+    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
+        --timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.gemm) -R "GEMMTestSuite"
     test $? -eq 0 || test_run_error "GEMM"
 fi
 
diff --git a/ci/junit_report.py b/ci/junit_report.py
new file mode 100644
index 0000000..550586e71
--- /dev/null
+++ b/ci/junit_report.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Summarize pytest/ctest JUnit XML results into a GitHub Actions report.
+
+Reads every ``*.xml`` file in the given directory -- each produced by a single
+pytest file invocation (see ``get_pytest_junitxml`` in ``_utils.sh``) or a
+ctest run -- aggregates pass/fail/error/skip/timeout counts, writes a Markdown
+digest to ``$GITHUB_STEP_SUMMARY`` (or stdout when run locally), and emits
+``::error::`` workflow annotations for the failing tests.
+
+Design notes:
+  * Standard library only, so it runs on any runner without provisioning.
+  * Purely informational -- it always exits 0 and never gates the job. The
+    pass/fail gate stays with the existing ``FAIL_*`` markers / suite exit
+    codes. This keeps the change strictly additive.
+  * A run that is cut off mid-way (hang/crash/job-timeout) still produces a
+    digest for every test file that finished. Files whose XML is truncated or
+    unreadable are surfaced explicitly as "incomplete" rather than silently
+    dropped. A file that never wrote any XML cannot be detected from here; it
+    is only reported when the directory contains no XML at all.
+"""
+
+import glob
+import os
+import sys
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+# UI shows at most ~10 annotations of each level; cap to keep the log readable.
+ANNOTATION_CAP = 20
+
+
+def iter_testsuites(root):
+    """Yield every <testsuite>; handles both <testsuites> and bare <testsuite> roots."""
+    if root.tag == "testsuite":
+        yield root
+    else:
+        yield from root.iter("testsuite")
+
+
+def classify(testcase):
+    """Return (status, message) for a <testcase>.
+
+    status is one of: passed, failed, error, skipped.
+    """
+    for tag, status in (("failure", "failed"), ("error", "error"), ("skipped", "skipped")):
+        el = testcase.find(tag)
+        if el is not None:
+            msg = (el.get("message") or el.text or "").strip()
+            return status, msg
+    return "passed", ""
+
+
+def is_timeout(message):
+    """Return True when a failure message looks like a timeout."""
+    m = message.lower()
+    return "timeout" in m or "timed out" in m
+
+
+def escape_data(value):
+    """Escape the message part of a ``::command::message`` workflow command."""
+    return value.replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A")
+
+
+def escape_property(value):
+    """Escape a workflow-command property value (e.g. ``title=...``).
+
+    Test ids contain ``::`` and parametrized names contain ``,``; unescaped,
+    they would terminate the property list early and corrupt the annotation.
+    """
+    return escape_data(value).replace(":", "%3A").replace(",", "%2C")
+
+
+def emit(lines):
+    """Append the report to the step summary if available, else stdout."""
+    text = "\n".join(lines) + "\n"
+    summary = os.environ.get("GITHUB_STEP_SUMMARY")
+    if summary:
+        with open(summary, "a", encoding="utf-8") as fh:
+            fh.write(text)
+    else:
+        sys.stdout.write(text)
+
+
+def main():
+    """Build the digest for ``<results_dir>`` and always return exit code 0."""
+    args = sys.argv[1:]
+    title = "Test Results"
+    if "--title" in args:
+        idx = args.index("--title")
+        # A dangling --title must not raise: this script never fails the job.
+        if idx + 1 < len(args):
+            title = args[idx + 1]
+        del args[idx:idx + 2]
+    if not args:
+        print("usage: junit_report.py <results_dir> [--title TITLE]", file=sys.stderr)
+        return 0
+    results_dir = args[0]
+
+    xml_files = sorted(glob.glob(os.path.join(results_dir, "*.xml")))
+
+    lines = []
+    lines.append(f"## {title}\n")
+
+    if not xml_files:
+        lines.append(
+            "> :warning: **No JUnit XML files were produced.** No test file "
+            "completed far enough to write results -- the run likely crashed or "
+            "hung before any suite finished. Inspect the uploaded `*.log` "
+            "artifacts to see where it stopped.\n"
+        )
+        emit(lines)
+        return 0
+
+    totals = defaultdict(float)  # passed/failed/error/skipped/timeout/incomplete/time
+    per_file = []  # (name, counts, time)
+    failures = []  # (file, testid, label, message)
+
+    for xf in xml_files:
+        name = os.path.basename(xf)[: -len(".xml")]
+        counts = defaultdict(int)
+        suite_time = 0.0
+
+        try:
+            root = ET.parse(xf).getroot()
+        except (ET.ParseError, OSError) as exc:
+            # Truncated/unreadable XML => the pytest process was killed while
+            # writing it (hard timeout, segfault, or job cancellation).
+            counts["incomplete"] += 1
+            totals["incomplete"] += 1
+            per_file.append((name, counts, 0.0))
+            failures.append((name, "(whole file)", "incomplete",
+                             f"unparseable/truncated XML: {exc}"))
+            continue
+
+        for ts in iter_testsuites(root):
+            try:
+                suite_time += float(ts.get("time") or 0.0)
+            except ValueError:
+                pass
+            for tc in ts.findall("testcase"):
+                status, msg = classify(tc)
+                counts[status] += 1
+                totals[status] += 1
+                if status in ("failed", "error"):
+                    label = status
+                    if is_timeout(msg):
+                        label = "timeout"
+                        totals["timeout"] += 1
+                    cls = tc.get("classname", "")
+                    tcname = tc.get("name", "")
+                    testid = f"{cls}::{tcname}" if cls else tcname
+                    failures.append((name, testid, label,
+                                     msg.splitlines()[0] if msg else ""))
+
+        totals["time"] += suite_time
+        per_file.append((name, counts, suite_time))
+
+    n_pass = int(totals["passed"])
+    n_fail = int(totals["failed"])
+    n_err = int(totals["error"])
+    n_skip = int(totals["skipped"])
+    n_to = int(totals["timeout"])
+    n_incomplete = int(totals["incomplete"])
+    total_tests = n_pass + n_fail + n_err + n_skip
+
+    ok = (n_fail + n_err + n_incomplete) == 0
+    headline = ":white_check_mark:" if ok else ":x:"
+    summary_line = (
+        f"{headline} **{total_tests} tests** -- {n_pass} passed, {n_fail} failed, "
+        f"{n_err} errored, {n_skip} skipped"
+    )
+    if n_to:
+        summary_line += f" ({n_to} timed out)"
+    if n_incomplete:
+        summary_line += f"; **{n_incomplete} file(s) incomplete**"
+    summary_line += f" -- {totals['time']:.0f}s across {len(xml_files)} files\n"
+    lines.append(summary_line)
+
+    # Per-file breakdown (collapsed to keep the summary scannable).
+    lines.append("<details><summary>Per-file breakdown</summary>\n")
+    lines.append("| Test file (backend.label) | Pass | Fail | Error | Skip | Time (s) |")
+    lines.append("|---|---:|---:|---:|---:|---:|")
+    for name, counts, t in per_file:
+        bad = counts["failed"] + counts["error"] + counts["incomplete"]
+        mark = "" if bad == 0 else " :warning:"
+        lines.append(
+            f"| {name}{mark} | {counts['passed']} | {counts['failed']} | "
+            f"{counts['error']} | {counts['skipped']} | {t:.0f} |"
+        )
+    lines.append("\n</details>\n")
+
+    # Failure / error / timeout detail (always expanded -- this is the payload).
+    if failures:
+        lines.append("### Failures / errors / timeouts\n")
+        for fname, testid, label, msg in failures:
+            entry = f"- **[{label}]** `{testid}` _(in {fname})_"
+            if msg:
+                entry += f" -- {msg}"
+            lines.append(entry)
+        lines.append("")
+
+    emit(lines)
+
+    # Inline workflow annotations.
+    for i, (fname, testid, label, msg) in enumerate(failures):
+        if i >= ANNOTATION_CAP:
+            print(
+                f"::warning::{len(failures) - ANNOTATION_CAP} more failures "
+                "omitted from annotations; see the job summary for the full list."
+            )
+            break
+        # Property and message parts need different escaping (see helpers).
+        annotation_title = escape_property(f"{label}: {testid}")
+        print(f"::error title={annotation_title}::{escape_data(msg or label)}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/ci/pytorch.sh b/ci/pytorch.sh
index 32fbf02f8..b63e880e7 100755
--- a/ci/pytorch.sh
+++ b/ci/pytorch.sh
@@ -12,7 +12,7 @@ TEST_DIR=${TE_PATH}tests/pytorch
 #: ${TEST_WORKERS:=4}
 
 install_prerequisites() {
-    pip install 'numpy>=1.22.4' pandas safetensors
+    pip install 'numpy>=1.22.4' pandas safetensors pytest-timeout
     rc=$?
     if [ $rc -ne 0 ]; then
         script_error "Failed to install test prerequisites"