From 33d8fe2aaf2f75d9ea1e21f9faf83abbe0c4f286 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 2 Jun 2026 20:54:53 +0000
Subject: [PATCH] Updated test logging and timeouts

---
 .github/workflows/rocm-ci.yml |  23 ++++
 ci/_utils.sh                  |  24 +++-
 ci/core.sh                    |   6 +-
 ci/junit_report.py            | 201 ++++++++++++++++++++++++++++++++++
 ci/pytorch.sh                 |   2 +-
 5 files changed, 252 insertions(+), 4 deletions(-)
 create mode 100644 ci/junit_report.py

diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
index e2fb09c15..80ccc080e 100644
--- a/.github/workflows/rocm-ci.yml
+++ b/.github/workflows/rocm-ci.yml
@@ -183,10 +183,13 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           rm -f FAIL_*
+          rm -rf test-results && mkdir -p test-results
 
           docker exec \
             -e TEST_SGPU=1 \
             -e TEST_LEVEL=${{ env.TEST_LEVEL }} \
+            -e JUNITXML_PREFIX=/workspace/test-results/ \
+            -e JUNITXML_SUFFIX=.xml \
             -e HF_TOKEN="$HF_TOKEN" \
             te-runner bash -c "$(cat <<'EOF'
           #!/usr/bin/bash
@@ -270,6 +273,13 @@ jobs:
           EOF
           )"
 
+      - name: Generate test report
+        if: always()
+        run: |
+          command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
+          python3 ci/junit_report.py test-results \
+            --title "sGPU Tests (${{ matrix.arch_label }})"
+
       - name: Check suite failure status
         if: always()
         run: |
@@ -299,6 +309,7 @@ jobs:
           name: logs-sgpu-${{ matrix.arch_label }}
           path: |
             *.log
+            test-results/*.xml
           if-no-files-found: ignore
           retention-days: 5
 
@@ -387,6 +398,8 @@ jobs:
             *)       echo "::error::Unknown framework: ${{ matrix.framework }}"; exit 1 ;;
           esac
 
+          rm -rf test-results && mkdir -p test-results
+
           docker exec \
             -e TEST_MGPU=1 \
             -e TEST_LEVEL=${{ env.TEST_LEVEL }} \
@@ -394,6 +407,8 @@ jobs:
             -e LOG_FILE=$LOG_FILE \
             -e SUITE_NAME=$SUITE_NAME \
             -e NVTE_FRAMEWORK=${{ matrix.framework }} \
+            -e JUNITXML_PREFIX=/workspace/test-results/ \
+            -e JUNITXML_SUFFIX=.xml \
             -e HF_TOKEN="$HF_TOKEN" \
             te-runner bash -c "$(cat <<'EOF'
           #!/usr/bin/bash
@@ -414,6 +429,13 @@ jobs:
           EOF
           )"
 
+      - name: Generate test report
+        if: always()
+        run: |
+          command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
+          python3 ci/junit_report.py test-results \
+            --title "mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }})"
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@v4
@@ -421,6 +443,7 @@ jobs:
           name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }}
           path: |
             *.log
+            test-results/*.xml
           if-no-files-found: ignore
           retention-days: 5
 
diff --git a/ci/_utils.sh b/ci/_utils.sh
index 9c0f4a847..87659477c 100644
--- a/ci/_utils.sh
+++ b/ci/_utils.sh
@@ -23,6 +23,17 @@ TEST_START_TS=`date +%s`
 #To disable some logs trimming
 export CI=1
 
+# Crash/hang visibility and bounding:
+# - PYTHONFAULTHANDLER dumps a Python traceback on fatal signals (segfaults).
+# - PYTEST_TIMEOUT bounds every individual test item so a single hang cannot
+#   stall the whole CI job for hours. With the 'thread' method pytest-timeout
+#   dumps all stacks and kills that pytest run, so it writes no JUnit XML.
+# The timeout values are overridable from the environment.
+export PYTHONFAULTHANDLER=1
+: ${PYTEST_TIMEOUT:=1200}          # per-test (per-parametrization) timeout, seconds
+: ${PYTEST_TIMEOUT_METHOD:=thread} # 'thread' reliably unsticks GPU/collective hangs
+: ${CTEST_TIMEOUT:=1200}           # per-cpp-test timeout, seconds
+
 _script_error_count=0
 _run_error_count=0
 _ignored_error_count=0
@@ -213,6 +224,12 @@ get_pytest_junitxml() {
     fi
 }
 
+get_ctest_junitxml() {
+    # Print ctest's JUnit option for tag $1; nothing if prefix and suffix are empty.
+    test -z "$JUNITXML_PREFIX$JUNITXML_SUFFIX" && return 0
+    echo "--output-junit ${JUNITXML_PREFIX}$1${JUNITXML_SUFFIX}"
+}
+
 check_test_filter() {
     test -z "$TEST_FILTER" && return 0
     for _tf in $TEST_FILTER; do
@@ -266,7 +283,12 @@ pytest_run() {
     check_test_filter $_test_name_tag || return
     _start_ts=`date +%s`
     echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`"
-    python3 -m pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
+    # A per-test timeout is applied to every item. Callers may still append their
+    # own --timeout/--timeout-method (e.g. distributed tests); since argparse
+    # takes the last value, a caller-supplied override wins over these defaults.
+    python3 -m pytest -v -rfEs \
+        --timeout=$PYTEST_TIMEOUT --timeout-method=$PYTEST_TIMEOUT_METHOD \
+        `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
     test $? -eq 0 || test_run_error "[$_test_variant_tag] $1"
     echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`"
 }
diff --git a/ci/core.sh b/ci/core.sh
index e940b12ff..7fbc40d95 100755
--- a/ci/core.sh
+++ b/ci/core.sh
@@ -31,14 +31,16 @@ fi
 check_test_filter "nongemm"
 if [ $? -eq 0 ]; then
     echo ===== Run non GEMM tests =====
-    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "GEMMTestSuite"
+    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
+        --timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.nongemm) -E "GEMMTestSuite"
     test $? -eq 0 || test_run_error "non-GEMM"
 fi
 
 check_test_filter "gemm"
 if [ $? -eq 0 ]; then
     echo  ===== Run GEMM tests =====
-    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "GEMMTestSuite"
+    ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
+        --timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.gemm) -R "GEMMTestSuite"
     test $? -eq 0 || test_run_error "GEMM"
 fi
 
diff --git a/ci/junit_report.py b/ci/junit_report.py
new file mode 100644
index 000000000..550586e71
--- /dev/null
+++ b/ci/junit_report.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Summarize pytest/ctest JUnit XML results into a GitHub Actions report.
+
+Reads every ``*.xml`` file in the given directory -- each produced by a single
+pytest file invocation (see ``get_pytest_junitxml`` in ``_utils.sh``) or a
+ctest run -- aggregates pass/fail/error/skip/timeout counts, writes a Markdown
+digest to ``$GITHUB_STEP_SUMMARY`` (or stdout when run locally), and emits
+``::error::`` workflow annotations for the failing tests.
+
+Design notes:
+  * Standard library only, so it runs on any runner without provisioning.
+  * Purely informational -- it always exits 0 and never gates the job. The
+    pass/fail gate stays with the existing ``FAIL_*`` markers / suite exit
+    codes. This keeps the change strictly additive.
+  * A run that is cut off mid-way (hang/crash/job-timeout) still produces a
+    digest for every test file that finished. Files whose XML is truncated or
+    unreadable are surfaced explicitly as "incomplete"; a file that never wrote
+    its XML cannot be detected here, except when no XML exists at all.
+"""
+
+import glob
+import os
+import sys
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+# UI shows at most ~10 annotations of each level; cap to keep the log readable.
+ANNOTATION_CAP = 20
+
+
+def iter_testsuites(root):
+    """Yield *root* if it is a <testsuite>, else every <testsuite> below it."""
+    if root.tag != "testsuite":
+        yield from root.iter("testsuite")
+        return
+    yield root
+
+
+def classify(testcase):
+    """Return (status, message) for a <testcase>.
+
+    status is passed/failed/error/skipped; message is the stripped ``message``
+    attribute (else text) of the failure, error or skipped child, in that order."""
+    outcomes = {"failure": "failed", "error": "error", "skipped": "skipped"}
+    for tag, status in outcomes.items():
+        node = testcase.find(tag)
+        if node is not None:
+            return status, (node.get("message") or node.text or "").strip()
+    return "passed", ""
+
+
+def is_timeout(message):
+    """Return True if *message* mentions a timeout (case-insensitive)."""
+    return any(key in message.lower() for key in ("timeout", "timed out"))
+
+
+def emit(lines):
+    """Write *lines* to ``$GITHUB_STEP_SUMMARY`` (appending), else to stdout."""
+    text = "\n".join(lines) + "\n"
+    target = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not target:
+        sys.stdout.write(text)
+        return
+    with open(target, "a", encoding="utf-8") as fh:
+        fh.write(text)
+
+
+def main():
+    """Write the Markdown digest and ``::error::`` annotations; always return 0.
+
+    ``sys.argv[1]`` is the results directory; ``--title TITLE`` sets the heading.
+    """
+    if len(sys.argv) < 2:
+        print("usage: junit_report.py <results-dir> [--title TITLE]", file=sys.stderr)
+        return 0
+    results_dir = sys.argv[1]
+    title = "Test Results"
+    # Require a value after --title: a trailing bare flag would otherwise raise
+    # IndexError and exit non-zero, breaking the "always exits 0" contract.
+    if "--title" in sys.argv[:-1]:
+        title = sys.argv[sys.argv.index("--title") + 1]
+
+    xml_files = sorted(glob.glob(os.path.join(results_dir, "*.xml")))
+    lines = [f"## {title}\n"]
+
+    if not xml_files:
+        lines.append(
+            "> :warning: **No JUnit XML files were produced.** No test file "
+            "completed far enough to write results -- the run likely crashed or "
+            "hung before any suite finished. Inspect the uploaded `*.log` "
+            "artifacts to see where it stopped.\n"
+        )
+        emit(lines)
+        return 0
+
+    totals = defaultdict(float)  # passed/failed/error/skipped/timeout/incomplete/time
+    per_file = []                # (name, counts, time)
+    failures = []                # (file, testid, label, message)
+
+    for xf in xml_files:
+        name = os.path.basename(xf)[: -len(".xml")]
+        counts = defaultdict(int)
+        suite_time = 0.0
+
+        try:
+            root = ET.parse(xf).getroot()
+        except (ET.ParseError, OSError) as exc:
+            # Truncated/unreadable XML => the pytest process was killed while
+            # writing it (hard timeout, segfault, or job cancellation).
+            counts["incomplete"] += 1
+            totals["incomplete"] += 1
+            per_file.append((name, counts, 0.0))
+            failures.append((name, "(whole file)", "incomplete",
+                             f"unparseable/truncated XML: {exc}"))
+            continue
+
+        for ts in iter_testsuites(root):
+            try:
+                suite_time += float(ts.get("time") or 0.0)
+            except ValueError:
+                pass
+            for tc in ts.findall("testcase"):
+                status, msg = classify(tc)
+                counts[status] += 1
+                totals[status] += 1
+                if status in ("failed", "error"):
+                    label = status
+                    if is_timeout(msg):
+                        label = "timeout"
+                        totals["timeout"] += 1
+                    cls = tc.get("classname", "")
+                    tcname = tc.get("name", "")
+                    testid = f"{cls}::{tcname}" if cls else tcname
+                    failures.append((name, testid, label,
+                                     msg.splitlines()[0] if msg else ""))
+
+        totals["time"] += suite_time
+        per_file.append((name, counts, suite_time))
+
+    keys = ("passed", "failed", "error", "skipped", "timeout", "incomplete")
+    n_pass, n_fail, n_err, n_skip, n_to, n_incomplete = (int(totals[k]) for k in keys)
+    total_tests = n_pass + n_fail + n_err + n_skip
+
+    headline = ":white_check_mark:" if n_fail + n_err + n_incomplete == 0 else ":x:"
+    summary_line = (
+        f"{headline} **{total_tests} tests** -- {n_pass} passed, {n_fail} failed, "
+        f"{n_err} errored, {n_skip} skipped"
+    )
+    if n_to:
+        summary_line += f" ({n_to} timed out)"
+    if n_incomplete:
+        summary_line += f"; **{n_incomplete} file(s) incomplete**"
+    summary_line += f" -- {totals['time']:.0f}s across {len(xml_files)} files\n"
+    lines.append(summary_line)
+
+    # Per-file breakdown (collapsed to keep the summary scannable).
+    lines.append("<details><summary>Per-file breakdown</summary>\n")
+    lines.append("| Test file (backend.label) | Pass | Fail | Error | Skip | Time (s) |")
+    lines.append("|---|---:|---:|---:|---:|---:|")
+    for name, counts, t in per_file:
+        bad = counts["failed"] + counts["error"] + counts["incomplete"]
+        mark = "" if bad == 0 else " :warning:"
+        lines.append(f"| {name}{mark} | {counts['passed']} | {counts['failed']} | "
+                     f"{counts['error']} | {counts['skipped']} | {t:.0f} |")
+    lines.append("\n</details>\n")
+
+    # Failure / error / timeout detail (always expanded -- this is the payload).
+    if failures:
+        lines.append("### Failures / errors / timeouts\n")
+        for fname, testid, label, msg in failures:
+            entry = f"- **[{label}]** `{testid}` _(in {fname})_"
+            if msg:
+                entry += f" -- {msg}"
+            lines.append(entry)
+        lines.append("")
+
+    emit(lines)
+
+    # Inline workflow annotations. The runner ends the property list at the first
+    # "::" and splits it on ",", so percent-encode the title (and '%', CR, LF).
+    def esc(text, extra=""):
+        for ch in "%\r\n" + extra:
+            text = text.replace(ch, f"%{ord(ch):02X}")
+        return text
+
+    for i, (fname, testid, label, msg) in enumerate(failures):
+        if i >= ANNOTATION_CAP:
+            print(f"::warning::{len(failures) - ANNOTATION_CAP} more failures "
+                  "omitted from annotations; see the job summary for the full list.")
+            break
+        head = esc(f"{label}: {testid}", ":,")
+        print(f"::error title={head}::{esc(msg or label)}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/ci/pytorch.sh b/ci/pytorch.sh
index 32fbf02f8..b63e880e7 100755
--- a/ci/pytorch.sh
+++ b/ci/pytorch.sh
@@ -12,7 +12,7 @@ TEST_DIR=${TE_PATH}tests/pytorch
 #: ${TEST_WORKERS:=4}
 
 install_prerequisites() {
-    pip install 'numpy>=1.22.4' pandas safetensors
+    pip install 'numpy>=1.22.4' pandas safetensors pytest-timeout
     rc=$?
     if [ $rc -ne 0 ]; then
         script_error "Failed to install test prerequisites"