diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index cc09f234..16047857 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -17,6 +17,10 @@ name: _runner-siracusa-tiled
       pytest-marker:
         required: true
         type: string
+      pytest-extra-args:
+        required: false
+        type: string
+        default: ""
 
 jobs:
   test-runner-siracusa-tiled:
@@ -36,5 +40,24 @@ jobs:
       - name: Run Test
         run: |
           cd DeeployTest
-          pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}"
+          # Memory snapshots help diagnose 137/OOM kills; rc keeps the post-run one alive under bash -e.
+          echo "=== free -m before pytest ==="; free -m || true
+          rc=0; pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }} || rc=$?
+          echo "=== free -m after pytest ==="; free -m || true; exit "$rc"
         shell: bash
+      - name: Build footprint summary
+        if: always()
+        env:
+          FOOTPRINT_SUMMARY_LABEL: ${{ inputs.pytest-marker }}
+        run: |
+          cd DeeployTest
+          python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true
+        shell: bash
+      - name: Upload sim out.txt
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: sim-out-${{ inputs.pytest-marker }}
+          path: DeeployTest/out.txt
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index b65cbb75..f443d2b1 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -29,20 +29,36 @@ jobs:
     with:
       docker_image_deeploy: ${{ inputs.docker_image_deeploy }}
 
-  # Training tests - L2 singlebuffer
-  siracusa-training-tiled-l2-singlebuffer:
+  # NOTE: L2 singlebuffer still commented out — only need fresh L3
+  # singlebuffer numbers for the big-CCT tiled cycle (other 3 already
+  # measured).  Restore the L2 entry below before merging.
+  #
+  # # Training tests - L2 singlebuffer
+  # siracusa-training-tiled-l2-singlebuffer:
+  #   needs: select-env
+  #   uses: ./.github/workflows/_runner-siracusa-tiled.yml
+  #   with:
+  #     runner: ${{ needs.select-env.outputs.runner }}
+  #     docker-image: ${{ needs.select-env.outputs.image }}
+  #     pytest-marker: "training and l2 and singlebuffer"
+
+  # Training tests - L3 singlebuffer (re-run to pick up big-CCT tiled).
+  siracusa-training-tiled-l3-singlebuffer:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l2 and singlebuffer"
+      pytest-marker: "training and l3 and singlebuffer"
 
-  # Training tests - L3 singlebuffer (models that spill weights/activations to L3)
-  siracusa-training-tiled-l3-singlebuffer:
+  # Training tests - L3 untiled baseline.  Codegen post-process rewrites
+  # every L1-annotated buffer to FC L2 so cluster cores access kernel
+  # buffers via the fabric — "untiled, L2-resident working set" cycle
+  # semantics for all 4 L3 models.
+  siracusa-training-tiled-l3-untiled:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l3 and singlebuffer"
+      pytest-marker: "training and l3 and untiled"
diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml
index 7a4f415e..c1e7db5d 100644
--- a/.github/workflows/ci-platform-siracusa.yml
+++ b/.github/workflows/ci-platform-siracusa.yml
@@ -5,13 +5,10 @@
 ---
 name: CI • Siracusa
 
+# NOTE: Push / pull_request triggers temporarily disabled on this branch
+# so only the L3-untiled job runs while we collect cycle-count data.
+# Restore the push: / pull_request: blocks before merging.
 "on":
-  push:
-    branches:
-      - "**"
-    tags:
-      - "v*.*.*"
-  pull_request:
   workflow_dispatch:
     inputs:
       docker_image_deeploy:
diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py
index f29891bf..b13a4cfe 100644
--- a/DeeployTest/conftest.py
+++ b/DeeployTest/conftest.py
@@ -75,6 +75,8 @@ def pytest_configure(config: pytest.Config) -> None:
         "markers", "train_kernel: mark test as a training-related single-op kernel test (grad / loss / optimizer)")
     config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration")
     config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration")
+    config.addinivalue_line(
+        "markers", "untiled: mark test as untiled-baseline (single-tile-per-tensor schedule, L1 buffers in FC L2)")
     config.addinivalue_line("markers", "l2: mark test as L2 default memory level")
     config.addinivalue_line("markers", "l3: mark test as L3 default memory level")
     config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory")
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 7eee2085..b814bb89 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import os
+
 import pytest
 # Import platform-specific test configurations
 from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS
@@ -40,6 +42,7 @@
 from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS
 from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS
 from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS
+from test_siracusa_tiled_config import L3_UNTILED_TRAINING_MODELS as SIRACUSA_L3_UNTILED_TRAINING_MODELS
 from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES
 from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES
 from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS
@@ -330,6 +333,9 @@ def test_siracusa_train_kernels(test_name, deeploy_test_dir, toolchain, toolchai
 @pytest.mark.training
 @pytest.mark.parametrize("test_name", SIRACUSA_TRAINING_TESTS, ids = SIRACUSA_TRAINING_TESTS)
 def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None:
+    # Reuse the tiled overrides table — same models, same tolerance / data-input
+    # quirks regardless of whether tiling is on.
+    overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -341,6 +347,8 @@ def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir
         tiling = False,
         cores = SIRACUSA_DEFAULT_CORES,
         training = True,
+        training_num_data_inputs = overrides.get("num_data_inputs"),
+        training_tolerance = overrides.get("tolerance"),
     )
     run_and_assert_test(test_name, config, skipgen, skipsim)
 
@@ -413,6 +421,97 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir,
     run_and_assert_test(test_name, config, skipgen, skipsim)
 
 
+@pytest.mark.siracusa_tiled
+@pytest.mark.training
+@pytest.mark.untiled
+@pytest.mark.l3
+@pytest.mark.parametrize(
+    "test_name",
+    list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
+    ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
+)
+def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen,
+                                            skipsim) -> None:
+    """Measure the untiled L3 training baseline on Siracusa (gvsoc).
+
+    The fixture's ``l1`` budget is set high enough that SBTiler never splits
+    an op spatially, so the generated C issues one kernel call per op and
+    moves whole tensors between L3 and L2 through the DMA wrappers.
+
+    Between code generation and the cmake build, the generated
+    TrainingNetwork.c / OptimizerNetwork.c are patched in place so the
+    buffers annotated for L1 are allocated from FC L2 instead:
+
+        pmsis_l1_malloc -> pi_l2_malloc
+        PI_L1           -> PI_L2
+
+    Cluster cores then reach those buffers over the fabric (~7x slower
+    than real L1), which is the intended meaning of "untiled,
+    L2-resident".  The build also sets DEEPLOY_L1_AS_L2 so that
+    mchan_transfer_1d becomes a memcpy.  No fake-L1 shim or linker wrap
+    is involved.
+    """
+    from pathlib import Path
+
+    from testUtils.core.execution import build_binary, configure_cmake, generate_network, run_simulation
+
+    spec = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
+    model_overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
+    # A fixture can opt out of simulation, but only when the CI environment
+    # variable is "true"; an explicit skipsim always applies.
+    sim_skipped = skipsim or (os.environ.get("CI") == "true" and spec.get("skip_sim_in_ci", False))
+    # The C rewrite below only works together with DEEPLOY_L1_AS_L2, which
+    # turns mchan_transfer_1d into a memcpy in mchan_v7.h.
+    cmake_flags = [*cmake_args, "-DDEEPLOY_L1_AS_L2=ON"]
+    # Optional training-step caps: fewer steps shrink the .data footprint
+    # for models that otherwise exhaust the FC L2 heap.
+    gen_flags = []
+    for key, flag in (("n_steps", "--n-steps"), ("n_accum", "--n-accum")):
+        if key in spec:
+            gen_flags.append(f"{flag}={spec[key]}")
+    # A fixture-level num_data_inputs takes precedence over the value from
+    # the shared per-model overrides table.
+    num_data_inputs = spec.get("num_data_inputs", model_overrides.get("num_data_inputs"))
+    config = create_test_config(
+        test_name = test_name,
+        platform = "Siracusa",
+        simulator = "gvsoc",
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = cmake_flags,
+        tiling = True,
+        cores = SIRACUSA_DEFAULT_CORES,
+        l1 = spec["l1"],
+        l2 = spec["l2"],
+        default_mem_level = "L3",
+        double_buffer = False,
+        training = True,
+        training_num_data_inputs = num_data_inputs,
+        training_tolerance = model_overrides.get("tolerance"),
+        gen_args = gen_flags,
+    )
+
+    # The stages of the regular runner are spelled out here because the
+    # generated sources must be patched between codegen and build.
+    generate_network(config, skip = skipgen)
+    # Retarget every L1 allocation / attribute in the generated sources to L2.
+    substitutions = (("pmsis_l1_malloc", "pi_l2_malloc"), ("PI_L1 ", "PI_L2 "))
+    generated = (Path(config.gen_dir) / name for name in ("TrainingNetwork.c", "OptimizerNetwork.c"))
+    for source in filter(Path.exists, generated):
+        patched = source.read_text()
+        for old, new in substitutions:
+            patched = patched.replace(old, new)
+        source.write_text(patched)
+    configure_cmake(config)
+    build_binary(config)
+    result = run_simulation(config, skip = sim_skipped)
+    assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of "
+                            f"{result.total_count}\nOutput:\n{result.stdout}")
+    if result.error_count >= 0:
+        assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")
+
+
 @pytest.mark.siracusa_tiled
 @pytest.mark.kernels
 @pytest.mark.singlebuffer
diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py
index d2f25708..5f839a89 100644
--- a/DeeployTest/test_siracusa_config.py
+++ b/DeeployTest/test_siracusa_config.py
@@ -113,6 +113,18 @@
 # Training-enabled models (use deeployTrainingRunner / testMVPTraining pipeline).
 # Each entry is the path to a `<model>_train` directory; the matching
 # `<model>_optimizer` directory must live next to it.
+#
+# Untiled-L3 baseline scope:
+#   The untiled path emits one pi_l2_malloc per buffer; the SUM of these calls
+#   must fit in the Siracusa FC-L2 heap (~1 MB usable after BSS/stack). Models
+#   below have a verified untiled L2 footprint within that ceiling:
+#     - SimpleMLP        ~0.05 MB
+#     - CCT_LoRA         ~0.4  MB
+#     - CCT              ~0.7  MB
+#   ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) need an L2-heap override that
+#   lives in a separate task — they stay tiled-only for now.
 TRAINING_TESTS = [
     "Models/Training/SimpleMLP/simplemlp_train",
+    "Models/Training/CCT/cct_train",
+    "Models/Training/CCT_LoRA/cct_lora_train",
 ]
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index bafa6635..30511320 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -165,11 +165,87 @@
 
 # Training-enabled tiled models that need L3 spill (weights/activations don't
 # fit in L2). Same shape: test path -> list of L1 sizes (bytes).
+# TEMPORARY: only big-CCT enabled — the other 3 already have tiled cycle
+# data from earlier CI runs.  Restore the entries below before merging.
 L3_SINGLEBUFFER_TRAINING_MODELS = {
-    "Models/Training/ResNet8/resnet8_train": [128000],
-    "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
+    # "Models/Training/ResNet8/resnet8_train": [128000],
+    # "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
     "Models/Training/CCT/cct_train": [128000],
-    "Models/Training/CCT_LoRA/cct_lora_train": [128000],
+    # "Models/Training/CCT_LoRA/cct_lora_train": [128000],
+}
+
+# Untiled-L3 baseline.  Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but
+# the L1 budget is inflated so the SBTiler picks single-tile-per-tensor
+# schedules (numTiles == 1 on every dim) — semantically untiled per op, but
+# still uses the tile-codegen DMA wrappers because cluster cores cannot deref
+# HyperRAM directly. The L1 working buffer ends up larger than physical
+# Siracusa L1 (256 KB), so after codegen the test rewrites the generated L1
+# allocations to FC L2 (details below).  No deeploy_fake_l1 shim or
+# -Wl,--wrap is involved, and there is no fake_l1_size key.
+#
+# Maps test_name -> dict with:
+#   l1: planner-side L1 size (forces single-tile schedules)
+#   l2: planner-side L2 size
+#   n_steps / n_accum / num_data_inputs / skip_sim_in_ci: optional overrides
+#
+# Peak L1 working set: spike with --l1=4_000_000 → read off
+# MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
+# Untiled-L3 baseline — single-tile-per-tensor schedules for every L3
+# training model so the user can read off "untiled L3 latency" alongside
+# the existing tiled-L3 cycles.
+#
+# Each fixture goes through the same SBTiler infrastructure as the L3
+# singlebuffer tests, with --l1 inflated to the smallest value that
+# yields the minimal-tile shape (one kernel call per op + integral
+# L3↔L2 DMA, no spatial split).
+#
+# After codegen, the test post-processes TrainingNetwork.c /
+# OptimizerNetwork.c to swap pmsis_l1_malloc → pi_l2_malloc and
+# PI_L1 → PI_L2, so every L1-annotated buffer physically lives in
+# FC L2.  Cluster cores access these via the fabric (~7x slower than
+# real L1) — that's the deliberate semantics of "untiled L2-resident".
+#
+# skip_sim_in_ci: True for fixtures where gvsoc has historically OOMed
+# during the long single-tile loop.  CI still verifies codegen +
+# compile + link in that case; sim is deferred to a manual local run
+# or a beefier runner.
+L3_UNTILED_TRAINING_MODELS = {
+    "Models/Training/CCT/cct_train": {
+        # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak
+        # L1 working = 524 KB > physical L1 (256 KB).  --l1=200K..400K
+        # trip a codegen assert ("Keys should be the same while generating
+        # DMA transfer for tensor 'data_in'/'data_out'"); 800K is the
+        # smallest value that gets through to a clean schedule.
+        "l1": 800_000,
+        "l2": 2_000_000,
+        # Use the default training schedule (n_steps=4 / n_accum=1 from
+        # inputs.npz) so per-step cycles are computed the same way as the
+        # tiled L3 baseline (BENCH total / 4).
+        "num_data_inputs": 1,
+        "skip_sim_in_ci": False,
+    },
+    # Other 3 fixtures (CCT_LoRA, ResNet8, MobileNetV1) temporarily
+    # disabled so this CI run isolates the big-CCT untiled measurement.
+    # Restore the entries below before merging.
+    #
+    # "Models/Training/CCT_LoRA/cct_lora_train": {
+    #     "l1": 64_000,
+    #     "l2": 2_000_000,
+    #     "skip_sim_in_ci": False,
+    # },
+    # "Models/Training/ResNet8/resnet8_train": {
+    #     "l1": 800_000,
+    #     "l2": 2_000_000,
+    #     "skip_sim_in_ci": False,
+    # },
+    # "Models/Training/MobileNetV1/mobilenetv1_train": {
+    #     "l1": 800_000,
+    #     "l2": 2_000_000,
+    #     "n_steps": 1,
+    #     "n_accum": 1,
+    #     "num_data_inputs": 1,
+    #     "skip_sim_in_ci": False,
+    # },
 }
 
 # Per-model overrides for training tests.
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index ce39fea7..0935b925 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -71,6 +71,17 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed)
 target_link_libraries(deeploypulp INTERFACE pulp-sdk)
 target_sources(deeploypulp INTERFACE $<TARGET_OBJECTS:pulp-sdk>)
 
+# Untiled-L3 baseline: when ON, mchan_transfer_1d() in mchan_v7.h is replaced
+# with a memcpy implementation so the deeploy-generated DMA calls become
+# regular memory copies between L2 buffers.  Used together with the test-side
+# sed that rewrites pmsis_l1_malloc -> pi_l2_malloc and PI_L1 -> PI_L2 in the
+# generated TrainingNetwork.c so every L1-annotated buffer physically lives
+# in FC L2.
+option(DEEPLOY_L1_AS_L2 "Replace mchan_transfer_1d with memcpy (untiled-L3 baseline)" OFF)
+if(DEEPLOY_L1_AS_L2)
+  target_compile_definitions(deeploypulp PUBLIC DEEPLOY_L1_AS_L2)
+endif()
+
 set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka")
 if (platform IN_LIST PULP_NNX_PLATFORMS)
   if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h
index 32ef836f..e8a0ea1f 100644
--- a/TargetLibraries/PULPOpen/inc/mchan_v7.h
+++ b/TargetLibraries/PULPOpen/inc/mchan_v7.h
@@ -47,6 +47,42 @@
 #define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5))
 #define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 6))
 
+/* Untiled-L3 baseline override: when DEEPLOY_L1_AS_L2 is defined the
+ * deeploy-generated code has been sed-rewritten so its "L1" pointers
+ * actually live in FC L2.  The mchan DMA hardware ignores destination
+ * pointer addresses and unconditionally routes the `loc` parameter into
+ * cluster L1 banks via the lower bits — so a real DMA call would write
+ * garbage to L1 and leave the L2 destination empty (which is exactly
+ * the bug we observed: out-of-bound L1-bank requests + computed=0.0).
+ *
+ * Replace mchan transfers with plain memcpy.  The channel API becomes a
+ * no-op: alloc returns 0, wait/free do nothing, is_busy reports idle.
+ * Only the 1D variant is provided — none of the L3 training fixtures
+ * emit 2D transfers; if a future model does, add the equivalent loop
+ * here. */
+#ifdef DEEPLOY_L1_AS_L2
+
+#include <string.h>
+
+static inline void mchan_transfer_1d(uint32_t cmd, void *loc, void *ext) {
+  /* Byte count sits in the low bits of cmd; EXT2LOC selects ext -> loc. */
+  const uint32_t len = cmd & ((1u << MCHAN_TRANSFER_LEN_SIZE) - 1);
+  const int to_loc = (cmd & MCHAN_CMD_FLAG_DIRECTION_EXT2LOC) != 0;
+  memcpy(to_loc ? loc : ext, to_loc ? ext : loc, len);
+}
+
+/* Channel bookkeeping stubs: memcpy has already completed when it returns,
+ * so there is never a transfer in flight to allocate, poll, or wait for. */
+static inline uint32_t mchan_channel_alloc(void) { return 0; }
+static inline void mchan_channel_free(uint32_t channel_id) { (void)channel_id; }
+static inline uint32_t mchan_channel_is_busy(uint32_t channel_id) {
+  (void)channel_id;
+  return 0;
+}
+static inline void mchan_channel_wait(uint32_t channel_id) { (void)channel_id; }
+
+#else
+
 static volatile uint32_t *const cmd_ptr =
     (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x0);
 static volatile uint32_t *const status_ptr =
@@ -117,4 +153,6 @@ static void mchan_channel_wait(uint32_t channel_id) {
 #endif
 }
 
+#endif /* DEEPLOY_L1_AS_L2 */
+
 #endif // __MCHAN_V7_H__
diff --git a/scripts/ci_footprint_summary.py b/scripts/ci_footprint_summary.py
new file mode 100644
index 00000000..b8330082
--- /dev/null
+++ b/scripts/ci_footprint_summary.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+# SPDX-License-Identifier: Apache-2.0
+"""
+Emit a per-test footprint + cycle summary to GITHUB_STEP_SUMMARY.
+
+Two passes:
+
+1. **Build footprint** — walks `DeeployTest/TEST_SIRACUSA/` for generated
+   `TrainingNetwork.c` files and reports per fixture: MEMORYARENA_L1/L2/L3
+   sizes (peak working sets + L3 storage) and distinct numTiles shapes.
+2. **Cycle counts** — parses `DeeployTest/out.txt` (where the test runner
+   appends every sim's stdout) for `BENCH train_cycles=… opt_cycles=…
+   weight_sram=…` lines, correlating each line to its preceding `Testing
+   <test_dir>` banner.  Skipped fixtures contribute no cycle row.
+
+Used in the siracusa-tiled CI workflow.  Safe to run with no matching
+files (just emits an empty section).
+"""
+
+import os
+import re
+import sys
+from pathlib import Path
+
+ARENA_RE = re.compile(r"MEMORYARENA_(L1|L2|L3)\s*=.*\*\s*(\d+)")
+TILES_RE = re.compile(r"numTiles\[\d+\]\s*=\s*\{[^}]+\}")
+TESTING_RE = re.compile(r"Testing\s+(\S+)\s+on\s+\S+\s+Platform")
+BENCH_RE = re.compile(r"BENCH\s+train_cycles=(\d+)\s+opt_cycles=(\d+)\s+weight_sram=(\d+)")
+
+
+def parse_one(c_path: Path) -> dict:
+    """Collect the per-level maximum matched by ARENA_RE and the number of distinct TILES_RE matches."""
+    peak = dict.fromkeys(("L1", "L2", "L3"), 0)
+    shapes = set()
+    for text in c_path.read_text(errors="replace").splitlines():
+        hit = ARENA_RE.search(text)
+        if hit is not None:
+            peak[hit.group(1)] = max(peak[hit.group(1)], int(hit.group(2)))
+        shapes.update(TILES_RE.findall(text))
+    return {"arenas": peak, "tile_shapes": len(shapes)}
+
+
+def fmt_kb(n: int) -> str:
+    """Render ``n`` divided by 1024 with one decimal and a KB suffix; zero becomes a dash."""
+    kib = n / 1024
+    return "—" if n == 0 else f"{kib:.1f} KB"
+
+
+def fmt_cycles(n: int) -> str:
+    """Abbreviate a cycle count with an M or K suffix; zero becomes a dash."""
+    if n >= 1_000_000:
+        return f"{n / 1e6:.2f}M"
+    if n >= 1_000:
+        return f"{n / 1e3:.1f}K"
+    # Below one thousand: plain digits, except zero which renders as a dash.
+    return str(n) if n else "—"
+
+
+def parse_cycles(out_txt: Path) -> dict:
+    """Map each tested directory in ``out_txt`` to its BENCH counters.
+
+    A line matching TESTING_RE opens a section named after the captured
+    path.  Every later BENCH_RE line is stored under that name, so when a
+    section (or a repeated name) has several BENCH lines the last one wins.
+    BENCH lines before the first banner are ignored, and sections without
+    one produce no entry.  A missing file yields an empty dict.
+    """
+    results: dict = {}
+    if not out_txt.is_file():
+        return results
+    fields = ("train_cycles", "opt_cycles", "weight_sram")
+    section = None
+    for raw in out_txt.read_text(errors="replace").splitlines():
+        banner = TESTING_RE.search(raw)
+        if banner:
+            section = banner.group(1)
+            continue
+        bench = BENCH_RE.search(raw)
+        if bench is None or section is None:
+            continue
+        results[section] = {key: int(val) for key, val in zip(fields, bench.groups())}
+    return results
+
+
+def main() -> int:
+    """Build the footprint and cycle tables and append them to the step summary.
+
+    The test root is ``sys.argv[1]`` (default ``DeeployTest/TEST_SIRACUSA``) and
+    ``out.txt`` is read from its parent.  Output goes to GITHUB_STEP_SUMMARY, or
+    to stdout when that variable is unset.  Always returns 0.
+    """
+    test_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("DeeployTest/TEST_SIRACUSA")
+    if not test_root.is_dir():
+        print(f"[footprint-summary] {test_root} not a directory; skipping", file=sys.stderr)
+        return 0
+
+    rows = [(str(c_path.relative_to(test_root).parent), parse_one(c_path))
+            for c_path in sorted(test_root.rglob("TrainingNetwork.c"))]
+    cycles = parse_cycles(test_root.parent / "out.txt")
+
+    # Pick the pytest marker label (passed by the workflow) for the section title.
+    label = os.environ.get("FOOTPRINT_SUMMARY_LABEL", "training")
+
+    out_lines = [
+        f"### Build footprint — `{label}`",
+        "",
+        "| Fixture | L1 working | L2 working | L3 storage | Distinct tile shapes |",
+        "|---|--:|--:|--:|--:|",
+    ]
+    for path, info in rows:
+        a = info["arenas"]
+        out_lines.append(
+            f"| `{path}` | {fmt_kb(a['L1'])} | {fmt_kb(a['L2'])} | {fmt_kb(a['L3'])} | {info['tile_shapes']} |")
+    if not rows:
+        out_lines.append("| _(no TrainingNetwork.c found)_ | | | | |")
+    out_lines.append("")
+
+    # Cycle table — only renders if at least one fixture actually simulated.
+    cycle_rows = []
+    for path, _info in rows:
+        # The `Testing` banner uses the absolute test_dir path; match by basename.
+        match_key = next((k for k in cycles if k.endswith(path) or path.endswith(Path(k).name)), None)
+        if match_key:
+            cycle_rows.append((path, cycles[match_key]))
+    out_lines += [f"### Cycle counts (gvsoc) — `{label}`", "",
+                  "| Fixture | train_cycles | opt_cycles | weight_sram |", "|---|--:|--:|--:|"]
+    if cycle_rows:
+        for path, c in cycle_rows:
+            out_lines.append(
+                f"| `{path}` | {fmt_cycles(c['train_cycles'])} | "
+                f"{fmt_cycles(c['opt_cycles'])} | {fmt_kb(c['weight_sram'])} |")
+    else:
+        out_lines.append("| _(no BENCH lines in out.txt — sim was --skipsim'd or crashed)_ | | | |")
+    out_lines.append("")
+
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if summary_path:
+        # The tables contain non-ASCII dashes, so pin UTF-8 instead of relying on the locale.
+        with open(summary_path, "a", encoding="utf-8") as f:
+            f.write("\n".join(out_lines) + "\n")
+        print(f"[footprint-summary] wrote {len(rows)} rows to {summary_path}", file=sys.stderr)
+    else:
+        # Local invocation: print to stdout for visibility.
+        print("\n".join(out_lines))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())