From 525d08a6e32a5cb9f3e6a22222e146b638a96fd5 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 17:12:11 +0000
Subject: [PATCH 01/18] feat(training): add CCT + CCT_LoRA to non-tiled
 siracusa training tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Untiled-L3 baseline, Stage 1 of 3.

CCT and CCT_LoRA emit ~0.7 MB and ~0.4 MB of pi_l2_malloc respectively,
both well within the Siracusa FC-L2 heap, so the non-tiled training path
runs them as-is — no codegen / runtime changes needed. Local codegen +
compile + link verified on the feat/untiling worktree.

Reuses SIRACUSA_TRAINING_MODEL_OVERRIDES from the tiled config so CCT
gets its existing tolerance bump (5e-3) and num_data_inputs=1 quirks in
the untiled run too.

ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) exceed the FC-L2 heap and
need an L2-heap override (Stage 2/3) — they remain tiled-only.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DeeployTest/test_platforms.py       |  5 +++++
 DeeployTest/test_siracusa_config.py | 12 ++++++++++++
 2 files changed, 17 insertions(+)
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 7eee2085..f3577747 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -330,6 +330,9 @@ def test_siracusa_train_kernels(test_name, deeploy_test_dir, toolchain, toolchai
 @pytest.mark.training
 @pytest.mark.parametrize("test_name", SIRACUSA_TRAINING_TESTS, ids = SIRACUSA_TRAINING_TESTS)
 def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None:
+    # Reuse the tiled overrides table — same models, same tolerance / data-input
+    # quirks regardless of whether tiling is on.
+    overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -341,6 +344,8 @@ def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir
         tiling = False,
         cores = SIRACUSA_DEFAULT_CORES,
         training = True,
+        training_num_data_inputs = overrides.get("num_data_inputs"),
+        training_tolerance = overrides.get("tolerance"),
     )
     run_and_assert_test(test_name, config, skipgen, skipsim)
 
diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py
index d2f25708..5f839a89 100644
--- a/DeeployTest/test_siracusa_config.py
+++ b/DeeployTest/test_siracusa_config.py
@@ -113,6 +113,18 @@
 # Training-enabled models (use deeployTrainingRunner / testMVPTraining pipeline).
 # Each entry is the path to a `<model>_train` directory; the matching
 # `<model>_optimizer` directory must live next to it.
+#
+# Untiled-L3 baseline scope:
+#   The untiled path emits one pi_l2_malloc per buffer; the SUM of these calls
+#   must fit in the Siracusa FC-L2 heap (~1 MB usable after BSS/stack). Models
+#   below have a verified untiled L2 footprint within that ceiling:
+#     - SimpleMLP        ~0.05 MB
+#     - CCT_LoRA         ~0.4  MB
+#     - CCT              ~0.7  MB
+#   ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) need an L2-heap override that
+#   lives in a separate task — they stay tiled-only for now.
 TRAINING_TESTS = [
     "Models/Training/SimpleMLP/simplemlp_train",
+    "Models/Training/CCT/cct_train",
+    "Models/Training/CCT_LoRA/cct_lora_train",
 ]

From 11d7aa9bc908dc2aab8c76ec77129396ffbfc85d Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 18:20:12 +0000
Subject: [PATCH 02/18] feat(training): add L3 untiled baseline via fake-L1
 shim + ResNet8 fixture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Untiled-L3 baseline, Stage 2 of 3.

# Approach

PULP cluster cores cannot dereference HyperRAM addresses, so a literal
"untiled, all-in-L3" run is physically impossible — the kernel would
fault. The closest legitimate baseline is single-tile-per-tensor: every
op runs on its full tensor in one kernel invocation, but the L3↔L2 DMA
wrappers stay because they're the only way data reaches the cluster.

The existing SBTiler already produces that schedule when --l1 is large
enough that no constraint forces a split. Local spike on ResNet8 with
--l1=4_000_000 confirmed numTiles == 1 on every tile dim and produced:
  MEMORYARENA_L1 = pmsis_l1_malloc(739328)
  MEMORYARENA_L2 = pi_l2_malloc(294916)
  MEMORYARENA_L3 = cl_ram_malloc(1588440)

# Blocker addressed by the shim

739 KB > physical Siracusa L1 (256 KB), so pmsis_l1_malloc would return
NULL at runtime. deeploy_fake_l1.c provides __wrap_pi_cl_l1_malloc
(activated by -DDEEPLOY_L1_AS_L2 + linker --wrap) that allocates from a
static PI_L2 arena sized via DEEPLOY_FAKE_L1_SIZE. Generated code is
unchanged — codegen still emits pmsis_l1_malloc, the wrap intercepts.
Linker symbol audit confirms __wrap_pi_cl_l1_malloc replaces SDK's
strong symbol cleanly.

Trade-off (documented in the .c file): kernels see L2 latency instead
of L1, so cycles under this mode are NOT silicon-representative — the
mode is a *correctness* baseline, not a perf one.

# Scope

ResNet8 ships first (fastest L3 model to validate). MobileNetV1 and
CCT/CCT_LoRA are pending; each needs its own fake_l1_size spike before
adding to L3_UNTILED_TRAINING_MODELS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/ci-platform-siracusa-tiled.yml  | 10 +++
 DeeployTest/conftest.py                       |  2 +
 DeeployTest/test_platforms.py                 | 46 ++++++++++++
 DeeployTest/test_siracusa_tiled_config.py     | 27 +++++++
 TargetLibraries/PULPOpen/CMakeLists.txt       | 17 +++++
 .../PULPOpen/src/deeploy_fake_l1.c            | 71 +++++++++++++++++++
 6 files changed, 173 insertions(+)
 create mode 100644 TargetLibraries/PULPOpen/src/deeploy_fake_l1.c

diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index b65cbb75..00ee3ded 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -46,3 +46,13 @@ jobs:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
       pytest-marker: "training and l3 and singlebuffer"
+
+  # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1
+  # shim to back the oversized L1 working buffer with FC L2)
+  siracusa-training-tiled-l3-untiled:
+    needs: select-env
+    uses: ./.github/workflows/_runner-siracusa-tiled.yml
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-marker: "training and l3 and untiled"
diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py
index f29891bf..b13a4cfe 100644
--- a/DeeployTest/conftest.py
+++ b/DeeployTest/conftest.py
@@ -75,6 +75,8 @@ def pytest_configure(config: pytest.Config) -> None:
         "markers", "train_kernel: mark test as a training-related single-op kernel test (grad / loss / optimizer)")
     config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration")
     config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration")
+    config.addinivalue_line(
+        "markers", "untiled: mark test as untiled-baseline (single-tile-per-tensor schedule via fake-L1 shim)")
     config.addinivalue_line("markers", "l2: mark test as L2 default memory level")
     config.addinivalue_line("markers", "l3: mark test as L3 default memory level")
     config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory")
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index f3577747..4492ae2b 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -40,6 +40,7 @@
 from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS
 from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS
 from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS
+from test_siracusa_tiled_config import L3_UNTILED_TRAINING_MODELS as SIRACUSA_L3_UNTILED_TRAINING_MODELS
 from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES
 from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES
 from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS
@@ -418,6 +419,51 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir,
     run_and_assert_test(test_name, config, skipgen, skipsim)
 
 
+@pytest.mark.siracusa_tiled
+@pytest.mark.training
+@pytest.mark.untiled
+@pytest.mark.l3
+@pytest.mark.parametrize(
+    "test_name",
+    list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
+    ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
+)
+def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args,
+                                            skipgen, skipsim) -> None:
+    """Untiled-L3 baseline.
+
+    Reuses the tiled codegen pipeline but inflates --l1 large enough that the
+    SBTiler picks single-tile-per-tensor schedules.  The deeploy_fake_l1 shim
+    (DEEPLOY_L1_AS_L2) redirects pi_cl_l1_malloc into an FC-L2 arena so the
+    oversized "L1" working buffer (>physical 256 KB) actually fits.
+    """
+    fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
+    overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
+    extra_cmake = list(cmake_args) + [
+        f"-DDEEPLOY_L1_AS_L2=ON",
+        f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}",
+    ]
+    config = create_test_config(
+        test_name = test_name,
+        platform = "Siracusa",
+        simulator = "gvsoc",
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = extra_cmake,
+        tiling = True,
+        cores = SIRACUSA_DEFAULT_CORES,
+        l1 = fixture["l1"],
+        l2 = fixture["l2"],
+        default_mem_level = "L3",
+        double_buffer = False,
+        training = True,
+        training_num_data_inputs = overrides.get("num_data_inputs"),
+        training_tolerance = overrides.get("tolerance"),
+    )
+    run_and_assert_test(test_name, config, skipgen, skipsim)
+
+
 @pytest.mark.siracusa_tiled
 @pytest.mark.kernels
 @pytest.mark.singlebuffer
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index bafa6635..a5409540 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -172,6 +172,33 @@
     "Models/Training/CCT_LoRA/cct_lora_train": [128000],
 }
 
+# Untiled-L3 baseline.  Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but
+# the L1 budget is inflated so the SBTiler picks single-tile-per-tensor
+# schedules (numTiles == 1 on every dim) — semantically untiled per op, but
+# still uses the tile-codegen DMA wrappers because cluster cores cannot deref
+# HyperRAM directly. The L1 working buffer ends up larger than physical
+# Siracusa L1 (256 KB), so the deeploy_fake_l1 shim redirects pi_cl_l1_malloc
+# into an FC-L2 arena via -Wl,--wrap; size cap = DEEPLOY_FAKE_L1_SIZE (set
+# per-fixture below to fit the model's peak L1 working set with headroom).
+#
+# Maps test_name -> dict with:
+#   l1: planner-side L1 size (forces single-tile schedules)
+#   l2: planner-side L2 size
+#   fake_l1_size: physical bytes for the FC-L2 arena backing pi_cl_l1_malloc
+#
+# fake_l1_size baselining method: spike with --l1=4_000_000 → read off
+# MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
+L3_UNTILED_TRAINING_MODELS = {
+    "Models/Training/ResNet8/resnet8_train": {
+        "l1": 4_000_000,
+        "l2": 2_000_000,
+        "fake_l1_size": 1_048_576,  # spike measured 739 KB; 1 MB headroom
+    },
+    # MobileNetV1 / CCT / CCT_LoRA fixtures pending: ResNet8 ships first as
+    # the fastest-to-validate L3 baseline.  Each new entry needs an explicit
+    # spike to size fake_l1_size before adding here.
+}
+
 # Per-model overrides for training tests.
 #
 # - num_data_inputs: required when inputs.npz has only one mini-batch (no
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index ce39fea7..3ae97d91 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -71,6 +71,23 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed)
 target_link_libraries(deeploypulp INTERFACE pulp-sdk)
 target_sources(deeploypulp INTERFACE $<TARGET_OBJECTS:pulp-sdk>)
 
+# Untiled-L3 baseline: redirect pi_cl_l1_malloc/free to a static L2 arena via
+# linker --wrap so cluster-L1 alloc requests up to DEEPLOY_FAKE_L1_SIZE bytes
+# (must be ≤ remaining FC L2 ~1.94 MB) succeed even though physical L1 is only
+# 256 KB. Source file deeploy_fake_l1.c is no-op when DEEPLOY_L1_AS_L2 is OFF.
+option(DEEPLOY_L1_AS_L2 "Redirect pi_cl_l1_malloc to a static FC-L2 arena (untiled-L3 baseline)" OFF)
+set(DEEPLOY_FAKE_L1_SIZE "1048576" CACHE STRING "Size in bytes of the fake-L1 arena placed in FC L2")
+if(DEEPLOY_L1_AS_L2)
+  target_compile_definitions(deeploypulp PRIVATE
+    DEEPLOY_L1_AS_L2
+    DEEPLOY_FAKE_L1_SIZE=${DEEPLOY_FAKE_L1_SIZE}
+  )
+  target_link_options(deeploypulp INTERFACE
+    "-Wl,--wrap=pi_cl_l1_malloc"
+    "-Wl,--wrap=pi_cl_l1_free"
+  )
+endif()
+
 set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka")
 if (platform IN_LIST PULP_NNX_PLATFORMS)
   if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
new file mode 100644
index 00000000..6c6964f9
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena.
+ *
+ * Activated by `-DDEEPLOY_L1_AS_L2 -DDEEPLOY_FAKE_L1_SIZE=N` plus the linker
+ * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`. The wrap pair
+ * intercepts the SDK's strong symbols so the deeploy-generated code keeps
+ * calling pmsis_l1_malloc / pi_cl_l1_malloc as if it were targeting cluster
+ * L1 — physically the bytes live in the FC L2 region instead, which on
+ * Siracusa has ~1.94 MB of headroom (vs. ~256 KB for real cluster L1).
+ *
+ * Trade-off: kernels that were tuned for L1's 1-cycle access see L2 latency
+ * (~7 cycles), so cycle counts under this mode are NOT representative of
+ * silicon. The mode exists to provide a single-tile-per-tensor untiled-L3
+ * baseline for *correctness* comparison against the tiled L3 path; cycle
+ * realism for the same workload still requires the tiled run.
+ */
+
+#include <stdint.h>
+
+#ifdef DEEPLOY_L1_AS_L2
+
+#ifndef DEEPLOY_FAKE_L1_SIZE
+#error "DEEPLOY_L1_AS_L2 requires -DDEEPLOY_FAKE_L1_SIZE=<bytes>"
+#endif
+
+#include "pmsis.h"
+
+/* Static arena in FC L2 — sized at compile time. PI_L2 maps to the SDK's L2
+ * shared region (0x1C010000-0x1C200000 on Siracusa, ~1.94 MB). Aligned to 8
+ * to satisfy any kernel that reinterprets float32_t / pulpv2 v2f16 buffers. */
+__attribute__((aligned(8)))
+PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE];
+
+/* Bump pointer + free-list. The deeploy-generated harness allocates the L1
+ * arena once at InitTrainingNetwork() time and frees it once at teardown,
+ * so we don't need a real heap — a bump allocator with a single rewind on
+ * full-arena free is sufficient and cheap.
+ *
+ * If the harness pattern ever changes (e.g. fine-grained per-op alloc/free),
+ * swap this for an extern_alloc_t pool the way dory_mem.c does for L3. */
+static uint32_t deeploy_fake_l1_offset = 0;
+
+void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) {
+    (void)device;
+    /* 8-byte alignment for every allocation so consecutive callers stay
+     * aligned even when `size` is not a multiple of 8. */
+    uint32_t aligned = (size + 7u) & ~7u;
+    if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) {
+        return (void *)0;
+    }
+    void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset];
+    deeploy_fake_l1_offset += aligned;
+    return p;
+}
+
+void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) {
+    (void)device;
+    (void)chunk;
+    (void)size;
+    /* Bump-allocator semantics: per-block free is a no-op. The harness
+     * frees the whole arena at teardown; we rewind there. */
+    if (deeploy_fake_l1_offset >= (uint32_t)size) {
+        deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u;
+    }
+}
+
+#endif /* DEEPLOY_L1_AS_L2 */

From 8f003bcdb514218fb39801d37b8855612effaad2 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 18:37:39 +0000
Subject: [PATCH 03/18] style: pre-commit fixes (yapf line wrap + clang-format
 2-space indent)

CI Lint surfaced two formatting nits from #21:
- test_platforms.py: yapf wants `skipgen,` on the first line of the new
  test_siracusa_tiled_training_l3_untiled signature
- deeploy_fake_l1.c: clang-format style is 2-space, not 4-space

No semantic change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DeeployTest/test_platforms.py                 |  4 +--
 .../PULPOpen/src/deeploy_fake_l1.c            | 36 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 4492ae2b..adc56743 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -428,8 +428,8 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir,
     list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
     ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
 )
-def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args,
-                                            skipgen, skipsim) -> None:
+def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen,
+                                            skipsim) -> None:
     """Untiled-L3 baseline.
 
     Reuses the tiled codegen pipeline but inflates --l1 large enough that the
diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
index 6c6964f9..db1c8b67 100644
--- a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
+++ b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
@@ -45,27 +45,27 @@ PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE];
 static uint32_t deeploy_fake_l1_offset = 0;
 
 void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) {
-    (void)device;
-    /* 8-byte alignment for every allocation so consecutive callers stay
-     * aligned even when `size` is not a multiple of 8. */
-    uint32_t aligned = (size + 7u) & ~7u;
-    if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) {
-        return (void *)0;
-    }
-    void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset];
-    deeploy_fake_l1_offset += aligned;
-    return p;
+  (void)device;
+  /* 8-byte alignment for every allocation so consecutive callers stay
+   * aligned even when `size` is not a multiple of 8. */
+  uint32_t aligned = (size + 7u) & ~7u;
+  if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) {
+    return (void *)0;
+  }
+  void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset];
+  deeploy_fake_l1_offset += aligned;
+  return p;
 }
 
 void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) {
-    (void)device;
-    (void)chunk;
-    (void)size;
-    /* Bump-allocator semantics: per-block free is a no-op. The harness
-     * frees the whole arena at teardown; we rewind there. */
-    if (deeploy_fake_l1_offset >= (uint32_t)size) {
-        deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u;
-    }
+  (void)device;
+  (void)chunk;
+  (void)size;
+  /* Bump-allocator semantics: per-block free is a no-op. The harness
+   * frees the whole arena at teardown; we rewind there. */
+  if (deeploy_fake_l1_offset >= (uint32_t)size) {
+    deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u;
+  }
 }
 
 #endif /* DEEPLOY_L1_AS_L2 */

From e9445f1db18aef10388231f7cac2b985170aca12 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 18:50:43 +0000
Subject: [PATCH 04/18] fix(ci): drop ResNet8 untiled --l1 to 800 KB to fit
 ubuntu-latest 7 GB RAM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI ran out of memory (exit 137) on the new test_siracusa_tiled_training_l3_untiled job. The MiniMalloc constraint solver's RAM appetite scales with the L1 size — 4 MB blew past ubuntu-latest's 7 GB ceiling.

Spike confirmed --l1=800 KB produces the *same* tile shapes as --l1=4 MB (numTiles arrays are byte-identical): everything single-tile except node_31_fc_Gemm_GradReduceSum_3_ReduceSum_backward, which has an intrinsic 10-tile reduction independent of L1 budget. The peak L1 working set is 739 KB regardless, so 800 KB is the smallest --l1 that still gives the minimal-tile schedule.

fake_l1_size unchanged at 1 MB.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DeeployTest/test_siracusa_tiled_config.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index a5409540..9bde2520 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -190,7 +190,12 @@
 # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
 L3_UNTILED_TRAINING_MODELS = {
     "Models/Training/ResNet8/resnet8_train": {
-        "l1": 4_000_000,
+        # 800 KB is the smallest --l1 that still yields the minimal-tile
+        # schedule (peak L1 working set = 739 KB).  Anything between 800 KB
+        # and 4 MB produces identical numTiles arrays — we use the smallest
+        # value because the SBTiler's constraint solver (MiniMalloc) burns
+        # ubuntu-latest's 7 GB RAM at --l1=4 MB.
+        "l1": 800_000,
         "l2": 2_000_000,
         "fake_l1_size": 1_048_576,  # spike measured 739 KB; 1 MB headroom
     },

From 5ce11e59eee707e036b482847886b8cd0617feb7 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 19:10:36 +0000
Subject: [PATCH 05/18] ci: --skipsim diagnostic for L3-untiled job + memory
 snapshot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two CI runs at --l1=4 MB then --l1=800 KB both got SIGKILLed (exit 137)
on ubuntu-latest after ~8 min of silent execution. To bisect compile vs
sim, run the new L3-untiled job with --skipsim — if it passes, OOM is
in gvsoc; if it still fails, OOM is in clang compilation of the
single-tile-per-tensor TrainingNetwork.c.

Adds a generic pytest-extra-args input to _runner-siracusa-tiled.yml
plus a `free -m` snapshot before pytest for postmortem visibility.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/_runner-siracusa-tiled.yml     |  9 ++++++++-
 .github/workflows/ci-platform-siracusa-tiled.yml | 10 +++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index cc09f234..e1cecb44 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -17,6 +17,10 @@ name: _runner-siracusa-tiled
       pytest-marker:
         required: true
         type: string
+      pytest-extra-args:
+        required: false
+        type: string
+        default: ""
 
 jobs:
   test-runner-siracusa-tiled:
@@ -36,5 +40,8 @@ jobs:
       - name: Run Test
         run: |
           cd DeeployTest
-          pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}"
+          # Memory snapshot helps diagnose 137/OOM kills postmortem.
+          echo "=== free -m before pytest ==="; free -m || true
+          pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }}
+          echo "=== free -m after pytest ==="; free -m || true
         shell: bash
diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index 00ee3ded..d58ea711 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -48,7 +48,14 @@ jobs:
       pytest-marker: "training and l3 and singlebuffer"
 
   # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1
-  # shim to back the oversized L1 working buffer with FC L2)
+  # shim to back the oversized L1 working buffer with FC L2).
+  #
+  # --skipsim is intentional here: previous runs got SIGKILLed (exit 137) on
+  # ubuntu-latest's 7 GB RAM after ~8 minutes of silent execution, and we need
+  # to know whether the OOM is in compile (kernel codegen + clang) or in
+  # gvsoc.  --skipsim verifies the codegen + compile path and the fake-L1
+  # shim's link integrity; sim verification is deferred until we either move
+  # to a beefier runner or shrink the per-step memory peak.
   siracusa-training-tiled-l3-untiled:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
@@ -56,3 +63,4 @@ jobs:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
       pytest-marker: "training and l3 and untiled"
+      pytest-extra-args: "--skipsim"

From f9d2e9e9ce37f78b741a74e7c91de3472aa5219a Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 19:26:59 +0000
Subject: [PATCH 06/18] feat(training): CCT/CCT_LoRA/MobileNetV1 untiled L3 +
 CI footprint summary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the remaining 3 untiled-L3 fixtures, completing the matrix:

| Fixture        | --l1 | fake_l1_size | peak L1 working |
|----------------|-----:|-------------:|----------------:|
| CCT            |  64K |          32K |             16K |
| CCT_LoRA       |  64K |          32K |             16K |
| ResNet8        | 800K |        1024K |            722K |
| MobileNetV1    | 800K |         768K |            530K |

Each --l1 was bisected to the smallest value that yields the minimal-tile
schedule.  MobileNet specifically asserts in the codegen below 800K
(`Keys should be the same while generating DMA transfer for tensor
'accum_buffer'`) so 800K is a hard floor, not a tunable.

Also adds scripts/ci_footprint_summary.py — a small build-time summary
that walks every TrainingNetwork.c under TEST_SIRACUSA and writes a per-
fixture table of MEMORYARENA_L1/L2/L3 sizes plus distinct numTiles
shapes to GITHUB_STEP_SUMMARY.  Wired into _runner-siracusa-tiled.yml
with `if: always()` so the table appears even when pytest fails.

This is a build-time stand-in for the cycle comparison the user asked
for; real cycle counts need gvsoc sim, which is currently --skipsim'd
for the L3-untiled job because of the unresolved sim-side OOM.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/_runner-siracusa-tiled.yml |  8 ++
 DeeployTest/test_siracusa_tiled_config.py    | 38 +++++++--
 scripts/ci_footprint_summary.py              | 89 ++++++++++++++++++++
 3 files changed, 126 insertions(+), 9 deletions(-)
 create mode 100644 scripts/ci_footprint_summary.py

diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index e1cecb44..15fd8041 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -45,3 +45,11 @@ jobs:
           pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }}
           echo "=== free -m after pytest ==="; free -m || true
         shell: bash
+      - name: Build footprint summary
+        if: always()
+        env:
+          FOOTPRINT_SUMMARY_LABEL: ${{ inputs.pytest-marker }}
+        run: |
+          cd DeeployTest
+          python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true
+        shell: bash
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index 9bde2520..db27f791 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -189,19 +189,39 @@
 # fake_l1_size baselining method: spike with --l1=4_000_000 → read off
 # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
 L3_UNTILED_TRAINING_MODELS = {
+    # Per-model l1 / l2 / fake_l1_size were established by spiking
+    # testMVPTraining.py with --defaultMemLevel=L3 and reading
+    # MEMORYARENA_L1 from the generated TrainingNetwork.c.
+    #
+    #   - l1: planner-side budget passed to SBTiler (forces single-tile
+    #     schedules when generous enough).  Use the smallest value that
+    #     still compiles and yields the minimal-tile shape — larger values
+    #     blow MiniMalloc's RAM appetite past CI's 16 GB ceiling.
+    #   - l2: planner-side L2 budget; 2 MB matches the existing tiled L3
+    #     baseline.
+    #   - fake_l1_size: physical bytes for the FC-L2-backed pi_cl_l1_malloc
+    #     arena (deeploy_fake_l1.c).  Must be ≥ MEMORYARENA_L1, with a
+    #     small headroom for alignment.
+    "Models/Training/CCT/cct_train": {
+        "l1": 64_000,
+        "l2": 2_000_000,
+        "fake_l1_size": 32_768,  # peak L1 working = 16388 B
+    },
+    "Models/Training/CCT_LoRA/cct_lora_train": {
+        "l1": 64_000,
+        "l2": 2_000_000,
+        "fake_l1_size": 32_768,  # peak L1 working = 16384 B
+    },
     "Models/Training/ResNet8/resnet8_train": {
-        # 800 KB is the smallest --l1 that still yields the minimal-tile
-        # schedule (peak L1 working set = 739 KB).  Anything between 800 KB
-        # and 4 MB produces identical numTiles arrays — we use the smallest
-        # value because the SBTiler's constraint solver (MiniMalloc) burns
-        # ubuntu-latest's 7 GB RAM at --l1=4 MB.
         "l1": 800_000,
         "l2": 2_000_000,
-        "fake_l1_size": 1_048_576,  # spike measured 739 KB; 1 MB headroom
+        "fake_l1_size": 1_048_576,  # peak L1 working = 739328 B
+    },
+    "Models/Training/MobileNetV1/mobilenetv1_train": {
+        "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
+        "l2": 2_000_000,
+        "fake_l1_size": 786_432,  # peak L1 working = 542720 B
     },
-    # MobileNetV1 / CCT / CCT_LoRA fixtures pending: ResNet8 ships first as
-    # the fastest-to-validate L3 baseline.  Each new entry needs an explicit
-    # spike to size fake_l1_size before adding here.
 }
 
 # Per-model overrides for training tests.
diff --git a/scripts/ci_footprint_summary.py b/scripts/ci_footprint_summary.py
new file mode 100644
index 00000000..989a92de
--- /dev/null
+++ b/scripts/ci_footprint_summary.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+# SPDX-License-Identifier: Apache-2.0
+"""
+Emit a per-test footprint summary to GITHUB_STEP_SUMMARY.
+
+Walks `DeeployTest/TEST_SIRACUSA/` for generated `TrainingNetwork.c` files
+and reports, per fixture: MEMORYARENA_L1/L2/L3 sizes (peak working sets +
+L3 storage) and the number of distinct numTiles shapes.
+
+The numbers come from grepping the generated C — they're a build-time
+proxy for "how much memory pressure does this configuration put on the
+target".  This is the closest stand-in for the cycle comparison the user
+wants until the L3-untiled sim OOM is debugged and we can collect real
+gvsoc cycle counts.
+
+Used in the siracusa-tiled CI workflow.  Safe to run with no matching
+files (just emits an empty summary).
+"""
+
+import os
+import re
+import sys
+from pathlib import Path
+
+ARENA_RE = re.compile(r"MEMORYARENA_(L1|L2|L3)\s*=.*\*\s*(\d+)")
+TILES_RE = re.compile(r"numTiles\[\d+\]\s*=\s*\{[^}]+\}")
+
+
+def parse_one(c_path: Path) -> dict:
+    arenas = {"L1": 0, "L2": 0, "L3": 0}
+    tile_shapes = set()
+    for line in c_path.read_text(errors="replace").splitlines():
+        m = ARENA_RE.search(line)
+        if m:
+            arenas[m.group(1)] = max(arenas[m.group(1)], int(m.group(2)))
+        for t in TILES_RE.findall(line):
+            tile_shapes.add(t)
+    return {"arenas": arenas, "tile_shapes": len(tile_shapes)}
+
+
+def fmt_kb(n: int) -> str:
+    if n == 0:
+        return "—"
+    return f"{n / 1024:.1f} KB"
+
+
+def main() -> int:
+    test_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("DeeployTest/TEST_SIRACUSA")
+    if not test_root.is_dir():
+        print(f"[footprint-summary] {test_root} not a directory; skipping", file=sys.stderr)
+        return 0
+
+    rows = []
+    for c_path in sorted(test_root.rglob("TrainingNetwork.c")):
+        rel = c_path.relative_to(test_root).parent
+        info = parse_one(c_path)
+        rows.append((str(rel), info))
+
+    # Pick the pytest marker label (passed by the workflow) for the section title.
+    label = os.environ.get("FOOTPRINT_SUMMARY_LABEL", "training")
+
+    out_lines = [
+        f"### Build footprint — `{label}`",
+        "",
+        "| Fixture | L1 working | L2 working | L3 storage | Distinct tile shapes |",
+        "|---|--:|--:|--:|--:|",
+    ]
+    for path, info in rows:
+        a = info["arenas"]
+        out_lines.append(
+            f"| `{path}` | {fmt_kb(a['L1'])} | {fmt_kb(a['L2'])} | {fmt_kb(a['L3'])} | {info['tile_shapes']} |")
+    if not rows:
+        out_lines.append("| _(no TrainingNetwork.c found)_ | | | | |")
+    out_lines.append("")
+
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if summary_path:
+        with open(summary_path, "a") as f:
+            f.write("\n".join(out_lines) + "\n")
+        print(f"[footprint-summary] wrote {len(rows)} rows to {summary_path}", file=sys.stderr)
+    else:
+        # Local invocation: print to stdout for visibility.
+        print("\n".join(out_lines))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 443d45fe635cb70260e1971e40a24d25e966eefc Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 19:49:56 +0000
Subject: [PATCH 07/18] feat(ci): per-model sim gate + cycle counts in
 untiled-L3 summary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User asked for the "how much slower is untiled vs tiled" data — the
existing footprint table doesn't carry it because everything was
--skipsim'd to dodge the sim-side OOM seen on ResNet8 / MobileNetV1.

Splits the L3-untiled job by model:

| Fixture     | sim in CI? | reason                                  |
|-------------|------------|-----------------------------------------|
| CCT         | yes        | 16 KB working set; gvsoc fits in 16 GB  |
| CCT_LoRA    | yes        | same                                    |
| ResNet8     | --skipsim  | OOM at ~8 min; deferred                 |
| MobileNetV1 | --skipsim  | OOM at ~8 min; deferred                 |

Mechanism: per-model `skip_sim_in_ci` flag in L3_UNTILED_TRAINING_MODELS;
test_siracusa_tiled_training_l3_untiled forces skipsim only when
`CI=true` AND the flag is set.  Local runs always do the full pipeline.
The global `--skipsim` is dropped from the CI workflow.

Cycle extractor (in scripts/ci_footprint_summary.py): parses
`DeeployTest/out.txt` for the `BENCH train_cycles=… opt_cycles=…`
lines emitted by deeploytraintest.c, correlates each to the preceding
`Testing <test_dir>` banner, and emits a second markdown table to
GITHUB_STEP_SUMMARY.  Skipped fixtures contribute no cycle row, so the
table only carries entries that actually ran.

Cycle comparison untiled vs tiled is read by eyeballing the two job
summaries side-by-side.  A unified cross-job aggregation needs an
artifact-passing pass; deferred.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/ci-platform-siracusa-tiled.yml  | 11 +--
 DeeployTest/test_platforms.py                 | 10 ++-
 DeeployTest/test_siracusa_tiled_config.py     | 10 +++
 scripts/ci_footprint_summary.py               | 81 ++++++++++++++++---
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index d58ea711..e730c583 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -50,12 +50,10 @@ jobs:
   # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1
   # shim to back the oversized L1 working buffer with FC L2).
   #
-  # --skipsim is intentional here: previous runs got SIGKILLed (exit 137) on
-  # ubuntu-latest's 7 GB RAM after ~8 minutes of silent execution, and we need
-  # to know whether the OOM is in compile (kernel codegen + clang) or in
-  # gvsoc.  --skipsim verifies the codegen + compile path and the fake-L1
-  # shim's link integrity; sim verification is deferred until we either move
-  # to a beefier runner or shrink the per-step memory peak.
+  # Per-model skip_sim_in_ci gate (in test_siracusa_tiled_config.py) decides
+  # which fixtures actually run gvsoc on CI: CCT/CCT_LoRA do (16 KB working
+  # set, gvsoc fits comfortably); ResNet8/MobileNetV1 are --skipsim'd until
+  # the sim-side OOM at ~8 min is debugged or we move to a bigger runner.
   siracusa-training-tiled-l3-untiled:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
@@ -63,4 +61,3 @@ jobs:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
       pytest-marker: "training and l3 and untiled"
-      pytest-extra-args: "--skipsim"
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index adc56743..0b4136e7 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import os
+
 import pytest
 # Import platform-specific test configurations
 from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS
@@ -436,6 +438,11 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
     SBTiler picks single-tile-per-tensor schedules.  The deeploy_fake_l1 shim
     (DEEPLOY_L1_AS_L2) redirects pi_cl_l1_malloc into an FC-L2 arena so the
     oversized "L1" working buffer (>physical 256 KB) actually fits.
+
+    Per-model skip_sim_in_ci gate: large fixtures (ResNet8 / MobileNetV1)
+    skip the gvsoc sim on CI runners because two prior runs got SIGKILLed
+    at ~8 min during simulation.  Local runs (no `CI` env var) still run
+    the full pipeline so the user can verify losses manually.
     """
     fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
     overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
@@ -443,6 +450,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         f"-DDEEPLOY_L1_AS_L2=ON",
         f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}",
     ]
+    effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False))
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -461,7 +469,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         training_num_data_inputs = overrides.get("num_data_inputs"),
         training_tolerance = overrides.get("tolerance"),
     )
-    run_and_assert_test(test_name, config, skipgen, skipsim)
+    run_and_assert_test(test_name, config, skipgen, effective_skipsim)
 
 
 @pytest.mark.siracusa_tiled
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index db27f791..d3273119 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -206,21 +206,31 @@
         "l1": 64_000,
         "l2": 2_000_000,
         "fake_l1_size": 32_768,  # peak L1 working = 16388 B
+        # Sim runs in CI: 16 KB working set is tiny enough that gvsoc
+        # doesn't OOM ubuntu-latest's 16 GB.
+        "skip_sim_in_ci": False,
     },
     "Models/Training/CCT_LoRA/cct_lora_train": {
         "l1": 64_000,
         "l2": 2_000_000,
         "fake_l1_size": 32_768,  # peak L1 working = 16384 B
+        "skip_sim_in_ci": False,
     },
     "Models/Training/ResNet8/resnet8_train": {
         "l1": 800_000,
         "l2": 2_000_000,
         "fake_l1_size": 1_048_576,  # peak L1 working = 739328 B
+        # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim.
+        # Skip until the sim-side memory leak is debugged or we move to a
+        # bigger runner.  --skipsim still verifies codegen + compile + the
+        # fake-L1 shim's link integrity.
+        "skip_sim_in_ci": True,
     },
     "Models/Training/MobileNetV1/mobilenetv1_train": {
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
         "fake_l1_size": 786_432,  # peak L1 working = 542720 B
+        "skip_sim_in_ci": True,  # same OOM concern as ResNet8
     },
 }
 
diff --git a/scripts/ci_footprint_summary.py b/scripts/ci_footprint_summary.py
index 989a92de..b8330082 100644
--- a/scripts/ci_footprint_summary.py
+++ b/scripts/ci_footprint_summary.py
@@ -2,20 +2,20 @@
 # SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
 # SPDX-License-Identifier: Apache-2.0
 """
-Emit a per-test footprint summary to GITHUB_STEP_SUMMARY.
+Emit a per-test footprint + cycle summary to GITHUB_STEP_SUMMARY.
 
-Walks `DeeployTest/TEST_SIRACUSA/` for generated `TrainingNetwork.c` files
-and reports, per fixture: MEMORYARENA_L1/L2/L3 sizes (peak working sets +
-L3 storage) and the number of distinct numTiles shapes.
+Two passes:
 
-The numbers come from grepping the generated C — they're a build-time
-proxy for "how much memory pressure does this configuration put on the
-target".  This is the closest stand-in for the cycle comparison the user
-wants until the L3-untiled sim OOM is debugged and we can collect real
-gvsoc cycle counts.
+1. **Build footprint** — walks `DeeployTest/TEST_SIRACUSA/` for generated
+   `TrainingNetwork.c` files and reports per fixture: MEMORYARENA_L1/L2/L3
+   sizes (peak working sets + L3 storage) and distinct numTiles shapes.
+2. **Cycle counts** — parses `DeeployTest/out.txt` (where the test runner
+   appends every sim's stdout) for `BENCH train_cycles=… opt_cycles=…
+   weight_sram=…` lines, correlating each line to its preceding `Testing
+   <test_dir>` banner.  Skipped fixtures contribute no cycle row.
 
 Used in the siracusa-tiled CI workflow.  Safe to run with no matching
-files (just emits an empty summary).
+files (just emits an empty section).
 """
 
 import os
@@ -25,6 +25,8 @@
 
 ARENA_RE = re.compile(r"MEMORYARENA_(L1|L2|L3)\s*=.*\*\s*(\d+)")
 TILES_RE = re.compile(r"numTiles\[\d+\]\s*=\s*\{[^}]+\}")
+TESTING_RE = re.compile(r"Testing\s+(\S+)\s+on\s+\S+\s+Platform")
+BENCH_RE = re.compile(r"BENCH\s+train_cycles=(\d+)\s+opt_cycles=(\d+)\s+weight_sram=(\d+)")
 
 
 def parse_one(c_path: Path) -> dict:
@@ -45,6 +47,42 @@ def fmt_kb(n: int) -> str:
     return f"{n / 1024:.1f} KB"
 
 
+def fmt_cycles(n: int) -> str:
+    if n == 0:
+        return "—"
+    if n >= 1_000_000:
+        return f"{n / 1e6:.2f}M"
+    if n >= 1_000:
+        return f"{n / 1e3:.1f}K"
+    return str(n)
+
+
+def parse_cycles(out_txt: Path) -> dict:
+    """Returns {test_dir: {train_cycles, opt_cycles, weight_sram}}.
+
+    Each `Testing <path>` banner in out.txt opens a section; the next
+    `BENCH …` line in that section is the cycle row for that fixture.
+    Sections without a BENCH line (skipsim, sim crash) get no entry.
+    """
+    if not out_txt.is_file():
+        return {}
+    out: dict = {}
+    current = None
+    for line in out_txt.read_text(errors="replace").splitlines():
+        m = TESTING_RE.search(line)
+        if m:
+            current = m.group(1)
+            continue
+        m = BENCH_RE.search(line)
+        if m and current is not None:
+            out[current] = {
+                "train_cycles": int(m.group(1)),
+                "opt_cycles": int(m.group(2)),
+                "weight_sram": int(m.group(3)),
+            }
+    return out
+
+
 def main() -> int:
     test_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("DeeployTest/TEST_SIRACUSA")
     if not test_root.is_dir():
@@ -57,6 +95,8 @@ def main() -> int:
         info = parse_one(c_path)
         rows.append((str(rel), info))
 
+    cycles = parse_cycles(test_root.parent / "out.txt")
+
     # Pick the pytest marker label (passed by the workflow) for the section title.
     label = os.environ.get("FOOTPRINT_SUMMARY_LABEL", "training")
 
@@ -74,6 +114,27 @@ def main() -> int:
         out_lines.append("| _(no TrainingNetwork.c found)_ | | | | |")
     out_lines.append("")
 
+    # Cycle table — only renders if at least one fixture actually simulated.
+    cycle_rows = []
+    for path, _info in rows:
+        # The `Testing` banner uses the absolute test_dir path; match by basename.
+        match_key = next((k for k in cycles if k.endswith(path) or path.endswith(Path(k).name)), None)
+        if match_key:
+            cycle_rows.append((path, cycles[match_key]))
+    out_lines.append(f"### Cycle counts (gvsoc) — `{label}`")
+    out_lines.append("")
+    out_lines.append("| Fixture | train_cycles | opt_cycles | weight_sram |")
+    out_lines.append("|---|--:|--:|--:|")
+    if cycle_rows:
+        for path, c in cycle_rows:
+            out_lines.append(
+                f"| `{path}` | {fmt_cycles(c['train_cycles'])} | "
+                f"{fmt_cycles(c['opt_cycles'])} | {fmt_kb(c['weight_sram'])} |")
+    else:
+        out_lines.append(
+            "| _(no BENCH lines in out.txt — sim was --skipsim'd or crashed)_ | | | |")
+    out_lines.append("")
+
     summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
     if summary_path:
         with open(summary_path, "a") as f:

From 2df2b0beb4854ca9d0d4243b83ed38a80f4e0d28 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 20:14:04 +0000
Subject: [PATCH 08/18] fix(untiled): drop CCT/CCT_LoRA + per-fixture
 needs_fake_l1 gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous CI run produced absurd cycle counts for CCT untiled (528 vs
10.27M tiled).  Investigation:

1. CCT untiled and CCT tiled-L3 produce **byte-for-byte identical**
   TrainingNetwork.c (diff is just MiniMalloc statement-ordering noise).
   Both arena sizes match: L1=16K, L2=16K, L3=294K.
2. CCT's peak L1 working (16 KB) fits trivially in physical Siracusa L1
   (256 KB), so the deeploy_fake_l1 wrap is unnecessary.
3. The wrap intercepts every pi_cl_l1_malloc call site, including any
   SDK-internal one — the 528-cycle anomaly is consistent with the
   cluster never actually running training kernels because the SDK
   allocation got served from our small fake arena.

Restructure:
- Drop CCT and CCT_LoRA from L3_UNTILED_TRAINING_MODELS — they're
  semantically already covered by the tiled-L3-singlebuffer entry.
  Keep the comment so future readers know why.
- Add per-fixture `needs_fake_l1` flag (defaults False).  Test only
  applies -DDEEPLOY_L1_AS_L2=ON when needs_fake_l1=True.  Future fixtures
  in this dict that don't need the wrap won't get it.
- ResNet8 and MobileNetV1 stay (their peak L1 working is 739K / 530K,
  genuinely > physical L1).  Both still skip sim in CI pending OOM
  debug.

Cycle comparison "untiled vs tiled" therefore can't be done in CI right
now — the only fixtures where the comparison is meaningful (ResNet8 /
MobileNetV1) are skipsim'd.  Documented as a known follow-up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/_runner-siracusa-tiled.yml |  8 +++
 DeeployTest/test_platforms.py                | 13 +++--
 DeeployTest/test_siracusa_tiled_config.py    | 53 +++++++++-----------
 3 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index 15fd8041..16047857 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -53,3 +53,11 @@ jobs:
           cd DeeployTest
           python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true
         shell: bash
+      - name: Upload sim out.txt
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: sim-out-${{ inputs.pytest-marker }}
+          path: DeeployTest/out.txt
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 0b4136e7..4dd3fb4a 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -446,10 +446,15 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
     """
     fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
     overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
-    extra_cmake = list(cmake_args) + [
-        f"-DDEEPLOY_L1_AS_L2=ON",
-        f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}",
-    ]
+    extra_cmake = list(cmake_args)
+    if fixture.get("needs_fake_l1", False):
+        # Only opt in when peak L1 working > physical L1 — the wrap also
+        # intercepts SDK-internal pi_cl_l1_malloc calls and starves the
+        # cluster on small models that don't need it.
+        extra_cmake += [
+            f"-DDEEPLOY_L1_AS_L2=ON",
+            f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}",
+        ]
     effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False))
     config = create_test_config(
         test_name = test_name,
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index d3273119..6811a2c0 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -188,41 +188,33 @@
 #
 # fake_l1_size baselining method: spike with --l1=4_000_000 → read off
 # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
+# Untiled-L3 baseline.  ONLY contains models whose peak L1 working set
+# exceeds physical Siracusa L1 (~256 KB) — those are the only ones where
+# "untiled-L3" produces a different schedule than the existing tiled L3
+# singlebuffer test.  Smaller models (CCT, CCT_LoRA, ~16 KB working) get
+# the same byte-for-byte TrainingNetwork.c whether you pick tiled L3 or
+# untiled L3, so adding them here would be redundant — their tiled L3
+# entry IS their untiled-L3 baseline.
+#
+# The deeploy_fake_l1 shim (DEEPLOY_L1_AS_L2) intercepts pi_cl_l1_malloc
+# and serves it from a static FC-L2 arena, which is the only way the
+# 700-KB-class L1 working buffer can fit at runtime.  The shim has the
+# side effect of redirecting *every* pi_cl_l1_malloc call (including any
+# SDK-internal one), which breaks small models that don't actually need
+# it — hence the per-fixture needs_fake_l1 gate.
 L3_UNTILED_TRAINING_MODELS = {
-    # Per-model l1 / l2 / fake_l1_size were established by spiking
-    # testMVPTraining.py with --defaultMemLevel=L3 and reading
-    # MEMORYARENA_L1 from the generated TrainingNetwork.c.
-    #
-    #   - l1: planner-side budget passed to SBTiler (forces single-tile
-    #     schedules when generous enough).  Use the smallest value that
-    #     still compiles and yields the minimal-tile shape — larger values
-    #     blow MiniMalloc's RAM appetite past CI's 16 GB ceiling.
-    #   - l2: planner-side L2 budget; 2 MB matches the existing tiled L3
-    #     baseline.
-    #   - fake_l1_size: physical bytes for the FC-L2-backed pi_cl_l1_malloc
-    #     arena (deeploy_fake_l1.c).  Must be ≥ MEMORYARENA_L1, with a
-    #     small headroom for alignment.
-    "Models/Training/CCT/cct_train": {
-        "l1": 64_000,
-        "l2": 2_000_000,
-        "fake_l1_size": 32_768,  # peak L1 working = 16388 B
-        # Sim runs in CI: 16 KB working set is tiny enough that gvsoc
-        # doesn't OOM ubuntu-latest's 16 GB.
-        "skip_sim_in_ci": False,
-    },
-    "Models/Training/CCT_LoRA/cct_lora_train": {
-        "l1": 64_000,
-        "l2": 2_000_000,
-        "fake_l1_size": 32_768,  # peak L1 working = 16384 B
-        "skip_sim_in_ci": False,
-    },
     "Models/Training/ResNet8/resnet8_train": {
+        # 800 KB is the smallest --l1 that yields the minimal-tile shape
+        # (peak L1 working = 739 KB).  Larger values inflate MiniMalloc's
+        # RAM appetite past CI's ceiling.
         "l1": 800_000,
         "l2": 2_000_000,
         "fake_l1_size": 1_048_576,  # peak L1 working = 739328 B
-        # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim.
-        # Skip until the sim-side memory leak is debugged or we move to a
-        # bigger runner.  --skipsim still verifies codegen + compile + the
+        "needs_fake_l1": True,
+        # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim
+        # — gvsoc memory grows unbounded for the long single-tile training
+        # loop.  --skipsim until that's debugged or we move to a bigger
+        # runner.  --skipsim still verifies codegen + compile + the
         # fake-L1 shim's link integrity.
         "skip_sim_in_ci": True,
     },
@@ -230,6 +222,7 @@
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
         "fake_l1_size": 786_432,  # peak L1 working = 542720 B
+        "needs_fake_l1": True,
         "skip_sim_in_ci": True,  # same OOM concern as ResNet8
     },
 }

From 5580739a41278868bbdbd8a4564493751574c110 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 20:34:08 +0000
Subject: [PATCH 09/18] fix(fake-l1): try real L1 first; restore CCT/CCT_LoRA +
 sim for all 4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related fixes so the L3-untiled CI table actually carries cycles
for every L3 model.

# Shim no longer pollutes SDK

Old shim served EVERY pi_cl_l1_malloc from the static FC-L2 arena —
SDK-internal calls included.  CCT untiled then reported 528 cycles
(cluster never ran the kernels because some SDK invariant broke).

New shim tries __real_pi_cl_l1_malloc first.  Only requests that real
L1 cannot satisfy fall through to the FC-L2 arena.  __wrap_pi_cl_l1_free
mirrors by routing arena-range pointers to the bump rewind and everything
else to __real_pi_cl_l1_free.  SDK gets real L1 transparently; only
Deeploy's oversized MEMORYARENA_L1 sees the fake arena.

# Test fixture restored

CCT and CCT_LoRA back in L3_UNTILED_TRAINING_MODELS with
needs_fake_l1=False — they fit physical L1, codegen is byte-identical to
the tiled-L3 entry, and now sim runs cleanly because the shim is no
longer destructive.  Their cycles therefore == tiled-L3 cycles by
construction (a useful sanity row in the summary).

ResNet8 / MobileNetV1: skip_sim_in_ci=False — re-enable sim with the
fixed shim.  The earlier ~8-min SIGKILLs were almost certainly the
shim looping cluster init, not a genuine gvsoc memory leak.  If sim
still OOMs on ubuntu-latest after this fix, fall back to skipsim.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DeeployTest/test_siracusa_tiled_config.py     | 59 ++++++++-----
 .../PULPOpen/src/deeploy_fake_l1.c            | 83 ++++++++++++-------
 2 files changed, 92 insertions(+), 50 deletions(-)

diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index 6811a2c0..88117511 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -188,21 +188,41 @@
 #
 # fake_l1_size baselining method: spike with --l1=4_000_000 → read off
 # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
-# Untiled-L3 baseline.  ONLY contains models whose peak L1 working set
-# exceeds physical Siracusa L1 (~256 KB) — those are the only ones where
-# "untiled-L3" produces a different schedule than the existing tiled L3
-# singlebuffer test.  Smaller models (CCT, CCT_LoRA, ~16 KB working) get
-# the same byte-for-byte TrainingNetwork.c whether you pick tiled L3 or
-# untiled L3, so adding them here would be redundant — their tiled L3
-# entry IS their untiled-L3 baseline.
+# Untiled-L3 baseline — single-tile-per-tensor schedules for every L3
+# training model so the user can read off "untiled L3 latency" alongside
+# the existing tiled-L3 cycles.  Each fixture goes through the same
+# SBTiler infrastructure as the L3 singlebuffer tests, but with --l1
+# inflated to the smallest value that still yields the minimal-tile
+# shape — at that point the generated C is one kernel call per op with
+# integral L3↔L2 DMA wrappers, no spatial split.
 #
-# The deeploy_fake_l1 shim (DEEPLOY_L1_AS_L2) intercepts pi_cl_l1_malloc
-# and serves it from a static FC-L2 arena, which is the only way the
-# 700-KB-class L1 working buffer can fit at runtime.  The shim has the
-# side effect of redirecting *every* pi_cl_l1_malloc call (including any
-# SDK-internal one), which breaks small models that don't actually need
-# it — hence the per-fixture needs_fake_l1 gate.
+# Two per-fixture knobs:
+#
+#   - needs_fake_l1: True when peak L1 working > physical Siracusa L1
+#     (256 KB).  When True, build adds -DDEEPLOY_L1_AS_L2 + the linker
+#     wrap that serves oversized pi_cl_l1_malloc calls from a static
+#     FC-L2 arena (deeploy_fake_l1.c).  Small-working-set models (CCT /
+#     CCT_LoRA, ~16 KB peak) leave it False — they fit real L1 and
+#     produce byte-identical codegen to the tiled L3 entry, so their
+#     untiled cycles == tiled cycles by construction.
+#
+#   - skip_sim_in_ci: True for models where gvsoc has historically
+#     OOMed the runner during the long single-tile loop.  CI still
+#     verifies codegen + compile + link in that case; sim is a manual
+#     local exercise.
 L3_UNTILED_TRAINING_MODELS = {
+    "Models/Training/CCT/cct_train": {
+        "l1": 64_000,
+        "l2": 2_000_000,
+        "needs_fake_l1": False,  # peak L1 working = 16388 B fits real L1
+        "skip_sim_in_ci": False,
+    },
+    "Models/Training/CCT_LoRA/cct_lora_train": {
+        "l1": 64_000,
+        "l2": 2_000_000,
+        "needs_fake_l1": False,  # peak L1 working = 16384 B
+        "skip_sim_in_ci": False,
+    },
     "Models/Training/ResNet8/resnet8_train": {
         # 800 KB is the smallest --l1 that yields the minimal-tile shape
         # (peak L1 working = 739 KB).  Larger values inflate MiniMalloc's
@@ -211,19 +231,18 @@
         "l2": 2_000_000,
         "fake_l1_size": 1_048_576,  # peak L1 working = 739328 B
         "needs_fake_l1": True,
-        # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim
-        # — gvsoc memory grows unbounded for the long single-tile training
-        # loop.  --skipsim until that's debugged or we move to a bigger
-        # runner.  --skipsim still verifies codegen + compile + the
-        # fake-L1 shim's link integrity.
-        "skip_sim_in_ci": True,
+        # Try sim again with the fixed shim (real-L1-first, fall back to
+        # FC-L2 arena only when L1 is exhausted).  Earlier runs OOMed at
+        # ~8 min — believed to be the broken shim looping cluster init,
+        # not a real gvsoc memory leak.
+        "skip_sim_in_ci": False,
     },
     "Models/Training/MobileNetV1/mobilenetv1_train": {
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
         "fake_l1_size": 786_432,  # peak L1 working = 542720 B
         "needs_fake_l1": True,
-        "skip_sim_in_ci": True,  # same OOM concern as ResNet8
+        "skip_sim_in_ci": False,
     },
 }
 
diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
index db1c8b67..e703a413 100644
--- a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
+++ b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
@@ -3,20 +3,30 @@
  *
  * SPDX-License-Identifier: Apache-2.0
  *
- * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena.
+ * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena
+ * — but ONLY for allocations that don't fit in the real L1 heap.
  *
  * Activated by `-DDEEPLOY_L1_AS_L2 -DDEEPLOY_FAKE_L1_SIZE=N` plus the linker
- * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`. The wrap pair
- * intercepts the SDK's strong symbols so the deeploy-generated code keeps
- * calling pmsis_l1_malloc / pi_cl_l1_malloc as if it were targeting cluster
- * L1 — physically the bytes live in the FC L2 region instead, which on
- * Siracusa has ~1.94 MB of headroom (vs. ~256 KB for real cluster L1).
+ * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`.
+ *
+ * Coexistence with the SDK
+ * ------------------------
+ * The wrap intercepts EVERY pi_cl_l1_malloc call site, including SDK-internal
+ * ones (cluster init, driver scratch, etc.). A first version that always
+ * served from the static arena broke the cluster on small models — likely
+ * because SDK code received a pointer outside the real L1 region and either
+ * its own bookkeeping went wrong or a downstream API rejected it.
+ *
+ * Mitigation: try the SDK's real L1 allocator first via the linker's
+ * `__real_*` symbols. If that succeeds, hand the SDK pointer back. Only when
+ * the request is too big for real L1 (the case we're here for: an oversized
+ * MEMORYARENA_L1) fall through to the static FC-L2 arena. Free mirrors the
+ * decision by checking whether the pointer falls inside our arena.
  *
  * Trade-off: kernels that were tuned for L1's 1-cycle access see L2 latency
- * (~7 cycles), so cycle counts under this mode are NOT representative of
- * silicon. The mode exists to provide a single-tile-per-tensor untiled-L3
- * baseline for *correctness* comparison against the tiled L3 path; cycle
- * realism for the same workload still requires the tiled run.
+ * (~7 cycles) for the buffers served from the fake arena. Cycles under this
+ * mode are NOT silicon-representative — the mode exists to give a per-op
+ * single-tile latency baseline against the existing tiled-L3 path.
  */
 
 #include <stdint.h>
@@ -29,42 +39,55 @@
 
 #include "pmsis.h"
 
+/* Linker-provided originals (--wrap=foo exposes __real_foo). */
+extern void *__real_pi_cl_l1_malloc(struct pi_device *device, uint32_t size);
+extern void __real_pi_cl_l1_free(struct pi_device *device, void *chunk, int size);
+
 /* Static arena in FC L2 — sized at compile time. PI_L2 maps to the SDK's L2
  * shared region (0x1C010000-0x1C200000 on Siracusa, ~1.94 MB). Aligned to 8
  * to satisfy any kernel that reinterprets float32_t / pulpv2 v2f16 buffers. */
 __attribute__((aligned(8)))
 PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE];
-
-/* Bump pointer + free-list. The deeploy-generated harness allocates the L1
- * arena once at InitTrainingNetwork() time and frees it once at teardown,
- * so we don't need a real heap — a bump allocator with a single rewind on
- * full-arena free is sufficient and cheap.
- *
- * If the harness pattern ever changes (e.g. fine-grained per-op alloc/free),
- * swap this for an extern_alloc_t pool the way dory_mem.c does for L3. */
 static uint32_t deeploy_fake_l1_offset = 0;
 
+static inline int in_fake_arena(const void *p) {
+  return (const uint8_t *)p >= deeploy_fake_l1_arena
+         && (const uint8_t *)p < deeploy_fake_l1_arena + DEEPLOY_FAKE_L1_SIZE;
+}
+
 void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) {
-  (void)device;
-  /* 8-byte alignment for every allocation so consecutive callers stay
-   * aligned even when `size` is not a multiple of 8. */
+  /* Try real L1 first — any small SDK / Deeploy alloc that fits stays in
+   * real L1, so SDK bookkeeping and L1-tuned kernels are unaffected. */
+  void *p = __real_pi_cl_l1_malloc(device, size);
+  if (p != (void *)0) {
+    return p;
+  }
+  /* Real L1 exhausted (or request bigger than L1 heap). Serve from FC-L2
+   * arena: the only legitimate caller here is Deeploy's MEMORYARENA_L1
+   * for a model whose peak L1 working set exceeds 256 KB. */
   uint32_t aligned = (size + 7u) & ~7u;
   if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) {
     return (void *)0;
   }
-  void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset];
+  void *q = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset];
   deeploy_fake_l1_offset += aligned;
-  return p;
+  return q;
 }
 
 void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) {
-  (void)device;
-  (void)chunk;
-  (void)size;
-  /* Bump-allocator semantics: per-block free is a no-op. The harness
-   * frees the whole arena at teardown; we rewind there. */
-  if (deeploy_fake_l1_offset >= (uint32_t)size) {
-    deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u;
+  if (!in_fake_arena(chunk)) {
+    __real_pi_cl_l1_free(device, chunk, size);
+    return;
+  }
+  /* Bump-allocator rewind for arena pointers. Safe under LIFO free order
+   * (the only pattern Deeploy's harness uses); a non-LIFO free is a no-op
+   * and leaks until the next full reset. */
+  uint32_t aligned = ((uint32_t)size + 7u) & ~7u;
+  if (deeploy_fake_l1_offset >= aligned) {
+    uint8_t *expected = deeploy_fake_l1_arena + deeploy_fake_l1_offset - aligned;
+    if ((uint8_t *)chunk == expected) {
+      deeploy_fake_l1_offset -= aligned;
+    }
   }
 }
 

From ef41f3a754e1affa09a38ee43b360cb7fec1de6f Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 21:13:24 +0000
Subject: [PATCH 10/18] =?UTF-8?q?feat(untiled):=20sed-based=20L1=E2=86=92L?=
 =?UTF-8?q?2=20codegen=20rewrite=20+=20isolate=20to=20one=20CI=20job?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the fake-L1 shim approach with a direct codegen post-process so
all 4 L3 untiled fixtures end up with kernels physically reading FC L2
(no SDK pollution, no wrap, no shim).

# Codegen post-process (test_siracusa_tiled_training_l3_untiled)

After generate_network() and before configure_cmake(), the test rewrites
the generated TrainingNetwork.c / OptimizerNetwork.c:

    pmsis_l1_malloc -> pi_l2_malloc
    PI_L1           -> PI_L2

Every L1-annotated buffer (including MEMORYARENA_L1) now lives in FC L2.
Cluster cores access kernel buffers via the fabric (~7x slower than real
L1) — this is the deliberate "untiled, L2-resident working set" cycle
semantic the user asked for.  All 4 L3 models give comparable cycle
counts under the same resource model.

# Removals

- deeploy_fake_l1.c (gone)
- DEEPLOY_L1_AS_L2 / DEEPLOY_FAKE_L1_SIZE / linker --wrap flags (gone)
- needs_fake_l1 / fake_l1_size fixture fields (gone)

# CI temporarily isolated

Goal of this branch is to collect untiled cycle data, so:
- ci-platform-siracusa.yml: push/pull_request triggers disabled
  (workflow_dispatch only)
- ci-platform-siracusa-tiled.yml: L2/L3-singlebuffer jobs commented out;
  only siracusa-training-tiled-l3-untiled runs

Both flagged with "restore before merging" comments.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/ci-platform-siracusa-tiled.yml  | 50 +++++-----
 .github/workflows/ci-platform-siracusa.yml    |  9 +-
 DeeployTest/test_platforms.py                 | 59 ++++++++----
 DeeployTest/test_siracusa_tiled_config.py     | 41 +++-----
 TargetLibraries/PULPOpen/CMakeLists.txt       | 17 ----
 .../PULPOpen/src/deeploy_fake_l1.c            | 94 -------------------
 6 files changed, 82 insertions(+), 188 deletions(-)
 delete mode 100644 TargetLibraries/PULPOpen/src/deeploy_fake_l1.c

diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index e730c583..5536fda5 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -29,31 +29,33 @@ jobs:
     with:
       docker_image_deeploy: ${{ inputs.docker_image_deeploy }}
 
-  # Training tests - L2 singlebuffer
-  siracusa-training-tiled-l2-singlebuffer:
-    needs: select-env
-    uses: ./.github/workflows/_runner-siracusa-tiled.yml
-    with:
-      runner: ${{ needs.select-env.outputs.runner }}
-      docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l2 and singlebuffer"
-
-  # Training tests - L3 singlebuffer (models that spill weights/activations to L3)
-  siracusa-training-tiled-l3-singlebuffer:
-    needs: select-env
-    uses: ./.github/workflows/_runner-siracusa-tiled.yml
-    with:
-      runner: ${{ needs.select-env.outputs.runner }}
-      docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l3 and singlebuffer"
-
-  # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1
-  # shim to back the oversized L1 working buffer with FC L2).
+  # NOTE: All other Siracusa-tiled jobs are temporarily disabled on this
+  # branch so the L3-untiled job runs in isolation while we collect
+  # cycle-count data. Restore the L2 / L3 singlebuffer entries below
+  # before merging.
+  #
+  # # Training tests - L2 singlebuffer
+  # siracusa-training-tiled-l2-singlebuffer:
+  #   needs: select-env
+  #   uses: ./.github/workflows/_runner-siracusa-tiled.yml
+  #   with:
+  #     runner: ${{ needs.select-env.outputs.runner }}
+  #     docker-image: ${{ needs.select-env.outputs.image }}
+  #     pytest-marker: "training and l2 and singlebuffer"
   #
-  # Per-model skip_sim_in_ci gate (in test_siracusa_tiled_config.py) decides
-  # which fixtures actually run gvsoc on CI: CCT/CCT_LoRA do (16 KB working
-  # set, gvsoc fits comfortably); ResNet8/MobileNetV1 are --skipsim'd until
-  # the sim-side OOM at ~8 min is debugged or we move to a bigger runner.
+  # # Training tests - L3 singlebuffer
+  # siracusa-training-tiled-l3-singlebuffer:
+  #   needs: select-env
+  #   uses: ./.github/workflows/_runner-siracusa-tiled.yml
+  #   with:
+  #     runner: ${{ needs.select-env.outputs.runner }}
+  #     docker-image: ${{ needs.select-env.outputs.image }}
+  #     pytest-marker: "training and l3 and singlebuffer"
+
+  # Training tests - L3 untiled baseline.  Codegen post-process rewrites
+  # every L1-annotated buffer to FC L2 so cluster cores access kernel
+  # buffers via the fabric — "untiled, L2-resident working set" cycle
+  # semantics for all 4 L3 models.
   siracusa-training-tiled-l3-untiled:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml
index 7a4f415e..c1e7db5d 100644
--- a/.github/workflows/ci-platform-siracusa.yml
+++ b/.github/workflows/ci-platform-siracusa.yml
@@ -5,13 +5,10 @@
 ---
 name: CI • Siracusa
 
+# NOTE: Push / pull_request triggers temporarily disabled on this branch
+# so only the L3-untiled job runs while we collect cycle-count data.
+# Restore the push: / pull_request: blocks before merging.
 "on":
-  push:
-    branches:
-      - "**"
-    tags:
-      - "v*.*.*"
-  pull_request:
   workflow_dispatch:
     inputs:
       docker_image_deeploy:
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 4dd3fb4a..df2653a7 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -434,27 +434,29 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
                                             skipsim) -> None:
     """Untiled-L3 baseline.
 
-    Reuses the tiled codegen pipeline but inflates --l1 large enough that the
-    SBTiler picks single-tile-per-tensor schedules.  The deeploy_fake_l1 shim
-    (DEEPLOY_L1_AS_L2) redirects pi_cl_l1_malloc into an FC-L2 arena so the
-    oversized "L1" working buffer (>physical 256 KB) actually fits.
-
-    Per-model skip_sim_in_ci gate: large fixtures (ResNet8 / MobileNetV1)
-    skip the gvsoc sim on CI runners because two prior runs got SIGKILLed
-    at ~8 min during simulation.  Local runs (no `CI` env var) still run
-    the full pipeline so the user can verify losses manually.
+    SBTiler picks single-tile-per-tensor schedules (--l1 inflated above the
+    op working set so no spatial split happens).  The generated C is one
+    kernel call per op with integral L3↔L2 DMA wrappers.
+
+    To make the L1 staging buffer physically live in FC L2 (so cycles
+    represent "kernel actually accessing L2"), we post-process the
+    generated TrainingNetwork.c / OptimizerNetwork.c after codegen but
+    before cmake build:
+
+        pmsis_l1_malloc -> pi_l2_malloc
+        PI_L1           -> PI_L2
+
+    Every L1-annotated buffer ends up in FC L2.  Cluster cores access L2
+    via the fabric (~7x slower than real L1) — that's the deliberate
+    semantics of "untiled L2-resident".  No fake-L1 shim, no linker wrap,
+    no SDK pollution.
     """
+    from pathlib import Path
+
+    from testUtils.core.execution import build_binary, configure_cmake, generate_network, run_simulation
+
     fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
     overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
-    extra_cmake = list(cmake_args)
-    if fixture.get("needs_fake_l1", False):
-        # Only opt in when peak L1 working > physical L1 — the wrap also
-        # intercepts SDK-internal pi_cl_l1_malloc calls and starves the
-        # cluster on small models that don't need it.
-        extra_cmake += [
-            f"-DDEEPLOY_L1_AS_L2=ON",
-            f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}",
-        ]
     effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False))
     config = create_test_config(
         test_name = test_name,
@@ -463,7 +465,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         deeploy_test_dir = deeploy_test_dir,
         toolchain = toolchain,
         toolchain_dir = toolchain_dir,
-        cmake_args = extra_cmake,
+        cmake_args = cmake_args,
         tiling = True,
         cores = SIRACUSA_DEFAULT_CORES,
         l1 = fixture["l1"],
@@ -474,7 +476,24 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         training_num_data_inputs = overrides.get("num_data_inputs"),
         training_tolerance = overrides.get("tolerance"),
     )
-    run_and_assert_test(test_name, config, skipgen, effective_skipsim)
+
+    # Inline the test runner stages so we can sed between codegen and build.
+    generate_network(config, skip = skipgen)
+    for c_name in ("TrainingNetwork.c", "OptimizerNetwork.c"):
+        c_path = Path(config.gen_dir) / c_name
+        if not c_path.exists():
+            continue
+        text = c_path.read_text()
+        text = text.replace("pmsis_l1_malloc", "pi_l2_malloc")
+        text = text.replace("PI_L1 ", "PI_L2 ")
+        c_path.write_text(text)
+    configure_cmake(config)
+    build_binary(config)
+    result = run_simulation(config, skip = effective_skipsim)
+    assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of "
+                            f"{result.total_count}\nOutput:\n{result.stdout}")
+    if result.error_count >= 0:
+        assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")
 
 
 @pytest.mark.siracusa_tiled
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index 88117511..2dd8f2ba 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -190,37 +190,32 @@
 # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
 # Untiled-L3 baseline — single-tile-per-tensor schedules for every L3
 # training model so the user can read off "untiled L3 latency" alongside
-# the existing tiled-L3 cycles.  Each fixture goes through the same
-# SBTiler infrastructure as the L3 singlebuffer tests, but with --l1
-# inflated to the smallest value that still yields the minimal-tile
-# shape — at that point the generated C is one kernel call per op with
-# integral L3↔L2 DMA wrappers, no spatial split.
+# the existing tiled-L3 cycles.
 #
-# Two per-fixture knobs:
+# Each fixture goes through the same SBTiler infrastructure as the L3
+# singlebuffer tests, with --l1 inflated to the smallest value that
+# yields the minimal-tile shape (one kernel call per op + integral
+# L3↔L2 DMA, no spatial split).
 #
-#   - needs_fake_l1: True when peak L1 working > physical Siracusa L1
-#     (256 KB).  When True, build adds -DDEEPLOY_L1_AS_L2 + the linker
-#     wrap that serves oversized pi_cl_l1_malloc calls from a static
-#     FC-L2 arena (deeploy_fake_l1.c).  Small-working-set models (CCT /
-#     CCT_LoRA, ~16 KB peak) leave it False — they fit real L1 and
-#     produce byte-identical codegen to the tiled L3 entry, so their
-#     untiled cycles == tiled cycles by construction.
+# After codegen, the test post-processes TrainingNetwork.c /
+# OptimizerNetwork.c to swap pmsis_l1_malloc → pi_l2_malloc and
+# PI_L1 → PI_L2, so every L1-annotated buffer physically lives in
+# FC L2.  Cluster cores access these via the fabric (~7x slower than
+# real L1) — that's the deliberate semantics of "untiled L2-resident".
 #
-#   - skip_sim_in_ci: True for models where gvsoc has historically
-#     OOMed the runner during the long single-tile loop.  CI still
-#     verifies codegen + compile + link in that case; sim is a manual
-#     local exercise.
+# skip_sim_in_ci: True for fixtures where gvsoc has historically OOMed
+# during the long single-tile loop.  CI still verifies codegen +
+# compile + link in that case; sim is deferred to a manual local run
+# or a beefier runner.
 L3_UNTILED_TRAINING_MODELS = {
     "Models/Training/CCT/cct_train": {
         "l1": 64_000,
         "l2": 2_000_000,
-        "needs_fake_l1": False,  # peak L1 working = 16388 B fits real L1
         "skip_sim_in_ci": False,
     },
     "Models/Training/CCT_LoRA/cct_lora_train": {
         "l1": 64_000,
         "l2": 2_000_000,
-        "needs_fake_l1": False,  # peak L1 working = 16384 B
         "skip_sim_in_ci": False,
     },
     "Models/Training/ResNet8/resnet8_train": {
@@ -229,19 +224,11 @@
         # RAM appetite past CI's ceiling.
         "l1": 800_000,
         "l2": 2_000_000,
-        "fake_l1_size": 1_048_576,  # peak L1 working = 739328 B
-        "needs_fake_l1": True,
-        # Try sim again with the fixed shim (real-L1-first, fall back to
-        # FC-L2 arena only when L1 is exhausted).  Earlier runs OOMed at
-        # ~8 min — believed to be the broken shim looping cluster init,
-        # not a real gvsoc memory leak.
         "skip_sim_in_ci": False,
     },
     "Models/Training/MobileNetV1/mobilenetv1_train": {
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
-        "fake_l1_size": 786_432,  # peak L1 working = 542720 B
-        "needs_fake_l1": True,
         "skip_sim_in_ci": False,
     },
 }
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index 3ae97d91..ce39fea7 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -71,23 +71,6 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed)
 target_link_libraries(deeploypulp INTERFACE pulp-sdk)
 target_sources(deeploypulp INTERFACE $<TARGET_OBJECTS:pulp-sdk>)
 
-# Untiled-L3 baseline: redirect pi_cl_l1_malloc/free to a static L2 arena via
-# linker --wrap so cluster-L1 alloc requests up to DEEPLOY_FAKE_L1_SIZE bytes
-# (must be ≤ remaining FC L2 ~1.94 MB) succeed even though physical L1 is only
-# 256 KB. Source file deeploy_fake_l1.c is no-op when DEEPLOY_L1_AS_L2 is OFF.
-option(DEEPLOY_L1_AS_L2 "Redirect pi_cl_l1_malloc to a static FC-L2 arena (untiled-L3 baseline)" OFF)
-set(DEEPLOY_FAKE_L1_SIZE "1048576" CACHE STRING "Size in bytes of the fake-L1 arena placed in FC L2")
-if(DEEPLOY_L1_AS_L2)
-  target_compile_definitions(deeploypulp PRIVATE
-    DEEPLOY_L1_AS_L2
-    DEEPLOY_FAKE_L1_SIZE=${DEEPLOY_FAKE_L1_SIZE}
-  )
-  target_link_options(deeploypulp INTERFACE
-    "-Wl,--wrap=pi_cl_l1_malloc"
-    "-Wl,--wrap=pi_cl_l1_free"
-  )
-endif()
-
 set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka")
 if (platform IN_LIST PULP_NNX_PLATFORMS)
   if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
deleted file mode 100644
index e703a413..00000000
--- a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena
- * — but ONLY for allocations that don't fit in the real L1 heap.
- *
- * Activated by `-DDEEPLOY_L1_AS_L2 -DDEEPLOY_FAKE_L1_SIZE=N` plus the linker
- * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`.
- *
- * Coexistence with the SDK
- * ------------------------
- * The wrap intercepts EVERY pi_cl_l1_malloc call site, including SDK-internal
- * ones (cluster init, driver scratch, etc.). A first version that always
- * served from the static arena broke the cluster on small models — likely
- * because SDK code received a pointer outside the real L1 region and either
- * its own bookkeeping went wrong or a downstream API rejected it.
- *
- * Mitigation: try the SDK's real L1 allocator first via the linker's
- * `__real_*` symbols. If that succeeds, hand the SDK pointer back. Only when
- * the request is too big for real L1 (the case we're here for: an oversized
- * MEMORYARENA_L1) fall through to the static FC-L2 arena. Free mirrors the
- * decision by checking whether the pointer falls inside our arena.
- *
- * Trade-off: kernels that were tuned for L1's 1-cycle access see L2 latency
- * (~7 cycles) for the buffers served from the fake arena. Cycles under this
- * mode are NOT silicon-representative — the mode exists to give a per-op
- * single-tile latency baseline against the existing tiled-L3 path.
- */
-
-#include <stdint.h>
-
-#ifdef DEEPLOY_L1_AS_L2
-
-#ifndef DEEPLOY_FAKE_L1_SIZE
-#error "DEEPLOY_L1_AS_L2 requires -DDEEPLOY_FAKE_L1_SIZE=<bytes>"
-#endif
-
-#include "pmsis.h"
-
-/* Linker-provided originals (--wrap=foo exposes __real_foo). */
-extern void *__real_pi_cl_l1_malloc(struct pi_device *device, uint32_t size);
-extern void __real_pi_cl_l1_free(struct pi_device *device, void *chunk, int size);
-
-/* Static arena in FC L2 — sized at compile time. PI_L2 maps to the SDK's L2
- * shared region (0x1C010000-0x1C200000 on Siracusa, ~1.94 MB). Aligned to 8
- * to satisfy any kernel that reinterprets float32_t / pulpv2 v2f16 buffers. */
-__attribute__((aligned(8)))
-PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE];
-static uint32_t deeploy_fake_l1_offset = 0;
-
-static inline int in_fake_arena(const void *p) {
-  return (const uint8_t *)p >= deeploy_fake_l1_arena
-         && (const uint8_t *)p < deeploy_fake_l1_arena + DEEPLOY_FAKE_L1_SIZE;
-}
-
-void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) {
-  /* Try real L1 first — any small SDK / Deeploy alloc that fits stays in
-   * real L1, so SDK bookkeeping and L1-tuned kernels are unaffected. */
-  void *p = __real_pi_cl_l1_malloc(device, size);
-  if (p != (void *)0) {
-    return p;
-  }
-  /* Real L1 exhausted (or request bigger than L1 heap). Serve from FC-L2
-   * arena: the only legitimate caller here is Deeploy's MEMORYARENA_L1
-   * for a model whose peak L1 working set exceeds 256 KB. */
-  uint32_t aligned = (size + 7u) & ~7u;
-  if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) {
-    return (void *)0;
-  }
-  void *q = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset];
-  deeploy_fake_l1_offset += aligned;
-  return q;
-}
-
-void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) {
-  if (!in_fake_arena(chunk)) {
-    __real_pi_cl_l1_free(device, chunk, size);
-    return;
-  }
-  /* Bump-allocator rewind for arena pointers. Safe under LIFO free order
-   * (the only pattern Deeploy's harness uses); a non-LIFO free is a no-op
-   * and leaks until the next full reset. */
-  uint32_t aligned = ((uint32_t)size + 7u) & ~7u;
-  if (deeploy_fake_l1_offset >= aligned) {
-    uint8_t *expected = deeploy_fake_l1_arena + deeploy_fake_l1_offset - aligned;
-    if ((uint8_t *)chunk == expected) {
-      deeploy_fake_l1_offset -= aligned;
-    }
-  }
-}
-
-#endif /* DEEPLOY_L1_AS_L2 */

From 13741bf6d8ad123bb4860a6274401687f2e99247 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 21:41:06 +0000
Subject: [PATCH 11/18] fix(untiled): mchan_transfer_1d -> memcpy under
 DEEPLOY_L1_AS_L2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI run 25639948195 had all 4 L3-untiled fixtures FAILED with
"computed=0.0 ref=N.NN" + cluster L1 bank "out-of-bound request"
warnings.  Cause: PULP mchan DMA hardware ignores destination pointer
addresses and unconditionally routes the `loc` parameter into cluster
L1 banks.  Sed-rewriting buffers to FC L2 left the DMA calls intact,
so DMA wrote into L1 (out of bounds) while kernels read from L2 (empty).

Fix in TargetLibraries/PULPOpen/inc/mchan_v7.h: under DEEPLOY_L1_AS_L2,
mchan_transfer_1d is replaced with memcpy that respects the EXT2LOC /
LOC2EXT direction flag, and the channel API (alloc/wait/free/is_busy)
becomes a no-op.  Combined with the existing test-side sed, every
buffer + every staging copy now lives in / goes through FC L2 — the
"untiled L2-resident" semantic the user actually wanted.

CMake exposes the option; the L3-untiled pytest fixture passes
-DDEEPLOY_L1_AS_L2=ON automatically.

Only mchan_transfer_1d gets the memcpy fallback because that's the only
variant the 4 L3 training fixtures emit; the 2D variants stay on the
real DMA path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DeeployTest/test_platforms.py           |  5 +++-
 TargetLibraries/PULPOpen/CMakeLists.txt | 11 +++++++
 TargetLibraries/PULPOpen/inc/mchan_v7.h | 38 +++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index df2653a7..ed489569 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -458,6 +458,9 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
     fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
     overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
     effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False))
+    # DEEPLOY_L1_AS_L2 is what flips mchan_transfer_1d to memcpy in mchan_v7.h —
+    # mandatory partner of the codegen sed below.
+    extra_cmake = list(cmake_args) + ["-DDEEPLOY_L1_AS_L2=ON"]
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -465,7 +468,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         deeploy_test_dir = deeploy_test_dir,
         toolchain = toolchain,
         toolchain_dir = toolchain_dir,
-        cmake_args = cmake_args,
+        cmake_args = extra_cmake,
         tiling = True,
         cores = SIRACUSA_DEFAULT_CORES,
         l1 = fixture["l1"],
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index ce39fea7..0935b925 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -71,6 +71,17 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed)
 target_link_libraries(deeploypulp INTERFACE pulp-sdk)
 target_sources(deeploypulp INTERFACE $<TARGET_OBJECTS:pulp-sdk>)
 
+# Untiled-L3 baseline: when ON, mchan_transfer_1d() in mchan_v7.h is replaced
+# with a memcpy implementation so the deeploy-generated DMA calls become
+# regular memory copies between L2 buffers.  Used together with the test-side
+# sed that rewrites pmsis_l1_malloc -> pi_l2_malloc and PI_L1 -> PI_L2 in the
+# generated TrainingNetwork.c so every L1-annotated buffer physically lives
+# in FC L2.
+option(DEEPLOY_L1_AS_L2 "Replace mchan_transfer_1d with memcpy (untiled-L3 baseline)" OFF)
+if(DEEPLOY_L1_AS_L2)
+  target_compile_definitions(deeploypulp PUBLIC DEEPLOY_L1_AS_L2)
+endif()
+
 set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka")
 if (platform IN_LIST PULP_NNX_PLATFORMS)
   if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h
index 32ef836f..e8a0ea1f 100644
--- a/TargetLibraries/PULPOpen/inc/mchan_v7.h
+++ b/TargetLibraries/PULPOpen/inc/mchan_v7.h
@@ -47,6 +47,42 @@
 #define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5))
 #define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 6))
 
+/* Untiled-L3 baseline override: when DEEPLOY_L1_AS_L2 is defined the
+ * deeploy-generated code has been sed-rewritten so its "L1" pointers
+ * actually live in FC L2.  The mchan DMA hardware ignores destination
+ * pointer addresses and unconditionally routes the `loc` parameter into
+ * cluster L1 banks via the lower bits — so a real DMA call would write
+ * garbage to L1 and leave the L2 destination empty (which is exactly
+ * the bug we observed: out-of-bound L1-bank requests + computed=0.0).
+ *
+ * Replace mchan transfers with plain memcpy.  The channel API becomes a
+ * no-op: alloc returns 0, wait/free do nothing, is_busy reports idle.
+ * Only the 1D variant is provided — none of the L3 training fixtures
+ * emit 2D transfers; if a future model does, add the equivalent loop
+ * here. */
+#ifdef DEEPLOY_L1_AS_L2
+
+#include <string.h>
+
+static inline void mchan_transfer_1d(uint32_t cmd, void *loc, void *ext) {
+  uint32_t size = cmd & ((1u << MCHAN_TRANSFER_LEN_SIZE) - 1);
+  if (cmd & MCHAN_CMD_FLAG_DIRECTION_EXT2LOC) {
+    memcpy(loc, ext, size);
+  } else {
+    memcpy(ext, loc, size);
+  }
+}
+
+static inline uint32_t mchan_channel_alloc() { return 0; }
+static inline void mchan_channel_free(uint32_t channel_id) { (void)channel_id; }
+static inline uint32_t mchan_channel_is_busy(uint32_t channel_id) {
+  (void)channel_id;
+  return 0;
+}
+static inline void mchan_channel_wait(uint32_t channel_id) { (void)channel_id; }
+
+#else
+
 static volatile uint32_t *const cmd_ptr =
     (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x0);
 static volatile uint32_t *const status_ptr =
@@ -117,4 +153,6 @@ static void mchan_channel_wait(uint32_t channel_id) {
 #endif
 }
 
+#endif /* DEEPLOY_L1_AS_L2 */
+
 #endif // __MCHAN_V7_H__

From 734134f6da6e01e48a9d6b67dd5d9dc74fc76e04 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 22:01:10 +0000
Subject: [PATCH 12/18] fix(untiled): skip MobileNetV1 sim in CI (known FC LSU
 crash)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

3 of 4 L3-untiled fixtures produced clean cycle counts in run
25640520241 — CCT, CCT_LoRA, ResNet8.  MobileNetV1 sim crashed at
"update 1/4 accum 1/1 (mini-batch 0)" with:

    /chip/soc/fc/lsu] Invalid access (pc: 0x1c010034,
                       offset: 0xbf851e33, size: 0x1, is_write: 0)

The bad offset 0xbf851e33 is the float32 bit pattern of -1.039984,
which is testData_mb0_buf0[1] — i.e. some float value is being
dereferenced as a pointer.  Likely one of the FC-side helper macros
(l3_aware_copy, IS_L2, ram_write) loads a void* from a buffer that's
been overwritten with float data, but only MobileNet's specific L2
footprint triggers the misalignment.  Defer sim and ship the 3 working
fixtures; bisect the FC harness in a follow-up.
---
 DeeployTest/test_siracusa_tiled_config.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index 2dd8f2ba..3ef530c8 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -229,7 +229,16 @@
     "Models/Training/MobileNetV1/mobilenetv1_train": {
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
-        "skip_sim_in_ci": False,
+        # KNOWN ISSUE: sim crashes during update 1/4 with FC LSU
+        # "Invalid access (pc: 0x1c010034, offset: 0xbf851e33)" — the
+        # bad address 0xbf851e33 happens to be the float32 bit pattern
+        # of -1.039984, which is testData_mb0_buf0[1].  Signature of a
+        # float-value being dereferenced as a pointer somewhere in the
+        # FC harness, surfaced only by MobileNet's larger L2 footprint
+        # under the sed+memcpy untiled mode.  The other 3 L3 fixtures
+        # (CCT / CCT_LoRA / ResNet8) all produce clean cycle counts.
+        # Sim deferred until the root cause is bisected.
+        "skip_sim_in_ci": True,
     },
 }
 

From 23f1a67e800729cd65171d404e2e47a457d7a3f7 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 22:04:30 +0000
Subject: [PATCH 13/18] fix(untiled): cap MobileNet to 1-step/1-accum to fit FC
 L2 heap

CCT/CCT_LoRA/ResNet8 untiled L3 produced cycles cleanly in run
25640520241. MobileNetV1 sim crashed in update 1 with a float-as-pointer
deref. Hypothesis: testinputs.h's 4-batch data (~2.8 MB compiled into
the FC L2 .data section) plus the 1042 KB post-sed L1+L2 working buffer
exhausts the FC L2 heap (~1.94 MB usable), causing a downstream
pi_l2_malloc to land in invalid memory.

Capping MobileNet to n_steps=1, n_accum=1 shrinks testinputs.h ~4x and
should free enough heap for the post-sed dynamic alloc to succeed.
The per-step train_cycles measurement remains valid since the loop
work per step is identical.

Plumbed via two new optional fixture fields (n_steps, n_accum) that
turn into --n-steps / --n-accum gen_args. Other fixtures unaffected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DeeployTest/test_platforms.py             |  9 +++++++++
 DeeployTest/test_siracusa_tiled_config.py | 20 ++++++++++----------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index ed489569..71f993e9 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -461,6 +461,14 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
     # DEEPLOY_L1_AS_L2 is what flips mchan_transfer_1d to memcpy in mchan_v7.h —
     # mandatory partner of the codegen sed below.
     extra_cmake = list(cmake_args) + ["-DDEEPLOY_L1_AS_L2=ON"]
+    # Optional per-fixture training-step caps.  Some untiled-L3 models hit FC
+    # L2 heap limits when testinputs.h carries 4-batch data; capping reduces
+    # the .data footprint while keeping per-step cycle measurement valid.
+    extra_gen = []
+    if "n_steps" in fixture:
+        extra_gen.append(f"--n-steps={fixture['n_steps']}")
+    if "n_accum" in fixture:
+        extra_gen.append(f"--n-accum={fixture['n_accum']}")
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -478,6 +486,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         training = True,
         training_num_data_inputs = overrides.get("num_data_inputs"),
         training_tolerance = overrides.get("tolerance"),
+        gen_args = extra_gen,
     )
 
     # Inline the test runner stages so we can sed between codegen and build.
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index 3ef530c8..ab39185a 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -229,16 +229,16 @@
     "Models/Training/MobileNetV1/mobilenetv1_train": {
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
-        # KNOWN ISSUE: sim crashes during update 1/4 with FC LSU
-        # "Invalid access (pc: 0x1c010034, offset: 0xbf851e33)" — the
-        # bad address 0xbf851e33 happens to be the float32 bit pattern
-        # of -1.039984, which is testData_mb0_buf0[1].  Signature of a
-        # float-value being dereferenced as a pointer somewhere in the
-        # FC harness, surfaced only by MobileNet's larger L2 footprint
-        # under the sed+memcpy untiled mode.  The other 3 L3 fixtures
-        # (CCT / CCT_LoRA / ResNet8) all produce clean cycle counts.
-        # Sim deferred until the root cause is bisected.
-        "skip_sim_in_ci": True,
+        # MobileNet's 4-batch testinputs.h is ~2.8 MB of static .data — at
+        # the limit of FC L2 heap when combined with the post-sed L1+L2
+        # working buffer (1042 KB).  Cutting to 1 train step × 1 accum
+        # step shrinks testinputs.h ~4x and frees enough heap for the
+        # remaining pi_l2_malloc calls to land in valid memory.  Cycle
+        # numbers are still meaningful: per-step train cycles are what
+        # we want to compare against the tiled L3 baseline.
+        "n_steps": 1,
+        "n_accum": 1,
+        "skip_sim_in_ci": False,
     },
 }
 

From 13c4f674a2fa8d7bedf09d668908944d104270be Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 22:18:12 +0000
Subject: [PATCH 14/18] fix(untiled): force MobileNet num_data_inputs=1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CCT/CCT_LoRA pass with num_data_inputs=1 (from MODEL_OVERRIDES); MobileNet
auto-detects 2 and crashes.  Forcing 1 gets us a single-input training
step, comparable to the existing tiled L3 cycle baseline.

Adds fixture-level num_data_inputs that overrides the global MODEL_OVERRIDES
value — needed only for fixtures whose multi-input default surfaces a
codegen bug under the sed+memcpy untiled mode.
---
 DeeployTest/test_platforms.py             |  6 +++++-
 DeeployTest/test_siracusa_tiled_config.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 71f993e9..b814bb89 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -469,6 +469,10 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         extra_gen.append(f"--n-steps={fixture['n_steps']}")
     if "n_accum" in fixture:
         extra_gen.append(f"--n-accum={fixture['n_accum']}")
+    # Per-fixture num_data_inputs override (lets a fixture force the value
+    # the model overrides don't set globally — needed when a multi-input
+    # model triggers a code-path bug only with NUM_DATA_INPUTS > 1).
+    fixture_num_data = fixture.get("num_data_inputs", overrides.get("num_data_inputs"))
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -484,7 +488,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha
         default_mem_level = "L3",
         double_buffer = False,
         training = True,
-        training_num_data_inputs = overrides.get("num_data_inputs"),
+        training_num_data_inputs = fixture_num_data,
         training_tolerance = overrides.get("tolerance"),
         gen_args = extra_gen,
     )
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index ab39185a..4b9ccc33 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -229,15 +229,15 @@
     "Models/Training/MobileNetV1/mobilenetv1_train": {
         "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
         "l2": 2_000_000,
-        # MobileNet's 4-batch testinputs.h is ~2.8 MB of static .data — at
-        # the limit of FC L2 heap when combined with the post-sed L1+L2
-        # working buffer (1042 KB).  Cutting to 1 train step × 1 accum
-        # step shrinks testinputs.h ~4x and frees enough heap for the
-        # remaining pi_l2_malloc calls to land in valid memory.  Cycle
-        # numbers are still meaningful: per-step train cycles are what
-        # we want to compare against the tiled L3 baseline.
+        # Cap training schedule (testinputs.h shrinks ~4x) AND force 1
+        # data input.  CCT/CCT_LoRA's MODEL_OVERRIDES has num_data_inputs=1
+        # and they pass; MobileNet's default DATA_INPUTS=2 may surface a
+        # second-input handling bug that's masked when only one input is
+        # consumed.  A 1-step + 1-input run is still apples-to-apples for
+        # per-step train_cycles vs tiled L3.
         "n_steps": 1,
         "n_accum": 1,
+        "num_data_inputs": 1,
         "skip_sim_in_ci": False,
     },
 }

From ca3de155dd4b1e000708e06b00cc6a0fe075128a Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 23:16:55 +0000
Subject: [PATCH 15/18] fix(untiled): big CCT (img_size=32, emb_dim=128) needs
 --l1=800K
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CCT fixture now points at the big-CCT ONNX (devel #23 — 1.16 MB
inputs.npz, 4.66 MB L3 storage).  Old --l1=64K was sized for the toy
8x8 / dim=32 CCT (peak L1 = 16 KB) and produces 20 distinct tile shapes
on the new model.  --l1=800K is the smallest value that reaches the
near-untiled shape (3 tile shapes, peak L1 = 524 KB) — the values in
between (200K-400K) trip the SBTiler "Keys should be the same while
generating DMA transfer for tensor 'data_in'/'data_out'" assert.

Add the same n_steps=1 / n_accum=1 / num_data_inputs=1 caps as MobileNet
to keep testinputs.h's .data footprint inside the FC L2 heap.
---
 DeeployTest/test_siracusa_tiled_config.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index 4b9ccc33..b91deece 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -209,8 +209,18 @@
 # or a beefier runner.
 L3_UNTILED_TRAINING_MODELS = {
     "Models/Training/CCT/cct_train": {
-        "l1": 64_000,
+        # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak
+        # L1 working = 524 KB > physical L1 (256 KB).  Same regime as
+        # ResNet8/MobileNet now.  --l1=200K..400K trip a codegen assert
+        # ("Keys should be the same while generating DMA transfer for
+        # tensor 'data_in'/'data_out'"); 800K is the smallest value that
+        # gets through to a clean schedule.
+        "l1": 800_000,
         "l2": 2_000_000,
+        # Cap to 1 step so testinputs.h doesn't blow .data section.
+        "n_steps": 1,
+        "n_accum": 1,
+        "num_data_inputs": 1,
         "skip_sim_in_ci": False,
     },
     "Models/Training/CCT_LoRA/cct_lora_train": {

From 0e279464536aa01228395823babe7bbc6f533547 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 23:19:54 +0000
Subject: [PATCH 16/18] =?UTF-8?q?ci(untiled):=20isolate=20big-CCT=20only?=
 =?UTF-8?q?=20=E2=80=94=20disable=20other=203=20fixtures=20temporarily?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Comment out CCT_LoRA / ResNet8 / MobileNetV1 entries in
L3_UNTILED_TRAINING_MODELS so this CI run measures only the new big
CCT (img_size=32, embedding_dim=128) untiled cycle. Restore before
merging.
---
 DeeployTest/test_siracusa_tiled_config.py | 49 ++++++++++-------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index b91deece..ccc6cc90 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -223,33 +223,28 @@
         "num_data_inputs": 1,
         "skip_sim_in_ci": False,
     },
-    "Models/Training/CCT_LoRA/cct_lora_train": {
-        "l1": 64_000,
-        "l2": 2_000_000,
-        "skip_sim_in_ci": False,
-    },
-    "Models/Training/ResNet8/resnet8_train": {
-        # 800 KB is the smallest --l1 that yields the minimal-tile shape
-        # (peak L1 working = 739 KB).  Larger values inflate MiniMalloc's
-        # RAM appetite past CI's ceiling.
-        "l1": 800_000,
-        "l2": 2_000_000,
-        "skip_sim_in_ci": False,
-    },
-    "Models/Training/MobileNetV1/mobilenetv1_train": {
-        "l1": 800_000,  # below 800K codegen asserts on accum_buffer DMA
-        "l2": 2_000_000,
-        # Cap training schedule (testinputs.h shrinks ~4x) AND force 1
-        # data input.  CCT/CCT_LoRA's MODEL_OVERRIDES has num_data_inputs=1
-        # and they pass; MobileNet's default DATA_INPUTS=2 may surface a
-        # second-input handling bug that's masked when only one input is
-        # consumed.  A 1-step + 1-input run is still apples-to-apples for
-        # per-step train_cycles vs tiled L3.
-        "n_steps": 1,
-        "n_accum": 1,
-        "num_data_inputs": 1,
-        "skip_sim_in_ci": False,
-    },
+    # Other 3 fixtures (CCT_LoRA, ResNet8, MobileNetV1) temporarily
+    # disabled so this CI run isolates the big-CCT untiled measurement.
+    # Restore the entries below before merging.
+    #
+    # "Models/Training/CCT_LoRA/cct_lora_train": {
+    #     "l1": 64_000,
+    #     "l2": 2_000_000,
+    #     "skip_sim_in_ci": False,
+    # },
+    # "Models/Training/ResNet8/resnet8_train": {
+    #     "l1": 800_000,
+    #     "l2": 2_000_000,
+    #     "skip_sim_in_ci": False,
+    # },
+    # "Models/Training/MobileNetV1/mobilenetv1_train": {
+    #     "l1": 800_000,
+    #     "l2": 2_000_000,
+    #     "n_steps": 1,
+    #     "n_accum": 1,
+    #     "num_data_inputs": 1,
+    #     "skip_sim_in_ci": False,
+    # },
 }
 
 # Per-model overrides for training tests.

From 5c2ab9f4bcd4e014fefdbb70a7df0d185abbaa2a Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Sun, 10 May 2026 23:46:21 +0000
Subject: [PATCH 17/18] ci(untiled): re-enable L3-singlebuffer to capture
 big-CCT tiled cycle

Restores the siracusa-training-tiled-l3-singlebuffer job so we get a
fresh tiled measurement for the big CCT (img_size=32, embedding_dim=128)
that landed in devel #23.  L2 singlebuffer stays commented out (other
L2 numbers from the existing benchmark figure are still valid).
---
 .../workflows/ci-platform-siracusa-tiled.yml  | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index 5536fda5..f443d2b1 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -29,10 +29,9 @@ jobs:
     with:
       docker_image_deeploy: ${{ inputs.docker_image_deeploy }}
 
-  # NOTE: All other Siracusa-tiled jobs are temporarily disabled on this
-  # branch so the L3-untiled job runs in isolation while we collect
-  # cycle-count data. Restore the L2 / L3 singlebuffer entries below
-  # before merging.
+  # NOTE: L2 singlebuffer still commented out — only need fresh L3
+  # singlebuffer numbers for the big-CCT tiled cycle (other 3 already
+  # measured).  Restore the L2 entry below before merging.
   #
   # # Training tests - L2 singlebuffer
   # siracusa-training-tiled-l2-singlebuffer:
@@ -42,15 +41,15 @@ jobs:
   #     runner: ${{ needs.select-env.outputs.runner }}
   #     docker-image: ${{ needs.select-env.outputs.image }}
   #     pytest-marker: "training and l2 and singlebuffer"
-  #
-  # # Training tests - L3 singlebuffer
-  # siracusa-training-tiled-l3-singlebuffer:
-  #   needs: select-env
-  #   uses: ./.github/workflows/_runner-siracusa-tiled.yml
-  #   with:
-  #     runner: ${{ needs.select-env.outputs.runner }}
-  #     docker-image: ${{ needs.select-env.outputs.image }}
-  #     pytest-marker: "training and l3 and singlebuffer"
+
+  # Training tests - L3 singlebuffer (re-run to pick up big-CCT tiled).
+  siracusa-training-tiled-l3-singlebuffer:
+    needs: select-env
+    uses: ./.github/workflows/_runner-siracusa-tiled.yml
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-marker: "training and l3 and singlebuffer"
 
   # Training tests - L3 untiled baseline.  Codegen post-process rewrites
   # every L1-annotated buffer to FC L2 so cluster cores access kernel

From 7b4038d0fd36db1b35c822933bbf71a4c88d6357 Mon Sep 17 00:00:00 2001
From: runwangdl <samanthawangdl@gmail.com>
Date: Mon, 11 May 2026 00:14:06 +0000
Subject: [PATCH 18/18] =?UTF-8?q?ci(untiled):=20drop=20n=5Fsteps=3D1=20cap?=
 =?UTF-8?q?=20on=20CCT=20=E2=80=94=20use=20default=204-step=20schedule?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DeeployTest/test_siracusa_tiled_config.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index ccc6cc90..30511320 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -165,11 +165,13 @@
 
 # Training-enabled tiled models that need L3 spill (weights/activations don't
 # fit in L2). Same shape: test path -> list of L1 sizes (bytes).
+# TEMPORARY: only big-CCT enabled — the other 3 already have tiled cycle
+# data from earlier CI runs.  Restore the entries below before merging.
 L3_SINGLEBUFFER_TRAINING_MODELS = {
-    "Models/Training/ResNet8/resnet8_train": [128000],
-    "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
+    # "Models/Training/ResNet8/resnet8_train": [128000],
+    # "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
     "Models/Training/CCT/cct_train": [128000],
-    "Models/Training/CCT_LoRA/cct_lora_train": [128000],
+    # "Models/Training/CCT_LoRA/cct_lora_train": [128000],
 }
 
 # Untiled-L3 baseline.  Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but
@@ -210,16 +212,15 @@
 L3_UNTILED_TRAINING_MODELS = {
     "Models/Training/CCT/cct_train": {
         # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak
-        # L1 working = 524 KB > physical L1 (256 KB).  Same regime as
-        # ResNet8/MobileNet now.  --l1=200K..400K trip a codegen assert
-        # ("Keys should be the same while generating DMA transfer for
-        # tensor 'data_in'/'data_out'"); 800K is the smallest value that
-        # gets through to a clean schedule.
+        # L1 working = 524 KB > physical L1 (256 KB).  --l1=200K..400K
+        # trip a codegen assert ("Keys should be the same while generating
+        # DMA transfer for tensor 'data_in'/'data_out'"); 800K is the
+        # smallest value that gets through to a clean schedule.
         "l1": 800_000,
         "l2": 2_000_000,
-        # Cap to 1 step so testinputs.h doesn't blow .data section.
-        "n_steps": 1,
-        "n_accum": 1,
+        # Use the default training schedule (n_steps=4 / n_accum=1 from
+        # inputs.npz) so per-step cycles are computed the same way as the
+        # tiled L3 baseline (BENCH total / 4).
         "num_data_inputs": 1,
         "skip_sim_in_ci": False,
     },