runwangdl · runwangdl · May 10, 2026 · May 10, 2026 · May 10, 2026 · May 10, 2026
@@ -17,6 +17,10 @@ name: _runner-siracusa-tiled
       pytest-marker:
         required: true
         type: string
+      pytest-extra-args:
+        required: false
+        type: string
+        default: ""
 
 jobs:
   test-runner-siracusa-tiled:
@@ -36,5 +40,24 @@ jobs:
       - name: Run Test
         run: |
           cd DeeployTest
-          pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}"
+          # Memory snapshot helps diagnose 137/OOM kills postmortem.
+          echo "=== free -m before pytest ==="; free -m || true
+          # `shell: bash` runs this script with `-eo pipefail`, so a failing
+          # pytest (including a 137 OOM kill) would abort the script before
+          # the second snapshot.  Capture the status, take the snapshot, then
+          # propagate the original exit code so the step still fails.
+          rc=0
+          pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }} || rc=$?
+          echo "=== free -m after pytest ==="; free -m || true
+          exit "$rc"
         shell: bash
+      # Best-effort footprint summary: runs even when an earlier step failed,
+      # and a failure of the summary script itself is ignored (`|| true`).
+      - name: Build footprint summary
+        if: always()
+        shell: bash
+        working-directory: DeeployTest
+        env:
+          FOOTPRINT_SUMMARY_LABEL: ${{ inputs.pytest-marker }}
+        run: |
+          python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true
+      # Publish DeeployTest/out.txt regardless of the test outcome; the
+      # artifact is kept for 7 days and a missing file is not an error.
+      - name: Upload sim out.txt
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          path: DeeployTest/out.txt
+          name: sim-out-${{ inputs.pytest-marker }}
+          retention-days: 7
+          if-no-files-found: ignore
@@ -29,20 +29,36 @@ jobs:
     with:
       docker_image_deeploy: ${{ inputs.docker_image_deeploy }}
 
-  # Training tests - L2 singlebuffer
-  siracusa-training-tiled-l2-singlebuffer:
+  # NOTE: L2 singlebuffer still commented out — only need fresh L3
+  # singlebuffer numbers for the big-CCT tiled cycle (other 3 already
+  # measured).  Restore the L2 entry below before merging.
+  #
+  # # Training tests - L2 singlebuffer
+  # siracusa-training-tiled-l2-singlebuffer:
+  #   needs: select-env
+  #   uses: ./.github/workflows/_runner-siracusa-tiled.yml
+  #   with:
+  #     runner: ${{ needs.select-env.outputs.runner }}
+  #     docker-image: ${{ needs.select-env.outputs.image }}
+  #     pytest-marker: "training and l2 and singlebuffer"
+
+  # Training tests - L3 singlebuffer (re-run to pick up big-CCT tiled).
+  siracusa-training-tiled-l3-singlebuffer:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l2 and singlebuffer"
+      pytest-marker: "training and l3 and singlebuffer"
 
-  # Training tests - L3 singlebuffer (models that spill weights/activations to L3)
-  siracusa-training-tiled-l3-singlebuffer:
+  # Training tests - L3 untiled baseline.  Codegen post-process rewrites
+  # every L1-annotated buffer to FC L2 so cluster cores access kernel
+  # buffers via the fabric — "untiled, L2-resident working set" cycle
+  # semantics for all 4 L3 models.
+  siracusa-training-tiled-l3-untiled:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l3 and singlebuffer"
+      pytest-marker: "training and l3 and untiled"
@@ -5,13 +5,10 @@
 ---
 name: CI • Siracusa
 
+# NOTE: Push / pull_request triggers temporarily disabled on this branch
+# so only the L3-untiled job runs while we collect cycle-count data.
+# Restore the push: / pull_request: blocks before merging.
 "on":
-  push:
-    branches:
-      - "**"
-    tags:
-      - "v*.*.*"
-  pull_request:
   workflow_dispatch:
     inputs:
       docker_image_deeploy:

@@ -75,6 +75,8 @@ def pytest_configure(config: pytest.Config) -> None:
         "markers", "train_kernel: mark test as a training-related single-op kernel test (grad / loss / optimizer)")
     config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration")
     config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration")
+    config.addinivalue_line(
+        "markers", "untiled: mark test as untiled-baseline (single-tile-per-tensor schedule via fake-L1 shim)")
     config.addinivalue_line("markers", "l2: mark test as L2 default memory level")
     config.addinivalue_line("markers", "l3: mark test as L3 default memory level")
     config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory")

@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import os
+
 import pytest
 # Import platform-specific test configurations
 from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS
@@ -40,6 +42,7 @@
 from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS
 from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS
 from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS
+from test_siracusa_tiled_config import L3_UNTILED_TRAINING_MODELS as SIRACUSA_L3_UNTILED_TRAINING_MODELS
 from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES
 from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES
 from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS
@@ -330,6 +333,9 @@ def test_siracusa_train_kernels(test_name, deeploy_test_dir, toolchain, toolchai
 @pytest.mark.training
 @pytest.mark.parametrize("test_name", SIRACUSA_TRAINING_TESTS, ids = SIRACUSA_TRAINING_TESTS)
 def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None:
+    # Reuse the tiled overrides table — same models, same tolerance / data-input
+    # quirks regardless of whether tiling is on.
+    overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
     config = create_test_config(
         test_name = test_name,
         platform = "Siracusa",
@@ -341,6 +347,8 @@ def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir
         tiling = False,
         cores = SIRACUSA_DEFAULT_CORES,
         training = True,
+        training_num_data_inputs = overrides.get("num_data_inputs"),
+        training_tolerance = overrides.get("tolerance"),
     )
     run_and_assert_test(test_name, config, skipgen, skipsim)
 
@@ -413,6 +421,97 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir,
     run_and_assert_test(test_name, config, skipgen, skipsim)
 
 
+@pytest.mark.siracusa_tiled
+@pytest.mark.training
+@pytest.mark.untiled
+@pytest.mark.l3
+@pytest.mark.parametrize(
+    "test_name",
+    list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
+    ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
+)
+def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen,
+                                            skipsim) -> None:
+    """Untiled-L3 baseline.
+
+    SBTiler picks single-tile-per-tensor schedules (--l1 inflated above the
+    op working set so no spatial split happens).  The generated C is one
+    kernel call per op with integral L3↔L2 DMA wrappers.
+
+    To make the L1 staging buffer physically live in FC L2 (so cycles
+    represent "kernel actually accessing L2"), we post-process the
+    generated TrainingNetwork.c / OptimizerNetwork.c after codegen but
+    before cmake build:
+
+        pmsis_l1_malloc -> pi_l2_malloc
+        PI_L1           -> PI_L2
+
+    Every L1-annotated buffer ends up in FC L2.  Cluster cores access L2
+    via the fabric (~7x slower than real L1) — that's the deliberate
+    semantics of "untiled L2-resident".  No fake-L1 shim, no linker wrap,
+    no SDK pollution.
+    """
+    from pathlib import Path
+
+    from testUtils.core.execution import build_binary, configure_cmake, generate_network, run_simulation
+
+    fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
+    overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
+    effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False))
+    # DEEPLOY_L1_AS_L2 is what flips mchan_transfer_1d to memcpy in mchan_v7.h —
+    # mandatory partner of the codegen sed below.
+    extra_cmake = list(cmake_args) + ["-DDEEPLOY_L1_AS_L2=ON"]
+    # Optional per-fixture training-step caps.  Some untiled-L3 models hit FC
+    # L2 heap limits when testinputs.h carries 4-batch data; capping reduces
+    # the .data footprint while keeping per-step cycle measurement valid.
+    extra_gen = []
+    if "n_steps" in fixture:
+        extra_gen.append(f"--n-steps={fixture['n_steps']}")
+    if "n_accum" in fixture:
+        extra_gen.append(f"--n-accum={fixture['n_accum']}")
+    # Per-fixture num_data_inputs override (lets a fixture force the value
+    # the model overrides don't set globally — needed when a multi-input
+    # model triggers a code-path bug only with NUM_DATA_INPUTS > 1).
+    fixture_num_data = fixture.get("num_data_inputs", overrides.get("num_data_inputs"))
+    config = create_test_config(
+        test_name = test_name,
+        platform = "Siracusa",
+        simulator = "gvsoc",
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = extra_cmake,
+        tiling = True,
+        cores = SIRACUSA_DEFAULT_CORES,
+        l1 = fixture["l1"],
+        l2 = fixture["l2"],
+        default_mem_level = "L3",
+        double_buffer = False,
+        training = True,
+        training_num_data_inputs = fixture_num_data,
+        training_tolerance = overrides.get("tolerance"),
+        gen_args = extra_gen,
+    )
+
+    # Inline the test runner stages so we can sed between codegen and build.
+    generate_network(config, skip = skipgen)
+    for c_name in ("TrainingNetwork.c", "OptimizerNetwork.c"):
+        c_path = Path(config.gen_dir) / c_name
+        if not c_path.exists():
+            continue
+        text = c_path.read_text()
+        text = text.replace("pmsis_l1_malloc", "pi_l2_malloc")
+        text = text.replace("PI_L1 ", "PI_L2 ")
+        c_path.write_text(text)
+    configure_cmake(config)
+    build_binary(config)
+    result = run_simulation(config, skip = effective_skipsim)
+    assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of "
+                            f"{result.total_count}\nOutput:\n{result.stdout}")
+    if result.error_count >= 0:
+        assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")
+
+
 @pytest.mark.siracusa_tiled
 @pytest.mark.kernels
 @pytest.mark.singlebuffer

@@ -113,6 +113,18 @@
 # Training-enabled models (use deeployTrainingRunner / testMVPTraining pipeline).
 # Each entry is the path to a `<model>_train` directory; the matching
 # `<model>_optimizer` directory must live next to it.
+#
+# Untiled-L3 baseline scope:
+#   The untiled path emits one pi_l2_malloc per buffer; the SUM of these calls
+#   must fit in the Siracusa FC-L2 heap (~1 MB usable after BSS/stack). Models
+#   below have a verified untiled L2 footprint within that ceiling:
+#     - SimpleMLP        ~0.05 MB
+#     - CCT_LoRA         ~0.4  MB
+#     - CCT              ~0.7  MB
+#   ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) need an L2-heap override that
+#   lives in a separate task — they stay tiled-only for now.
 TRAINING_TESTS = [
     "Models/Training/SimpleMLP/simplemlp_train",
+    "Models/Training/CCT/cct_train",
+    "Models/Training/CCT_LoRA/cct_lora_train",
 ]
@@ -165,11 +165,87 @@
 
 # Training-enabled tiled models that need L3 spill (weights/activations don't
 # fit in L2). Same shape: test path -> list of L1 sizes (bytes).
+# TEMPORARY: only big-CCT enabled — the other 3 already have tiled cycle
+# data from earlier CI runs.  Restore the entries below before merging.
 L3_SINGLEBUFFER_TRAINING_MODELS = {
-    "Models/Training/ResNet8/resnet8_train": [128000],
-    "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
+    # "Models/Training/ResNet8/resnet8_train": [128000],
+    # "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
     "Models/Training/CCT/cct_train": [128000],
-    "Models/Training/CCT_LoRA/cct_lora_train": [128000],
+    # "Models/Training/CCT_LoRA/cct_lora_train": [128000],
+}
+
+# Untiled-L3 baseline.  Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but
+# the L1 budget is inflated so the SBTiler picks single-tile-per-tensor
+# schedules (numTiles == 1 on every dim) — semantically untiled per op, but
+# still uses the tile-codegen DMA wrappers because cluster cores cannot deref
+# HyperRAM directly. The L1 working buffer can end up larger than physical
+# Siracusa L1 (256 KB), so the test relocates every L1-annotated buffer to
+# FC L2 by rewriting the generated C sources and building with
+# -DDEEPLOY_L1_AS_L2=ON (no fake-L1 shim, no linker wrap).
+#
+# Maps test_name -> dict with:
+#   l1: planner-side L1 size (forces single-tile schedules)
+#   l2: planner-side L2 size
+#   n_steps / n_accum: optional caps, forwarded as --n-steps / --n-accum
+#   num_data_inputs: optional; takes precedence over TRAINING_MODEL_OVERRIDES
+#   skip_sim_in_ci: optional; True skips the simulation when CI=true
+#
+# Untiled-L3 baseline — single-tile-per-tensor schedules for every L3
+# training model so the user can read off "untiled L3 latency" alongside
+# the existing tiled-L3 cycles.
+#
+# Each fixture goes through the same SBTiler infrastructure as the L3
+# singlebuffer tests, with --l1 inflated to the smallest value that
+# yields the minimal-tile shape (one kernel call per op + integral
+# L3↔L2 DMA, no spatial split).
+#
+# After codegen, the test post-processes TrainingNetwork.c /
+# OptimizerNetwork.c to swap pmsis_l1_malloc → pi_l2_malloc and
+# PI_L1 → PI_L2, so every L1-annotated buffer physically lives in
+# FC L2.  Cluster cores access these via the fabric (~7x slower than
+# real L1) — that's the deliberate semantics of "untiled L2-resident".
+#
+# skip_sim_in_ci: True for fixtures where gvsoc has historically OOMed
+# during the long single-tile loop.  CI still verifies codegen +
+# compile + link in that case; sim is deferred to a manual local run
+# or a beefier runner.
+L3_UNTILED_TRAINING_MODELS = {
+    "Models/Training/CCT/cct_train": {
+        # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak
+        # L1 working = 524 KB > physical L1 (256 KB).  --l1=200K..400K
+        # trip a codegen assert ("Keys should be the same while generating
+        # DMA transfer for tensor 'data_in'/'data_out'"); 800K is the
+        # smallest value that gets through to a clean schedule.
+        "l1": 800_000,
+        "l2": 2_000_000,
+        # Use the default training schedule (n_steps=4 / n_accum=1 from
+        # inputs.npz) so per-step cycles are computed the same way as the
+        # tiled L3 baseline (BENCH total / 4).
+        "num_data_inputs": 1,
+        "skip_sim_in_ci": False,
+    },
+    # Other 3 fixtures (CCT_LoRA, ResNet8, MobileNetV1) temporarily
+    # disabled so this CI run isolates the big-CCT untiled measurement.
+    # Restore the entries below before merging.
+    #
+    # "Models/Training/CCT_LoRA/cct_lora_train": {
+    #     "l1": 64_000,
+    #     "l2": 2_000_000,
+    #     "skip_sim_in_ci": False,
+    # },
+    # "Models/Training/ResNet8/resnet8_train": {
+    #     "l1": 800_000,
+    #     "l2": 2_000_000,
+    #     "skip_sim_in_ci": False,
+    # },
+    # "Models/Training/MobileNetV1/mobilenetv1_train": {
+    #     "l1": 800_000,
+    #     "l2": 2_000_000,
+    #     "n_steps": 1,
+    #     "n_accum": 1,
+    #     "num_data_inputs": 1,
+    #     "skip_sim_in_ci": False,
+    # },
 }
 
 # Per-model overrides for training tests.

@@ -71,6 +71,17 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed)
 target_link_libraries(deeploypulp INTERFACE pulp-sdk)
 target_sources(deeploypulp INTERFACE $<TARGET_OBJECTS:pulp-sdk>)
 
+# Untiled-L3 baseline: when ON, mchan_transfer_1d() in mchan_v7.h is replaced
+# with a memcpy implementation so the deeploy-generated DMA calls become
+# regular memory copies between L2 buffers.  Used together with the test-side
+# sed that rewrites pmsis_l1_malloc -> pi_l2_malloc and PI_L1 -> PI_L2 in the
+# generated TrainingNetwork.c so every L1-annotated buffer physically lives
+# in FC L2.
+option(DEEPLOY_L1_AS_L2 "Replace mchan_transfer_1d with memcpy (untiled-L3 baseline)" OFF)
+if(DEEPLOY_L1_AS_L2)
+  target_compile_definitions(deeploypulp PUBLIC DEEPLOY_L1_AS_L2)
+endif()
+
 set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka")
 if (platform IN_LIST PULP_NNX_PLATFORMS)
   if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")