From 525d08a6e32a5cb9f3e6a22222e146b638a96fd5 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 17:12:11 +0000 Subject: [PATCH 01/18] feat(training): add CCT + CCT_LoRA to non-tiled siracusa training tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Untiled-L3 baseline, Stage 1 of 3. CCT and CCT_LoRA emit ~0.7 MB and ~0.4 MB of pi_l2_malloc respectively, both well within the Siracusa FC-L2 heap, so the non-tiled training path runs them as-is — no codegen / runtime changes needed. Local codegen + compile + link verified on the feat/untiling worktree. Reuses SIRACUSA_TRAINING_MODEL_OVERRIDES from the tiled config so CCT gets its existing tolerance bump (5e-3) and num_data_inputs=1 quirks in the untiled run too. ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) exceed the FC-L2 heap and need an L2-heap override (Stage 2/3) — they remain tiled-only. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_platforms.py | 5 +++++ DeeployTest/test_siracusa_config.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 7eee2085..f3577747 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -330,6 +330,9 @@ def test_siracusa_train_kernels(test_name, deeploy_test_dir, toolchain, toolchai @pytest.mark.training @pytest.mark.parametrize("test_name", SIRACUSA_TRAINING_TESTS, ids = SIRACUSA_TRAINING_TESTS) def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + # Reuse the tiled overrides table — same models, same tolerance / data-input + # quirks regardless of whether tiling is on. + overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) config = create_test_config( test_name = test_name, platform = "Siracusa", @@ -341,6 +344,8 @@ def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir tiling = False, cores = SIRACUSA_DEFAULT_CORES, training = True, + training_num_data_inputs = overrides.get("num_data_inputs"), + training_tolerance = overrides.get("tolerance"), ) run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py index d2f25708..5f839a89 100644 --- a/DeeployTest/test_siracusa_config.py +++ b/DeeployTest/test_siracusa_config.py @@ -113,6 +113,18 @@ # Training-enabled models (use deeployTrainingRunner / testMVPTraining pipeline). # Each entry is the path to a `_train` directory; the matching # `_optimizer` directory must live next to it. +# +# Untiled-L3 baseline scope: +# The untiled path emits one pi_l2_malloc per buffer; the SUM of these calls +# must fit in the Siracusa FC-L2 heap (~1 MB usable after BSS/stack). Models +# below have a verified untiled L2 footprint within that ceiling: +# - SimpleMLP ~0.05 MB +# - CCT_LoRA ~0.4 MB +# - CCT ~0.7 MB +# ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) need an L2-heap override that +# lives in a separate task — they stay tiled-only for now. TRAINING_TESTS = [ "Models/Training/SimpleMLP/simplemlp_train", + "Models/Training/CCT/cct_train", + "Models/Training/CCT_LoRA/cct_lora_train", ] From 11d7aa9bc908dc2aab8c76ec77129396ffbfc85d Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 18:20:12 +0000 Subject: [PATCH 02/18] feat(training): add L3 untiled baseline via fake-L1 shim + ResNet8 fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Untiled-L3 baseline, Stage 2 of 3. # Approach PULP cluster cores cannot dereference HyperRAM addresses, so a literal "untiled, all-in-L3" run is physically impossible — the kernel would fault. The closest legitimate baseline is single-tile-per-tensor: every op runs on its full tensor in one kernel invocation, but the L3↔L2 DMA wrappers stay because they're the only way data reaches the cluster. The existing SBTiler already produces that schedule when --l1 is large enough that no constraint forces a split. Local spike on ResNet8 with --l1=4_000_000 confirmed numTiles == 1 on every tile dim and produced: MEMORYARENA_L1 = pmsis_l1_malloc(739328) MEMORYARENA_L2 = pi_l2_malloc(294916) MEMORYARENA_L3 = cl_ram_malloc(1588440) # Blocker addressed by the shim 739 KB > physical Siracusa L1 (256 KB), so pmsis_l1_malloc would return NULL at runtime. deeploy_fake_l1.c provides __wrap_pi_cl_l1_malloc (activated by -DDEEPLOY_L1_AS_L2 + linker --wrap) that allocates from a static PI_L2 arena sized via DEEPLOY_FAKE_L1_SIZE. Generated code is unchanged — codegen still emits pmsis_l1_malloc, the wrap intercepts. Linker symbol audit confirms __wrap_pi_cl_l1_malloc replaces SDK's strong symbol cleanly. Trade-off (documented in the .c file): kernels see L2 latency instead of L1, so cycles under this mode are NOT silicon-representative — the mode is a *correctness* baseline, not a perf one. # Scope ResNet8 ships first (fastest L3 model to validate). MobileNetV1 and CCT/CCT_LoRA are pending; each needs its own fake_l1_size spike before adding to L3_UNTILED_TRAINING_MODELS. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/ci-platform-siracusa-tiled.yml | 10 +++ DeeployTest/conftest.py | 2 + DeeployTest/test_platforms.py | 46 ++++++++++++ DeeployTest/test_siracusa_tiled_config.py | 27 +++++++ TargetLibraries/PULPOpen/CMakeLists.txt | 17 +++++ .../PULPOpen/src/deeploy_fake_l1.c | 71 +++++++++++++++++++ 6 files changed, 173 insertions(+) create mode 100644 TargetLibraries/PULPOpen/src/deeploy_fake_l1.c diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index b65cbb75..00ee3ded 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -46,3 +46,13 @@ jobs: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} pytest-marker: "training and l3 and singlebuffer" + + # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1 + # shim to back the oversized L1 working buffer with FC L2) + siracusa-training-tiled-l3-untiled: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "training and l3 and untiled" diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index f29891bf..b13a4cfe 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -75,6 +75,8 @@ def pytest_configure(config: pytest.Config) -> None: "markers", "train_kernel: mark test as a training-related single-op kernel test (grad / loss / optimizer)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration") + config.addinivalue_line( + "markers", "untiled: mark test as untiled-baseline (single-tile-per-tensor schedule via fake-L1 shim)") config.addinivalue_line("markers", "l2: mark test as L2 default memory level") config.addinivalue_line("markers", "l3: mark test as L3 default memory level") config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index f3577747..4492ae2b 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -40,6 +40,7 @@ from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS +from test_siracusa_tiled_config import L3_UNTILED_TRAINING_MODELS as SIRACUSA_L3_UNTILED_TRAINING_MODELS from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS @@ -418,6 +419,51 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir, run_and_assert_test(test_name, config, skipgen, skipsim) +@pytest.mark.siracusa_tiled +@pytest.mark.training +@pytest.mark.untiled +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_name", + list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()), + ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()), +) +def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + """Untiled-L3 baseline. + + Reuses the tiled codegen pipeline but inflates --l1 large enough that the + SBTiler picks single-tile-per-tensor schedules. The deeploy_fake_l1 shim + (DEEPLOY_L1_AS_L2) redirects pi_cl_l1_malloc into an FC-L2 arena so the + oversized "L1" working buffer (>physical 256 KB) actually fits. + """ + fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name] + overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) + extra_cmake = list(cmake_args) + [ + f"-DDEEPLOY_L1_AS_L2=ON", + f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}", + ] + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = extra_cmake, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = fixture["l1"], + l2 = fixture["l2"], + default_mem_level = "L3", + double_buffer = False, + training = True, + training_num_data_inputs = overrides.get("num_data_inputs"), + training_tolerance = overrides.get("tolerance"), + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + @pytest.mark.siracusa_tiled @pytest.mark.kernels @pytest.mark.singlebuffer diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index bafa6635..a5409540 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -172,6 +172,33 @@ "Models/Training/CCT_LoRA/cct_lora_train": [128000], } +# Untiled-L3 baseline. Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but +# the L1 budget is inflated so the SBTiler picks single-tile-per-tensor +# schedules (numTiles == 1 on every dim) — semantically untiled per op, but +# still uses the tile-codegen DMA wrappers because cluster cores cannot deref +# HyperRAM directly. The L1 working buffer ends up larger than physical +# Siracusa L1 (256 KB), so the deeploy_fake_l1 shim redirects pi_cl_l1_malloc +# into an FC-L2 arena via -Wl,--wrap; size cap = DEEPLOY_FAKE_L1_SIZE (set +# per-fixture below to fit the model's peak L1 working set with headroom). +# +# Maps test_name -> dict with: +# l1: planner-side L1 size (forces single-tile schedules) +# l2: planner-side L2 size +# fake_l1_size: physical bytes for the FC-L2 arena backing pi_cl_l1_malloc +# +# fake_l1_size baselining method: spike with --l1=4_000_000 → read off +# MEMORYARENA_L1 size from generated TrainingNetwork.c → round up. +L3_UNTILED_TRAINING_MODELS = { + "Models/Training/ResNet8/resnet8_train": { + "l1": 4_000_000, + "l2": 2_000_000, + "fake_l1_size": 1_048_576, # spike measured 739 KB; 1 MB headroom + }, + # MobileNetV1 / CCT / CCT_LoRA fixtures pending: ResNet8 ships first as + # the fastest-to-validate L3 baseline. Each new entry needs an explicit + # spike to size fake_l1_size before adding here. +} + # Per-model overrides for training tests. # # - num_data_inputs: required when inputs.npz has only one mini-batch (no diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index ce39fea7..3ae97d91 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -71,6 +71,23 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed) target_link_libraries(deeploypulp INTERFACE pulp-sdk) target_sources(deeploypulp INTERFACE $) +# Untiled-L3 baseline: redirect pi_cl_l1_malloc/free to a static L2 arena via +# linker --wrap so cluster-L1 alloc requests up to DEEPLOY_FAKE_L1_SIZE bytes +# (must be ≤ remaining FC L2 ~1.94 MB) succeed even though physical L1 is only +# 256 KB. Source file deeploy_fake_l1.c is no-op when DEEPLOY_L1_AS_L2 is OFF. +option(DEEPLOY_L1_AS_L2 "Redirect pi_cl_l1_malloc to a static FC-L2 arena (untiled-L3 baseline)" OFF) +set(DEEPLOY_FAKE_L1_SIZE "1048576" CACHE STRING "Size in bytes of the fake-L1 arena placed in FC L2") +if(DEEPLOY_L1_AS_L2) + target_compile_definitions(deeploypulp PRIVATE + DEEPLOY_L1_AS_L2 + DEEPLOY_FAKE_L1_SIZE=${DEEPLOY_FAKE_L1_SIZE} + ) + target_link_options(deeploypulp INTERFACE + "-Wl,--wrap=pi_cl_l1_malloc" + "-Wl,--wrap=pi_cl_l1_free" + ) +endif() + set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka") if (platform IN_LIST PULP_NNX_PLATFORMS) if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c new file mode 100644 index 00000000..6c6964f9 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c @@ -0,0 +1,71 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + * + * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena. + * + * Activated by `-DDEEPLOY_L1_AS_L2 -DDEEPLOY_FAKE_L1_SIZE=N` plus the linker + * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`. The wrap pair + * intercepts the SDK's strong symbols so the deeploy-generated code keeps + * calling pmsis_l1_malloc / pi_cl_l1_malloc as if it were targeting cluster + * L1 — physically the bytes live in the FC L2 region instead, which on + * Siracusa has ~1.94 MB of headroom (vs. ~256 KB for real cluster L1). + * + * Trade-off: kernels that were tuned for L1's 1-cycle access see L2 latency + * (~7 cycles), so cycle counts under this mode are NOT representative of + * silicon. The mode exists to provide a single-tile-per-tensor untiled-L3 + * baseline for *correctness* comparison against the tiled L3 path; cycle + * realism for the same workload still requires the tiled run. + */ + +#include + +#ifdef DEEPLOY_L1_AS_L2 + +#ifndef DEEPLOY_FAKE_L1_SIZE +#error "DEEPLOY_L1_AS_L2 requires -DDEEPLOY_FAKE_L1_SIZE=" +#endif + +#include "pmsis.h" + +/* Static arena in FC L2 — sized at compile time. PI_L2 maps to the SDK's L2 + * shared region (0x1C010000-0x1C200000 on Siracusa, ~1.94 MB). Aligned to 8 + * to satisfy any kernel that reinterprets float32_t / pulpv2 v2f16 buffers. */ +__attribute__((aligned(8))) +PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE]; + +/* Bump pointer + free-list. The deeploy-generated harness allocates the L1 + * arena once at InitTrainingNetwork() time and frees it once at teardown, + * so we don't need a real heap — a bump allocator with a single rewind on + * full-arena free is sufficient and cheap. + * + * If the harness pattern ever changes (e.g. fine-grained per-op alloc/free), + * swap this for an extern_alloc_t pool the way dory_mem.c does for L3. */ +static uint32_t deeploy_fake_l1_offset = 0; + +void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) { + (void)device; + /* 8-byte alignment for every allocation so consecutive callers stay + * aligned even when `size` is not a multiple of 8. */ + uint32_t aligned = (size + 7u) & ~7u; + if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) { + return (void *)0; + } + void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset]; + deeploy_fake_l1_offset += aligned; + return p; +} + +void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) { + (void)device; + (void)chunk; + (void)size; + /* Bump-allocator semantics: per-block free is a no-op. The harness + * frees the whole arena at teardown; we rewind there. */ + if (deeploy_fake_l1_offset >= (uint32_t)size) { + deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u; + } +} + +#endif /* DEEPLOY_L1_AS_L2 */ From 8f003bcdb514218fb39801d37b8855612effaad2 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 18:37:39 +0000 Subject: [PATCH 03/18] style: pre-commit fixes (yapf line wrap + clang-format 2-space indent) CI Lint surfaced two formatting nits from #21: - test_platforms.py: yapf wants `skipgen,` on the first line of the new test_siracusa_tiled_training_l3_untiled signature - deeploy_fake_l1.c: clang-format style is 2-space, not 4-space No semantic change. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_platforms.py | 4 +-- .../PULPOpen/src/deeploy_fake_l1.c | 36 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 4492ae2b..adc56743 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -428,8 +428,8 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir, list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()), ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()), ) -def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, - skipgen, skipsim) -> None: +def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: """Untiled-L3 baseline. Reuses the tiled codegen pipeline but inflates --l1 large enough that the diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c index 6c6964f9..db1c8b67 100644 --- a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c +++ b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c @@ -45,27 +45,27 @@ PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE]; static uint32_t deeploy_fake_l1_offset = 0; void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) { - (void)device; - /* 8-byte alignment for every allocation so consecutive callers stay - * aligned even when `size` is not a multiple of 8. */ - uint32_t aligned = (size + 7u) & ~7u; - if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) { - return (void *)0; - } - void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset]; - deeploy_fake_l1_offset += aligned; - return p; + (void)device; + /* 8-byte alignment for every allocation so consecutive callers stay + * aligned even when `size` is not a multiple of 8. */ + uint32_t aligned = (size + 7u) & ~7u; + if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) { + return (void *)0; + } + void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset]; + deeploy_fake_l1_offset += aligned; + return p; } void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) { - (void)device; - (void)chunk; - (void)size; - /* Bump-allocator semantics: per-block free is a no-op. The harness - * frees the whole arena at teardown; we rewind there. */ - if (deeploy_fake_l1_offset >= (uint32_t)size) { - deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u; - } + (void)device; + (void)chunk; + (void)size; + /* Bump-allocator semantics: per-block free is a no-op. The harness + * frees the whole arena at teardown; we rewind there. */ + if (deeploy_fake_l1_offset >= (uint32_t)size) { + deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u; + } } #endif /* DEEPLOY_L1_AS_L2 */ From e9445f1db18aef10388231f7cac2b985170aca12 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 18:50:43 +0000 Subject: [PATCH 04/18] fix(ci): drop ResNet8 untiled --l1 to 800 KB to fit ubuntu-latest 7 GB RAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI ran out of memory (exit 137) on the new test_siracusa_tiled_training_l3_untiled job. The MiniMalloc constraint solver's RAM appetite scales with the L1 size — 4 MB blew past ubuntu-latest's 7 GB ceiling. Spike confirmed --l1=800 KB produces the *same* tile shapes as --l1=4 MB (numTiles arrays are byte-identical): everything single-tile except node_31_fc_Gemm_GradReduceSum_3_ReduceSum_backward, which has an intrinsic 10-tile reduction independent of L1 budget. The peak L1 working set is 739 KB regardless, so 800 KB is the smallest --l1 that still gives the minimal-tile schedule. fake_l1_size unchanged at 1 MB. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_siracusa_tiled_config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a5409540..9bde2520 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -190,7 +190,12 @@ # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up. L3_UNTILED_TRAINING_MODELS = { "Models/Training/ResNet8/resnet8_train": { - "l1": 4_000_000, + # 800 KB is the smallest --l1 that still yields the minimal-tile + # schedule (peak L1 working set = 739 KB). Anything between 800 KB + # and 4 MB produces identical numTiles arrays — we use the smallest + # value because the SBTiler's constraint solver (MiniMalloc) burns + # ubuntu-latest's 7 GB RAM at --l1=4 MB. + "l1": 800_000, "l2": 2_000_000, "fake_l1_size": 1_048_576, # spike measured 739 KB; 1 MB headroom }, From 5ce11e59eee707e036b482847886b8cd0617feb7 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 19:10:36 +0000 Subject: [PATCH 05/18] ci: --skipsim diagnostic for L3-untiled job + memory snapshot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two CI runs at --l1=4 MB then --l1=800 KB both got SIGKILLed (exit 137) on ubuntu-latest after ~8 min of silent execution. To bisect compile vs sim, run the new L3-untiled job with --skipsim — if it passes, OOM is in gvsoc; if it still fails, OOM is in clang compilation of the single-tile-per-tensor TrainingNetwork.c. Adds a generic pytest-extra-args input to _runner-siracusa-tiled.yml plus a `free -m` snapshot before pytest for postmortem visibility. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/_runner-siracusa-tiled.yml | 9 ++++++++- .github/workflows/ci-platform-siracusa-tiled.yml | 10 +++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index cc09f234..e1cecb44 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -17,6 +17,10 @@ name: _runner-siracusa-tiled pytest-marker: required: true type: string + pytest-extra-args: + required: false + type: string + default: "" jobs: test-runner-siracusa-tiled: @@ -36,5 +40,8 @@ jobs: - name: Run Test run: | cd DeeployTest - pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" + # Memory snapshot helps diagnose 137/OOM kills postmortem. + echo "=== free -m before pytest ==="; free -m || true + pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }} + echo "=== free -m after pytest ==="; free -m || true shell: bash diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index 00ee3ded..d58ea711 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -48,7 +48,14 @@ jobs: pytest-marker: "training and l3 and singlebuffer" # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1 - # shim to back the oversized L1 working buffer with FC L2) + # shim to back the oversized L1 working buffer with FC L2). + # + # --skipsim is intentional here: previous runs got SIGKILLed (exit 137) on + # ubuntu-latest's 7 GB RAM after ~8 minutes of silent execution, and we need + # to know whether the OOM is in compile (kernel codegen + clang) or in + # gvsoc. --skipsim verifies the codegen + compile path and the fake-L1 + # shim's link integrity; sim verification is deferred until we either move + # to a beefier runner or shrink the per-step memory peak. siracusa-training-tiled-l3-untiled: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml @@ -56,3 +63,4 @@ jobs: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} pytest-marker: "training and l3 and untiled" + pytest-extra-args: "--skipsim" From f9d2e9e9ce37f78b741a74e7c91de3472aa5219a Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 19:26:59 +0000 Subject: [PATCH 06/18] feat(training): CCT/CCT_LoRA/MobileNetV1 untiled L3 + CI footprint summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the remaining 3 untiled-L3 fixtures, completing the matrix: | Fixture | --l1 | fake_l1_size | peak L1 working | |----------------|-----:|-------------:|----------------:| | CCT | 64K | 32K | 16K | | CCT_LoRA | 64K | 32K | 16K | | ResNet8 | 800K | 1024K | 722K | | MobileNetV1 | 800K | 768K | 530K | Each --l1 was bisected to the smallest value that yields the minimal-tile schedule. MobileNet specifically asserts in the codegen below 800K (`Keys should be the same while generating DMA transfer for tensor 'accum_buffer'`) so 800K is a hard floor, not a tunable. Also adds scripts/ci_footprint_summary.py — a small build-time summary that walks every TrainingNetwork.c under TEST_SIRACUSA and writes a per- fixture table of MEMORYARENA_L1/L2/L3 sizes plus distinct numTiles shapes to GITHUB_STEP_SUMMARY. Wired into _runner-siracusa-tiled.yml with `if: always()` so the table appears even when pytest fails. This is a build-time stand-in for the cycle comparison the user asked for; real cycle counts need gvsoc sim, which is currently --skipsim'd for the L3-untiled job because of the unresolved sim-side OOM. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/_runner-siracusa-tiled.yml | 8 ++ DeeployTest/test_siracusa_tiled_config.py | 38 +++++++-- scripts/ci_footprint_summary.py | 89 ++++++++++++++++++++ 3 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 scripts/ci_footprint_summary.py diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index e1cecb44..15fd8041 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -45,3 +45,11 @@ jobs: pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }} echo "=== free -m after pytest ==="; free -m || true shell: bash + - name: Build footprint summary + if: always() + env: + FOOTPRINT_SUMMARY_LABEL: ${{ inputs.pytest-marker }} + run: | + cd DeeployTest + python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true + shell: bash diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 9bde2520..db27f791 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -189,19 +189,39 @@ # fake_l1_size baselining method: spike with --l1=4_000_000 → read off # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up. L3_UNTILED_TRAINING_MODELS = { + # Per-model l1 / l2 / fake_l1_size were established by spiking + # testMVPTraining.py with --defaultMemLevel=L3 and reading + # MEMORYARENA_L1 from the generated TrainingNetwork.c. + # + # - l1: planner-side budget passed to SBTiler (forces single-tile + # schedules when generous enough). Use the smallest value that + # still compiles and yields the minimal-tile shape — larger values + # blow MiniMalloc's RAM appetite past CI's 16 GB ceiling. + # - l2: planner-side L2 budget; 2 MB matches the existing tiled L3 + # baseline. + # - fake_l1_size: physical bytes for the FC-L2-backed pi_cl_l1_malloc + # arena (deeploy_fake_l1.c). Must be ≥ MEMORYARENA_L1, with a + # small headroom for alignment. + "Models/Training/CCT/cct_train": { + "l1": 64_000, + "l2": 2_000_000, + "fake_l1_size": 32_768, # peak L1 working = 16388 B + }, + "Models/Training/CCT_LoRA/cct_lora_train": { + "l1": 64_000, + "l2": 2_000_000, + "fake_l1_size": 32_768, # peak L1 working = 16384 B + }, "Models/Training/ResNet8/resnet8_train": { - # 800 KB is the smallest --l1 that still yields the minimal-tile - # schedule (peak L1 working set = 739 KB). Anything between 800 KB - # and 4 MB produces identical numTiles arrays — we use the smallest - # value because the SBTiler's constraint solver (MiniMalloc) burns - # ubuntu-latest's 7 GB RAM at --l1=4 MB. "l1": 800_000, "l2": 2_000_000, - "fake_l1_size": 1_048_576, # spike measured 739 KB; 1 MB headroom + "fake_l1_size": 1_048_576, # peak L1 working = 739328 B + }, + "Models/Training/MobileNetV1/mobilenetv1_train": { + "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA + "l2": 2_000_000, + "fake_l1_size": 786_432, # peak L1 working = 542720 B }, - # MobileNetV1 / CCT / CCT_LoRA fixtures pending: ResNet8 ships first as - # the fastest-to-validate L3 baseline. Each new entry needs an explicit - # spike to size fake_l1_size before adding here. } # Per-model overrides for training tests. diff --git a/scripts/ci_footprint_summary.py b/scripts/ci_footprint_summary.py new file mode 100644 index 00000000..989a92de --- /dev/null +++ b/scripts/ci_footprint_summary.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# SPDX-License-Identifier: Apache-2.0 +""" +Emit a per-test footprint summary to GITHUB_STEP_SUMMARY. + +Walks `DeeployTest/TEST_SIRACUSA/` for generated `TrainingNetwork.c` files +and reports, per fixture: MEMORYARENA_L1/L2/L3 sizes (peak working sets + +L3 storage) and the number of distinct numTiles shapes. + +The numbers come from grepping the generated C — they're a build-time +proxy for "how much memory pressure does this configuration put on the +target". This is the closest stand-in for the cycle comparison the user +wants until the L3-untiled sim OOM is debugged and we can collect real +gvsoc cycle counts. + +Used in the siracusa-tiled CI workflow. Safe to run with no matching +files (just emits an empty summary). +""" + +import os +import re +import sys +from pathlib import Path + +ARENA_RE = re.compile(r"MEMORYARENA_(L1|L2|L3)\s*=.*\*\s*(\d+)") +TILES_RE = re.compile(r"numTiles\[\d+\]\s*=\s*\{[^}]+\}") + + +def parse_one(c_path: Path) -> dict: + arenas = {"L1": 0, "L2": 0, "L3": 0} + tile_shapes = set() + for line in c_path.read_text(errors="replace").splitlines(): + m = ARENA_RE.search(line) + if m: + arenas[m.group(1)] = max(arenas[m.group(1)], int(m.group(2))) + for t in TILES_RE.findall(line): + tile_shapes.add(t) + return {"arenas": arenas, "tile_shapes": len(tile_shapes)} + + +def fmt_kb(n: int) -> str: + if n == 0: + return "—" + return f"{n / 1024:.1f} KB" + + +def main() -> int: + test_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("DeeployTest/TEST_SIRACUSA") + if not test_root.is_dir(): + print(f"[footprint-summary] {test_root} not a directory; skipping", file=sys.stderr) + return 0 + + rows = [] + for c_path in sorted(test_root.rglob("TrainingNetwork.c")): + rel = c_path.relative_to(test_root).parent + info = parse_one(c_path) + rows.append((str(rel), info)) + + # Pick the pytest marker label (passed by the workflow) for the section title. + label = os.environ.get("FOOTPRINT_SUMMARY_LABEL", "training") + + out_lines = [ + f"### Build footprint — `{label}`", + "", + "| Fixture | L1 working | L2 working | L3 storage | Distinct tile shapes |", + "|---|--:|--:|--:|--:|", + ] + for path, info in rows: + a = info["arenas"] + out_lines.append( + f"| `{path}` | {fmt_kb(a['L1'])} | {fmt_kb(a['L2'])} | {fmt_kb(a['L3'])} | {info['tile_shapes']} |") + if not rows: + out_lines.append("| _(no TrainingNetwork.c found)_ | | | | |") + out_lines.append("") + + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if summary_path: + with open(summary_path, "a") as f: + f.write("\n".join(out_lines) + "\n") + print(f"[footprint-summary] wrote {len(rows)} rows to {summary_path}", file=sys.stderr) + else: + # Local invocation: print to stdout for visibility. + print("\n".join(out_lines)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 443d45fe635cb70260e1971e40a24d25e966eefc Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 19:49:56 +0000 Subject: [PATCH 07/18] feat(ci): per-model sim gate + cycle counts in untiled-L3 summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User asked for the "how much slower is untiled vs tiled" data — the existing footprint table doesn't carry it because everything was --skipsim'd to dodge the sim-side OOM seen on ResNet8 / MobileNetV1. Splits the L3-untiled job by model: | Fixture | sim in CI? | reason | |-------------|------------|-----------------------------------------| | CCT | yes | 16 KB working set; gvsoc fits in 16 GB | | CCT_LoRA | yes | same | | ResNet8 | --skipsim | OOM at ~8 min; deferred | | MobileNetV1 | --skipsim | OOM at ~8 min; deferred | Mechanism: per-model `skip_sim_in_ci` flag in L3_UNTILED_TRAINING_MODELS; test_siracusa_tiled_training_l3_untiled forces skipsim only when `CI=true` AND the flag is set. Local runs always do the full pipeline. The global `--skipsim` is dropped from the CI workflow. Cycle extractor (in scripts/ci_footprint_summary.py): parses `DeeployTest/out.txt` for the `BENCH train_cycles=… opt_cycles=…` lines emitted by deeploytraintest.c, correlates each to the preceding `Testing ` banner, and emits a second markdown table to GITHUB_STEP_SUMMARY. Skipped fixtures contribute no cycle row, so the table only carries entries that actually ran. Cycle comparison untiled vs tiled is read by eyeballing the two job summaries side-by-side. A unified cross-job aggregation needs an artifact-passing pass; deferred. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/ci-platform-siracusa-tiled.yml | 11 +-- DeeployTest/test_platforms.py | 10 ++- DeeployTest/test_siracusa_tiled_config.py | 10 +++ scripts/ci_footprint_summary.py | 81 ++++++++++++++++--- 4 files changed, 94 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index d58ea711..e730c583 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -50,12 +50,10 @@ jobs: # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1 # shim to back the oversized L1 working buffer with FC L2). # - # --skipsim is intentional here: previous runs got SIGKILLed (exit 137) on - # ubuntu-latest's 7 GB RAM after ~8 minutes of silent execution, and we need - # to know whether the OOM is in compile (kernel codegen + clang) or in - # gvsoc. --skipsim verifies the codegen + compile path and the fake-L1 - # shim's link integrity; sim verification is deferred until we either move - # to a beefier runner or shrink the per-step memory peak. + # Per-model skip_sim_in_ci gate (in test_siracusa_tiled_config.py) decides + # which fixtures actually run gvsoc on CI: CCT/CCT_LoRA do (16 KB working + # set, gvsoc fits comfortably); ResNet8/MobileNetV1 are --skipsim'd until + # the sim-side OOM at ~8 min is debugged or we move to a bigger runner. siracusa-training-tiled-l3-untiled: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml @@ -63,4 +61,3 @@ jobs: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} pytest-marker: "training and l3 and untiled" - pytest-extra-args: "--skipsim" diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index adc56743..0b4136e7 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import os + import pytest # Import platform-specific test configurations from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS @@ -436,6 +438,11 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha SBTiler picks single-tile-per-tensor schedules. The deeploy_fake_l1 shim (DEEPLOY_L1_AS_L2) redirects pi_cl_l1_malloc into an FC-L2 arena so the oversized "L1" working buffer (>physical 256 KB) actually fits. + + Per-model skip_sim_in_ci gate: large fixtures (ResNet8 / MobileNetV1) + skip the gvsoc sim on CI runners because two prior runs got SIGKILLed + at ~8 min during simulation. Local runs (no `CI` env var) still run + the full pipeline so the user can verify losses manually. """ fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name] overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) @@ -443,6 +450,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha f"-DDEEPLOY_L1_AS_L2=ON", f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}", ] + effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False)) config = create_test_config( test_name = test_name, platform = "Siracusa", @@ -461,7 +469,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha training_num_data_inputs = overrides.get("num_data_inputs"), training_tolerance = overrides.get("tolerance"), ) - run_and_assert_test(test_name, config, skipgen, skipsim) + run_and_assert_test(test_name, config, skipgen, effective_skipsim) @pytest.mark.siracusa_tiled diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index db27f791..d3273119 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -206,21 +206,31 @@ "l1": 64_000, "l2": 2_000_000, "fake_l1_size": 32_768, # peak L1 working = 16388 B + # Sim runs in CI: 16 KB working set is tiny enough that gvsoc + # doesn't OOM ubuntu-latest's 16 GB. + "skip_sim_in_ci": False, }, "Models/Training/CCT_LoRA/cct_lora_train": { "l1": 64_000, "l2": 2_000_000, "fake_l1_size": 32_768, # peak L1 working = 16384 B + "skip_sim_in_ci": False, }, "Models/Training/ResNet8/resnet8_train": { "l1": 800_000, "l2": 2_000_000, "fake_l1_size": 1_048_576, # peak L1 working = 739328 B + # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim. + # Skip until the sim-side memory leak is debugged or we move to a + # bigger runner. --skipsim still verifies codegen + compile + the + # fake-L1 shim's link integrity. + "skip_sim_in_ci": True, }, "Models/Training/MobileNetV1/mobilenetv1_train": { "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, "fake_l1_size": 786_432, # peak L1 working = 542720 B + "skip_sim_in_ci": True, # same OOM concern as ResNet8 }, } diff --git a/scripts/ci_footprint_summary.py b/scripts/ci_footprint_summary.py index 989a92de..b8330082 100644 --- a/scripts/ci_footprint_summary.py +++ b/scripts/ci_footprint_summary.py @@ -2,20 +2,20 @@ # SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna # SPDX-License-Identifier: Apache-2.0 """ -Emit a per-test footprint summary to GITHUB_STEP_SUMMARY. +Emit a per-test footprint + cycle summary to GITHUB_STEP_SUMMARY. -Walks `DeeployTest/TEST_SIRACUSA/` for generated `TrainingNetwork.c` files -and reports, per fixture: MEMORYARENA_L1/L2/L3 sizes (peak working sets + -L3 storage) and the number of distinct numTiles shapes. +Two passes: -The numbers come from grepping the generated C — they're a build-time -proxy for "how much memory pressure does this configuration put on the -target". This is the closest stand-in for the cycle comparison the user -wants until the L3-untiled sim OOM is debugged and we can collect real -gvsoc cycle counts. +1. **Build footprint** — walks `DeeployTest/TEST_SIRACUSA/` for generated + `TrainingNetwork.c` files and reports per fixture: MEMORYARENA_L1/L2/L3 + sizes (peak working sets + L3 storage) and distinct numTiles shapes. +2. **Cycle counts** — parses `DeeployTest/out.txt` (where the test runner + appends every sim's stdout) for `BENCH train_cycles=… opt_cycles=… + weight_sram=…` lines, correlating each line to its preceding `Testing + ` banner. Skipped fixtures contribute no cycle row. Used in the siracusa-tiled CI workflow. Safe to run with no matching -files (just emits an empty summary). +files (just emits an empty section). """ import os @@ -25,6 +25,8 @@ ARENA_RE = re.compile(r"MEMORYARENA_(L1|L2|L3)\s*=.*\*\s*(\d+)") TILES_RE = re.compile(r"numTiles\[\d+\]\s*=\s*\{[^}]+\}") +TESTING_RE = re.compile(r"Testing\s+(\S+)\s+on\s+\S+\s+Platform") +BENCH_RE = re.compile(r"BENCH\s+train_cycles=(\d+)\s+opt_cycles=(\d+)\s+weight_sram=(\d+)") def parse_one(c_path: Path) -> dict: @@ -45,6 +47,42 @@ def fmt_kb(n: int) -> str: return f"{n / 1024:.1f} KB" +def fmt_cycles(n: int) -> str: + if n == 0: + return "—" + if n >= 1_000_000: + return f"{n / 1e6:.2f}M" + if n >= 1_000: + return f"{n / 1e3:.1f}K" + return str(n) + + +def parse_cycles(out_txt: Path) -> dict: + """Returns {test_dir: {train_cycles, opt_cycles, weight_sram}}. + + Each `Testing ` banner in out.txt opens a section; the next + `BENCH …` line in that section is the cycle row for that fixture. + Sections without a BENCH line (skipsim, sim crash) get no entry. + """ + if not out_txt.is_file(): + return {} + out: dict = {} + current = None + for line in out_txt.read_text(errors="replace").splitlines(): + m = TESTING_RE.search(line) + if m: + current = m.group(1) + continue + m = BENCH_RE.search(line) + if m and current is not None: + out[current] = { + "train_cycles": int(m.group(1)), + "opt_cycles": int(m.group(2)), + "weight_sram": int(m.group(3)), + } + return out + + def main() -> int: test_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("DeeployTest/TEST_SIRACUSA") if not test_root.is_dir(): @@ -57,6 +95,8 @@ def main() -> int: info = parse_one(c_path) rows.append((str(rel), info)) + cycles = parse_cycles(test_root.parent / "out.txt") + # Pick the pytest marker label (passed by the workflow) for the section title. label = os.environ.get("FOOTPRINT_SUMMARY_LABEL", "training") @@ -74,6 +114,27 @@ def main() -> int: out_lines.append("| _(no TrainingNetwork.c found)_ | | | | |") out_lines.append("") + # Cycle table — only renders if at least one fixture actually simulated. + cycle_rows = [] + for path, _info in rows: + # The `Testing` banner uses the absolute test_dir path; match by basename. + match_key = next((k for k in cycles if k.endswith(path) or path.endswith(Path(k).name)), None) + if match_key: + cycle_rows.append((path, cycles[match_key])) + out_lines.append(f"### Cycle counts (gvsoc) — `{label}`") + out_lines.append("") + out_lines.append("| Fixture | train_cycles | opt_cycles | weight_sram |") + out_lines.append("|---|--:|--:|--:|") + if cycle_rows: + for path, c in cycle_rows: + out_lines.append( + f"| `{path}` | {fmt_cycles(c['train_cycles'])} | " + f"{fmt_cycles(c['opt_cycles'])} | {fmt_kb(c['weight_sram'])} |") + else: + out_lines.append( + "| _(no BENCH lines in out.txt — sim was --skipsim'd or crashed)_ | | | |") + out_lines.append("") + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") if summary_path: with open(summary_path, "a") as f: From 2df2b0beb4854ca9d0d4243b83ed38a80f4e0d28 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 20:14:04 +0000 Subject: [PATCH 08/18] fix(untiled): drop CCT/CCT_LoRA + per-fixture needs_fake_l1 gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous CI run produced absurd cycle counts for CCT untiled (528 vs 10.27M tiled). Investigation: 1. CCT untiled and CCT tiled-L3 produce **byte-for-byte identical** TrainingNetwork.c (diff is just MiniMalloc statement-ordering noise). Both arena sizes match: L1=16K, L2=16K, L3=294K. 2. CCT's peak L1 working (16 KB) fits trivially in physical Siracusa L1 (256 KB), so the deeploy_fake_l1 wrap is unnecessary. 3. The wrap intercepts every pi_cl_l1_malloc call site, including any SDK-internal one — the 528-cycle anomaly is consistent with the cluster never actually running training kernels because the SDK allocation got served from our small fake arena. Restructure: - Drop CCT and CCT_LoRA from L3_UNTILED_TRAINING_MODELS — they're semantically already covered by the tiled-L3-singlebuffer entry. Keep the comment so future readers know why. - Add per-fixture `needs_fake_l1` flag (defaults False). Test only applies -DDEEPLOY_L1_AS_L2=ON when needs_fake_l1=True. Future fixtures in this dict that don't need the wrap won't get it. - ResNet8 and MobileNetV1 stay (their peak L1 working is 739K / 530K, genuinely > physical L1). Both still skip sim in CI pending OOM debug. Cycle comparison "untiled vs tiled" therefore can't be done in CI right now — the only fixtures where the comparison is meaningful (ResNet8 / MobileNetV1) are skipsim'd. Documented as a known follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/_runner-siracusa-tiled.yml | 8 +++ DeeployTest/test_platforms.py | 13 +++-- DeeployTest/test_siracusa_tiled_config.py | 53 +++++++++----------- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index 15fd8041..16047857 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -53,3 +53,11 @@ jobs: cd DeeployTest python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true shell: bash + - name: Upload sim out.txt + if: always() + uses: actions/upload-artifact@v4 + with: + name: sim-out-${{ inputs.pytest-marker }} + path: DeeployTest/out.txt + if-no-files-found: ignore + retention-days: 7 diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 0b4136e7..4dd3fb4a 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -446,10 +446,15 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha """ fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name] overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) - extra_cmake = list(cmake_args) + [ - f"-DDEEPLOY_L1_AS_L2=ON", - f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}", - ] + extra_cmake = list(cmake_args) + if fixture.get("needs_fake_l1", False): + # Only opt in when peak L1 working > physical L1 — the wrap also + # intercepts SDK-internal pi_cl_l1_malloc calls and starves the + # cluster on small models that don't need it. + extra_cmake += [ + f"-DDEEPLOY_L1_AS_L2=ON", + f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}", + ] effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False)) config = create_test_config( test_name = test_name, diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index d3273119..6811a2c0 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -188,41 +188,33 @@ # # fake_l1_size baselining method: spike with --l1=4_000_000 → read off # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up. +# Untiled-L3 baseline. ONLY contains models whose peak L1 working set +# exceeds physical Siracusa L1 (~256 KB) — those are the only ones where +# "untiled-L3" produces a different schedule than the existing tiled L3 +# singlebuffer test. Smaller models (CCT, CCT_LoRA, ~16 KB working) get +# the same byte-for-byte TrainingNetwork.c whether you pick tiled L3 or +# untiled L3, so adding them here would be redundant — their tiled L3 +# entry IS their untiled-L3 baseline. +# +# The deeploy_fake_l1 shim (DEEPLOY_L1_AS_L2) intercepts pi_cl_l1_malloc +# and serves it from a static FC-L2 arena, which is the only way the +# 700-KB-class L1 working buffer can fit at runtime. The shim has the +# side effect of redirecting *every* pi_cl_l1_malloc call (including any +# SDK-internal one), which breaks small models that don't actually need +# it — hence the per-fixture needs_fake_l1 gate. L3_UNTILED_TRAINING_MODELS = { - # Per-model l1 / l2 / fake_l1_size were established by spiking - # testMVPTraining.py with --defaultMemLevel=L3 and reading - # MEMORYARENA_L1 from the generated TrainingNetwork.c. - # - # - l1: planner-side budget passed to SBTiler (forces single-tile - # schedules when generous enough). Use the smallest value that - # still compiles and yields the minimal-tile shape — larger values - # blow MiniMalloc's RAM appetite past CI's 16 GB ceiling. - # - l2: planner-side L2 budget; 2 MB matches the existing tiled L3 - # baseline. - # - fake_l1_size: physical bytes for the FC-L2-backed pi_cl_l1_malloc - # arena (deeploy_fake_l1.c). Must be ≥ MEMORYARENA_L1, with a - # small headroom for alignment. - "Models/Training/CCT/cct_train": { - "l1": 64_000, - "l2": 2_000_000, - "fake_l1_size": 32_768, # peak L1 working = 16388 B - # Sim runs in CI: 16 KB working set is tiny enough that gvsoc - # doesn't OOM ubuntu-latest's 16 GB. - "skip_sim_in_ci": False, - }, - "Models/Training/CCT_LoRA/cct_lora_train": { - "l1": 64_000, - "l2": 2_000_000, - "fake_l1_size": 32_768, # peak L1 working = 16384 B - "skip_sim_in_ci": False, - }, "Models/Training/ResNet8/resnet8_train": { + # 800 KB is the smallest --l1 that yields the minimal-tile shape + # (peak L1 working = 739 KB). Larger values inflate MiniMalloc's + # RAM appetite past CI's ceiling. "l1": 800_000, "l2": 2_000_000, "fake_l1_size": 1_048_576, # peak L1 working = 739328 B - # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim. - # Skip until the sim-side memory leak is debugged or we move to a - # bigger runner. --skipsim still verifies codegen + compile + the + "needs_fake_l1": True, + # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim + # — gvsoc memory grows unbounded for the long single-tile training + # loop. --skipsim until that's debugged or we move to a bigger + # runner. --skipsim still verifies codegen + compile + the # fake-L1 shim's link integrity. "skip_sim_in_ci": True, }, @@ -230,6 +222,7 @@ "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, "fake_l1_size": 786_432, # peak L1 working = 542720 B + "needs_fake_l1": True, "skip_sim_in_ci": True, # same OOM concern as ResNet8 }, } From 5580739a41278868bbdbd8a4564493751574c110 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 20:34:08 +0000 Subject: [PATCH 09/18] fix(fake-l1): try real L1 first; restore CCT/CCT_LoRA + sim for all 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes so the L3-untiled CI table actually carries cycles for every L3 model. # Shim no longer pollutes SDK Old shim served EVERY pi_cl_l1_malloc from the static FC-L2 arena — SDK-internal calls included. CCT untiled then reported 528 cycles (cluster never ran the kernels because some SDK invariant broke). New shim tries __real_pi_cl_l1_malloc first. Only requests that real L1 cannot satisfy fall through to the FC-L2 arena. __wrap_pi_cl_l1_free mirrors by routing arena-range pointers to the bump rewind and everything else to __real_pi_cl_l1_free. SDK gets real L1 transparently; only Deeploy's oversized MEMORYARENA_L1 sees the fake arena. # Test fixture restored CCT and CCT_LoRA back in L3_UNTILED_TRAINING_MODELS with needs_fake_l1=False — they fit physical L1, codegen is byte-identical to the tiled-L3 entry, and now sim runs cleanly because the shim is no longer destructive. Their cycles therefore == tiled-L3 cycles by construction (a useful sanity row in the summary). ResNet8 / MobileNetV1: skip_sim_in_ci=False — re-enable sim with the fixed shim. The earlier ~8-min SIGKILLs were almost certainly the shim looping cluster init, not a genuine gvsoc memory leak. If sim still OOMs on ubuntu-latest after this fix, fall back to skipsim. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_siracusa_tiled_config.py | 59 ++++++++----- .../PULPOpen/src/deeploy_fake_l1.c | 83 ++++++++++++------- 2 files changed, 92 insertions(+), 50 deletions(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 6811a2c0..88117511 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -188,21 +188,41 @@ # # fake_l1_size baselining method: spike with --l1=4_000_000 → read off # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up. -# Untiled-L3 baseline. ONLY contains models whose peak L1 working set -# exceeds physical Siracusa L1 (~256 KB) — those are the only ones where -# "untiled-L3" produces a different schedule than the existing tiled L3 -# singlebuffer test. Smaller models (CCT, CCT_LoRA, ~16 KB working) get -# the same byte-for-byte TrainingNetwork.c whether you pick tiled L3 or -# untiled L3, so adding them here would be redundant — their tiled L3 -# entry IS their untiled-L3 baseline. +# Untiled-L3 baseline — single-tile-per-tensor schedules for every L3 +# training model so the user can read off "untiled L3 latency" alongside +# the existing tiled-L3 cycles. Each fixture goes through the same +# SBTiler infrastructure as the L3 singlebuffer tests, but with --l1 +# inflated to the smallest value that still yields the minimal-tile +# shape — at that point the generated C is one kernel call per op with +# integral L3↔L2 DMA wrappers, no spatial split. # -# The deeploy_fake_l1 shim (DEEPLOY_L1_AS_L2) intercepts pi_cl_l1_malloc -# and serves it from a static FC-L2 arena, which is the only way the -# 700-KB-class L1 working buffer can fit at runtime. The shim has the -# side effect of redirecting *every* pi_cl_l1_malloc call (including any -# SDK-internal one), which breaks small models that don't actually need -# it — hence the per-fixture needs_fake_l1 gate. +# Two per-fixture knobs: +# +# - needs_fake_l1: True when peak L1 working > physical Siracusa L1 +# (256 KB). When True, build adds -DDEEPLOY_L1_AS_L2 + the linker +# wrap that serves oversized pi_cl_l1_malloc calls from a static +# FC-L2 arena (deeploy_fake_l1.c). Small-working-set models (CCT / +# CCT_LoRA, ~16 KB peak) leave it False — they fit real L1 and +# produce byte-identical codegen to the tiled L3 entry, so their +# untiled cycles == tiled cycles by construction. +# +# - skip_sim_in_ci: True for models where gvsoc has historically +# OOMed the runner during the long single-tile loop. CI still +# verifies codegen + compile + link in that case; sim is a manual +# local exercise. L3_UNTILED_TRAINING_MODELS = { + "Models/Training/CCT/cct_train": { + "l1": 64_000, + "l2": 2_000_000, + "needs_fake_l1": False, # peak L1 working = 16388 B fits real L1 + "skip_sim_in_ci": False, + }, + "Models/Training/CCT_LoRA/cct_lora_train": { + "l1": 64_000, + "l2": 2_000_000, + "needs_fake_l1": False, # peak L1 working = 16384 B + "skip_sim_in_ci": False, + }, "Models/Training/ResNet8/resnet8_train": { # 800 KB is the smallest --l1 that yields the minimal-tile shape # (peak L1 working = 739 KB). Larger values inflate MiniMalloc's @@ -211,19 +231,18 @@ "l2": 2_000_000, "fake_l1_size": 1_048_576, # peak L1 working = 739328 B "needs_fake_l1": True, - # Two prior CI runs got SIGKILLed (exit 137) at ~8 min during sim - # — gvsoc memory grows unbounded for the long single-tile training - # loop. --skipsim until that's debugged or we move to a bigger - # runner. --skipsim still verifies codegen + compile + the - # fake-L1 shim's link integrity. - "skip_sim_in_ci": True, + # Try sim again with the fixed shim (real-L1-first, fall back to + # FC-L2 arena only when L1 is exhausted). Earlier runs OOMed at + # ~8 min — believed to be the broken shim looping cluster init, + # not a real gvsoc memory leak. + "skip_sim_in_ci": False, }, "Models/Training/MobileNetV1/mobilenetv1_train": { "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, "fake_l1_size": 786_432, # peak L1 working = 542720 B "needs_fake_l1": True, - "skip_sim_in_ci": True, # same OOM concern as ResNet8 + "skip_sim_in_ci": False, }, } diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c index db1c8b67..e703a413 100644 --- a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c +++ b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c @@ -3,20 +3,30 @@ * * SPDX-License-Identifier: Apache-2.0 * - * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena. + * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena + * — but ONLY for allocations that don't fit in the real L1 heap. * * Activated by `-DDEEPLOY_L1_AS_L2 -DDEEPLOY_FAKE_L1_SIZE=N` plus the linker - * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`. The wrap pair - * intercepts the SDK's strong symbols so the deeploy-generated code keeps - * calling pmsis_l1_malloc / pi_cl_l1_malloc as if it were targeting cluster - * L1 — physically the bytes live in the FC L2 region instead, which on - * Siracusa has ~1.94 MB of headroom (vs. ~256 KB for real cluster L1). + * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`. + * + * Coexistence with the SDK + * ------------------------ + * The wrap intercepts EVERY pi_cl_l1_malloc call site, including SDK-internal + * ones (cluster init, driver scratch, etc.). A first version that always + * served from the static arena broke the cluster on small models — likely + * because SDK code received a pointer outside the real L1 region and either + * its own bookkeeping went wrong or a downstream API rejected it. + * + * Mitigation: try the SDK's real L1 allocator first via the linker's + * `__real_*` symbols. If that succeeds, hand the SDK pointer back. Only when + * the request is too big for real L1 (the case we're here for: an oversized + * MEMORYARENA_L1) fall through to the static FC-L2 arena. Free mirrors the + * decision by checking whether the pointer falls inside our arena. * * Trade-off: kernels that were tuned for L1's 1-cycle access see L2 latency - * (~7 cycles), so cycle counts under this mode are NOT representative of - * silicon. The mode exists to provide a single-tile-per-tensor untiled-L3 - * baseline for *correctness* comparison against the tiled L3 path; cycle - * realism for the same workload still requires the tiled run. + * (~7 cycles) for the buffers served from the fake arena. Cycles under this + * mode are NOT silicon-representative — the mode exists to give a per-op + * single-tile latency baseline against the existing tiled-L3 path. */ #include @@ -29,42 +39,55 @@ #include "pmsis.h" +/* Linker-provided originals (--wrap=foo exposes __real_foo). */ +extern void *__real_pi_cl_l1_malloc(struct pi_device *device, uint32_t size); +extern void __real_pi_cl_l1_free(struct pi_device *device, void *chunk, int size); + /* Static arena in FC L2 — sized at compile time. PI_L2 maps to the SDK's L2 * shared region (0x1C010000-0x1C200000 on Siracusa, ~1.94 MB). Aligned to 8 * to satisfy any kernel that reinterprets float32_t / pulpv2 v2f16 buffers. */ __attribute__((aligned(8))) PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE]; - -/* Bump pointer + free-list. The deeploy-generated harness allocates the L1 - * arena once at InitTrainingNetwork() time and frees it once at teardown, - * so we don't need a real heap — a bump allocator with a single rewind on - * full-arena free is sufficient and cheap. - * - * If the harness pattern ever changes (e.g. fine-grained per-op alloc/free), - * swap this for an extern_alloc_t pool the way dory_mem.c does for L3. */ static uint32_t deeploy_fake_l1_offset = 0; +static inline int in_fake_arena(const void *p) { + return (const uint8_t *)p >= deeploy_fake_l1_arena + && (const uint8_t *)p < deeploy_fake_l1_arena + DEEPLOY_FAKE_L1_SIZE; +} + void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) { - (void)device; - /* 8-byte alignment for every allocation so consecutive callers stay - * aligned even when `size` is not a multiple of 8. */ + /* Try real L1 first — any small SDK / Deeploy alloc that fits stays in + * real L1, so SDK bookkeeping and L1-tuned kernels are unaffected. */ + void *p = __real_pi_cl_l1_malloc(device, size); + if (p != (void *)0) { + return p; + } + /* Real L1 exhausted (or request bigger than L1 heap). Serve from FC-L2 + * arena: the only legitimate caller here is Deeploy's MEMORYARENA_L1 + * for a model whose peak L1 working set exceeds 256 KB. */ uint32_t aligned = (size + 7u) & ~7u; if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) { return (void *)0; } - void *p = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset]; + void *q = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset]; deeploy_fake_l1_offset += aligned; - return p; + return q; } void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) { - (void)device; - (void)chunk; - (void)size; - /* Bump-allocator semantics: per-block free is a no-op. The harness - * frees the whole arena at teardown; we rewind there. */ - if (deeploy_fake_l1_offset >= (uint32_t)size) { - deeploy_fake_l1_offset -= ((uint32_t)size + 7u) & ~7u; + if (!in_fake_arena(chunk)) { + __real_pi_cl_l1_free(device, chunk, size); + return; + } + /* Bump-allocator rewind for arena pointers. Safe under LIFO free order + * (the only pattern Deeploy's harness uses); a non-LIFO free is a no-op + * and leaks until the next full reset. */ + uint32_t aligned = ((uint32_t)size + 7u) & ~7u; + if (deeploy_fake_l1_offset >= aligned) { + uint8_t *expected = deeploy_fake_l1_arena + deeploy_fake_l1_offset - aligned; + if ((uint8_t *)chunk == expected) { + deeploy_fake_l1_offset -= aligned; + } } } From ef41f3a754e1affa09a38ee43b360cb7fec1de6f Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 21:13:24 +0000 Subject: [PATCH 10/18] =?UTF-8?q?feat(untiled):=20sed-based=20L1=E2=86=92L?= =?UTF-8?q?2=20codegen=20rewrite=20+=20isolate=20to=20one=20CI=20job?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the fake-L1 shim approach with a direct codegen post-process so all 4 L3 untiled fixtures end up with kernels physically reading FC L2 (no SDK pollution, no wrap, no shim). # Codegen post-process (test_siracusa_tiled_training_l3_untiled) After generate_network() and before configure_cmake(), the test rewrites the generated TrainingNetwork.c / OptimizerNetwork.c: pmsis_l1_malloc -> pi_l2_malloc PI_L1 -> PI_L2 Every L1-annotated buffer (including MEMORYARENA_L1) now lives in FC L2. Cluster cores access kernel buffers via the fabric (~7x slower than real L1) — this is the deliberate "untiled, L2-resident working set" cycle semantic the user asked for. All 4 L3 models give comparable cycle counts under the same resource model. # Removals - deeploy_fake_l1.c (gone) - DEEPLOY_L1_AS_L2 / DEEPLOY_FAKE_L1_SIZE / linker --wrap flags (gone) - needs_fake_l1 / fake_l1_size fixture fields (gone) # CI temporarily isolated Goal of this branch is to collect untiled cycle data, so: - ci-platform-siracusa.yml: push/pull_request triggers disabled (workflow_dispatch only) - ci-platform-siracusa-tiled.yml: L2/L3-singlebuffer jobs commented out; only siracusa-training-tiled-l3-untiled runs Both flagged with "restore before merging" comments. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/ci-platform-siracusa-tiled.yml | 50 +++++----- .github/workflows/ci-platform-siracusa.yml | 9 +- DeeployTest/test_platforms.py | 59 ++++++++---- DeeployTest/test_siracusa_tiled_config.py | 41 +++----- TargetLibraries/PULPOpen/CMakeLists.txt | 17 ---- .../PULPOpen/src/deeploy_fake_l1.c | 94 ------------------- 6 files changed, 82 insertions(+), 188 deletions(-) delete mode 100644 TargetLibraries/PULPOpen/src/deeploy_fake_l1.c diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index e730c583..5536fda5 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -29,31 +29,33 @@ jobs: with: docker_image_deeploy: ${{ inputs.docker_image_deeploy }} - # Training tests - L2 singlebuffer - siracusa-training-tiled-l2-singlebuffer: - needs: select-env - uses: ./.github/workflows/_runner-siracusa-tiled.yml - with: - runner: ${{ needs.select-env.outputs.runner }} - docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l2 and singlebuffer" - - # Training tests - L3 singlebuffer (models that spill weights/activations to L3) - siracusa-training-tiled-l3-singlebuffer: - needs: select-env - uses: ./.github/workflows/_runner-siracusa-tiled.yml - with: - runner: ${{ needs.select-env.outputs.runner }} - docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l3 and singlebuffer" - - # Training tests - L3 untiled baseline (single-tile-per-tensor; uses fake-L1 - # shim to back the oversized L1 working buffer with FC L2). + # NOTE: All other Siracusa-tiled jobs are temporarily disabled on this + # branch so the L3-untiled job runs in isolation while we collect + # cycle-count data. Restore the L2 / L3 singlebuffer entries below + # before merging. + # + # # Training tests - L2 singlebuffer + # siracusa-training-tiled-l2-singlebuffer: + # needs: select-env + # uses: ./.github/workflows/_runner-siracusa-tiled.yml + # with: + # runner: ${{ needs.select-env.outputs.runner }} + # docker-image: ${{ needs.select-env.outputs.image }} + # pytest-marker: "training and l2 and singlebuffer" # - # Per-model skip_sim_in_ci gate (in test_siracusa_tiled_config.py) decides - # which fixtures actually run gvsoc on CI: CCT/CCT_LoRA do (16 KB working - # set, gvsoc fits comfortably); ResNet8/MobileNetV1 are --skipsim'd until - # the sim-side OOM at ~8 min is debugged or we move to a bigger runner. + # # Training tests - L3 singlebuffer + # siracusa-training-tiled-l3-singlebuffer: + # needs: select-env + # uses: ./.github/workflows/_runner-siracusa-tiled.yml + # with: + # runner: ${{ needs.select-env.outputs.runner }} + # docker-image: ${{ needs.select-env.outputs.image }} + # pytest-marker: "training and l3 and singlebuffer" + + # Training tests - L3 untiled baseline. Codegen post-process rewrites + # every L1-annotated buffer to FC L2 so cluster cores access kernel + # buffers via the fabric — "untiled, L2-resident working set" cycle + # semantics for all 4 L3 models. siracusa-training-tiled-l3-untiled: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7a4f415e..c1e7db5d 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -5,13 +5,10 @@ --- name: CI • Siracusa +# NOTE: Push / pull_request triggers temporarily disabled on this branch +# so only the L3-untiled job runs while we collect cycle-count data. +# Restore the push: / pull_request: blocks before merging. "on": - push: - branches: - - "**" - tags: - - "v*.*.*" - pull_request: workflow_dispatch: inputs: docker_image_deeploy: diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 4dd3fb4a..df2653a7 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -434,27 +434,29 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha skipsim) -> None: """Untiled-L3 baseline. - Reuses the tiled codegen pipeline but inflates --l1 large enough that the - SBTiler picks single-tile-per-tensor schedules. The deeploy_fake_l1 shim - (DEEPLOY_L1_AS_L2) redirects pi_cl_l1_malloc into an FC-L2 arena so the - oversized "L1" working buffer (>physical 256 KB) actually fits. - - Per-model skip_sim_in_ci gate: large fixtures (ResNet8 / MobileNetV1) - skip the gvsoc sim on CI runners because two prior runs got SIGKILLed - at ~8 min during simulation. Local runs (no `CI` env var) still run - the full pipeline so the user can verify losses manually. + SBTiler picks single-tile-per-tensor schedules (--l1 inflated above the + op working set so no spatial split happens). The generated C is one + kernel call per op with integral L3↔L2 DMA wrappers. + + To make the L1 staging buffer physically live in FC L2 (so cycles + represent "kernel actually accessing L2"), we post-process the + generated TrainingNetwork.c / OptimizerNetwork.c after codegen but + before cmake build: + + pmsis_l1_malloc -> pi_l2_malloc + PI_L1 -> PI_L2 + + Every L1-annotated buffer ends up in FC L2. Cluster cores access L2 + via the fabric (~7x slower than real L1) — that's the deliberate + semantics of "untiled L2-resident". No fake-L1 shim, no linker wrap, + no SDK pollution. """ + from pathlib import Path + + from testUtils.core.execution import build_binary, configure_cmake, generate_network, run_simulation + fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name] overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) - extra_cmake = list(cmake_args) - if fixture.get("needs_fake_l1", False): - # Only opt in when peak L1 working > physical L1 — the wrap also - # intercepts SDK-internal pi_cl_l1_malloc calls and starves the - # cluster on small models that don't need it. - extra_cmake += [ - f"-DDEEPLOY_L1_AS_L2=ON", - f"-DDEEPLOY_FAKE_L1_SIZE={fixture['fake_l1_size']}", - ] effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False)) config = create_test_config( test_name = test_name, @@ -463,7 +465,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha deeploy_test_dir = deeploy_test_dir, toolchain = toolchain, toolchain_dir = toolchain_dir, - cmake_args = extra_cmake, + cmake_args = cmake_args, tiling = True, cores = SIRACUSA_DEFAULT_CORES, l1 = fixture["l1"], @@ -474,7 +476,24 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha training_num_data_inputs = overrides.get("num_data_inputs"), training_tolerance = overrides.get("tolerance"), ) - run_and_assert_test(test_name, config, skipgen, effective_skipsim) + + # Inline the test runner stages so we can sed between codegen and build. + generate_network(config, skip = skipgen) + for c_name in ("TrainingNetwork.c", "OptimizerNetwork.c"): + c_path = Path(config.gen_dir) / c_name + if not c_path.exists(): + continue + text = c_path.read_text() + text = text.replace("pmsis_l1_malloc", "pi_l2_malloc") + text = text.replace("PI_L1 ", "PI_L2 ") + c_path.write_text(text) + configure_cmake(config) + build_binary(config) + result = run_simulation(config, skip = effective_skipsim) + assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of " + f"{result.total_count}\nOutput:\n{result.stdout}") + if result.error_count >= 0: + assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests") @pytest.mark.siracusa_tiled diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 88117511..2dd8f2ba 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -190,37 +190,32 @@ # MEMORYARENA_L1 size from generated TrainingNetwork.c → round up. # Untiled-L3 baseline — single-tile-per-tensor schedules for every L3 # training model so the user can read off "untiled L3 latency" alongside -# the existing tiled-L3 cycles. Each fixture goes through the same -# SBTiler infrastructure as the L3 singlebuffer tests, but with --l1 -# inflated to the smallest value that still yields the minimal-tile -# shape — at that point the generated C is one kernel call per op with -# integral L3↔L2 DMA wrappers, no spatial split. +# the existing tiled-L3 cycles. # -# Two per-fixture knobs: +# Each fixture goes through the same SBTiler infrastructure as the L3 +# singlebuffer tests, with --l1 inflated to the smallest value that +# yields the minimal-tile shape (one kernel call per op + integral +# L3↔L2 DMA, no spatial split). # -# - needs_fake_l1: True when peak L1 working > physical Siracusa L1 -# (256 KB). When True, build adds -DDEEPLOY_L1_AS_L2 + the linker -# wrap that serves oversized pi_cl_l1_malloc calls from a static -# FC-L2 arena (deeploy_fake_l1.c). Small-working-set models (CCT / -# CCT_LoRA, ~16 KB peak) leave it False — they fit real L1 and -# produce byte-identical codegen to the tiled L3 entry, so their -# untiled cycles == tiled cycles by construction. +# After codegen, the test post-processes TrainingNetwork.c / +# OptimizerNetwork.c to swap pmsis_l1_malloc → pi_l2_malloc and +# PI_L1 → PI_L2, so every L1-annotated buffer physically lives in +# FC L2. Cluster cores access these via the fabric (~7x slower than +# real L1) — that's the deliberate semantics of "untiled L2-resident". # -# - skip_sim_in_ci: True for models where gvsoc has historically -# OOMed the runner during the long single-tile loop. CI still -# verifies codegen + compile + link in that case; sim is a manual -# local exercise. +# skip_sim_in_ci: True for fixtures where gvsoc has historically OOMed +# during the long single-tile loop. CI still verifies codegen + +# compile + link in that case; sim is deferred to a manual local run +# or a beefier runner. L3_UNTILED_TRAINING_MODELS = { "Models/Training/CCT/cct_train": { "l1": 64_000, "l2": 2_000_000, - "needs_fake_l1": False, # peak L1 working = 16388 B fits real L1 "skip_sim_in_ci": False, }, "Models/Training/CCT_LoRA/cct_lora_train": { "l1": 64_000, "l2": 2_000_000, - "needs_fake_l1": False, # peak L1 working = 16384 B "skip_sim_in_ci": False, }, "Models/Training/ResNet8/resnet8_train": { @@ -229,19 +224,11 @@ # RAM appetite past CI's ceiling. "l1": 800_000, "l2": 2_000_000, - "fake_l1_size": 1_048_576, # peak L1 working = 739328 B - "needs_fake_l1": True, - # Try sim again with the fixed shim (real-L1-first, fall back to - # FC-L2 arena only when L1 is exhausted). Earlier runs OOMed at - # ~8 min — believed to be the broken shim looping cluster init, - # not a real gvsoc memory leak. "skip_sim_in_ci": False, }, "Models/Training/MobileNetV1/mobilenetv1_train": { "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, - "fake_l1_size": 786_432, # peak L1 working = 542720 B - "needs_fake_l1": True, "skip_sim_in_ci": False, }, } diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 3ae97d91..ce39fea7 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -71,23 +71,6 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed) target_link_libraries(deeploypulp INTERFACE pulp-sdk) target_sources(deeploypulp INTERFACE $) -# Untiled-L3 baseline: redirect pi_cl_l1_malloc/free to a static L2 arena via -# linker --wrap so cluster-L1 alloc requests up to DEEPLOY_FAKE_L1_SIZE bytes -# (must be ≤ remaining FC L2 ~1.94 MB) succeed even though physical L1 is only -# 256 KB. Source file deeploy_fake_l1.c is no-op when DEEPLOY_L1_AS_L2 is OFF. -option(DEEPLOY_L1_AS_L2 "Redirect pi_cl_l1_malloc to a static FC-L2 arena (untiled-L3 baseline)" OFF) -set(DEEPLOY_FAKE_L1_SIZE "1048576" CACHE STRING "Size in bytes of the fake-L1 arena placed in FC L2") -if(DEEPLOY_L1_AS_L2) - target_compile_definitions(deeploypulp PRIVATE - DEEPLOY_L1_AS_L2 - DEEPLOY_FAKE_L1_SIZE=${DEEPLOY_FAKE_L1_SIZE} - ) - target_link_options(deeploypulp INTERFACE - "-Wl,--wrap=pi_cl_l1_malloc" - "-Wl,--wrap=pi_cl_l1_free" - ) -endif() - set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka") if (platform IN_LIST PULP_NNX_PLATFORMS) if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") diff --git a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c b/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c deleted file mode 100644 index e703a413..00000000 --- a/TargetLibraries/PULPOpen/src/deeploy_fake_l1.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna - * - * SPDX-License-Identifier: Apache-2.0 - * - * Untiled-L3 baseline: redirect cluster-L1 allocations to a static L2 arena - * — but ONLY for allocations that don't fit in the real L1 heap. - * - * Activated by `-DDEEPLOY_L1_AS_L2 -DDEEPLOY_FAKE_L1_SIZE=N` plus the linker - * flags `-Wl,--wrap=pi_cl_l1_malloc -Wl,--wrap=pi_cl_l1_free`. - * - * Coexistence with the SDK - * ------------------------ - * The wrap intercepts EVERY pi_cl_l1_malloc call site, including SDK-internal - * ones (cluster init, driver scratch, etc.). A first version that always - * served from the static arena broke the cluster on small models — likely - * because SDK code received a pointer outside the real L1 region and either - * its own bookkeeping went wrong or a downstream API rejected it. - * - * Mitigation: try the SDK's real L1 allocator first via the linker's - * `__real_*` symbols. If that succeeds, hand the SDK pointer back. Only when - * the request is too big for real L1 (the case we're here for: an oversized - * MEMORYARENA_L1) fall through to the static FC-L2 arena. Free mirrors the - * decision by checking whether the pointer falls inside our arena. - * - * Trade-off: kernels that were tuned for L1's 1-cycle access see L2 latency - * (~7 cycles) for the buffers served from the fake arena. Cycles under this - * mode are NOT silicon-representative — the mode exists to give a per-op - * single-tile latency baseline against the existing tiled-L3 path. - */ - -#include - -#ifdef DEEPLOY_L1_AS_L2 - -#ifndef DEEPLOY_FAKE_L1_SIZE -#error "DEEPLOY_L1_AS_L2 requires -DDEEPLOY_FAKE_L1_SIZE=" -#endif - -#include "pmsis.h" - -/* Linker-provided originals (--wrap=foo exposes __real_foo). */ -extern void *__real_pi_cl_l1_malloc(struct pi_device *device, uint32_t size); -extern void __real_pi_cl_l1_free(struct pi_device *device, void *chunk, int size); - -/* Static arena in FC L2 — sized at compile time. PI_L2 maps to the SDK's L2 - * shared region (0x1C010000-0x1C200000 on Siracusa, ~1.94 MB). Aligned to 8 - * to satisfy any kernel that reinterprets float32_t / pulpv2 v2f16 buffers. */ -__attribute__((aligned(8))) -PI_L2 static uint8_t deeploy_fake_l1_arena[DEEPLOY_FAKE_L1_SIZE]; -static uint32_t deeploy_fake_l1_offset = 0; - -static inline int in_fake_arena(const void *p) { - return (const uint8_t *)p >= deeploy_fake_l1_arena - && (const uint8_t *)p < deeploy_fake_l1_arena + DEEPLOY_FAKE_L1_SIZE; -} - -void *__wrap_pi_cl_l1_malloc(struct pi_device *device, uint32_t size) { - /* Try real L1 first — any small SDK / Deeploy alloc that fits stays in - * real L1, so SDK bookkeeping and L1-tuned kernels are unaffected. */ - void *p = __real_pi_cl_l1_malloc(device, size); - if (p != (void *)0) { - return p; - } - /* Real L1 exhausted (or request bigger than L1 heap). Serve from FC-L2 - * arena: the only legitimate caller here is Deeploy's MEMORYARENA_L1 - * for a model whose peak L1 working set exceeds 256 KB. */ - uint32_t aligned = (size + 7u) & ~7u; - if (deeploy_fake_l1_offset + aligned > DEEPLOY_FAKE_L1_SIZE) { - return (void *)0; - } - void *q = (void *)&deeploy_fake_l1_arena[deeploy_fake_l1_offset]; - deeploy_fake_l1_offset += aligned; - return q; -} - -void __wrap_pi_cl_l1_free(struct pi_device *device, void *chunk, int size) { - if (!in_fake_arena(chunk)) { - __real_pi_cl_l1_free(device, chunk, size); - return; - } - /* Bump-allocator rewind for arena pointers. Safe under LIFO free order - * (the only pattern Deeploy's harness uses); a non-LIFO free is a no-op - * and leaks until the next full reset. */ - uint32_t aligned = ((uint32_t)size + 7u) & ~7u; - if (deeploy_fake_l1_offset >= aligned) { - uint8_t *expected = deeploy_fake_l1_arena + deeploy_fake_l1_offset - aligned; - if ((uint8_t *)chunk == expected) { - deeploy_fake_l1_offset -= aligned; - } - } -} - -#endif /* DEEPLOY_L1_AS_L2 */ From 13741bf6d8ad123bb4860a6274401687f2e99247 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 21:41:06 +0000 Subject: [PATCH 11/18] fix(untiled): mchan_transfer_1d -> memcpy under DEEPLOY_L1_AS_L2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI run 25639948195 had all 4 L3-untiled fixtures FAILED with "computed=0.0 ref=N.NN" + cluster L1 bank "out-of-bound request" warnings. Cause: PULP mchan DMA hardware ignores destination pointer addresses and unconditionally routes the `loc` parameter into cluster L1 banks. Sed-rewriting buffers to FC L2 left the DMA calls intact, so DMA wrote into L1 (out of bounds) while kernels read from L2 (empty). Fix in TargetLibraries/PULPOpen/inc/mchan_v7.h: under DEEPLOY_L1_AS_L2, mchan_transfer_1d is replaced with memcpy that respects the EXT2LOC / LOC2EXT direction flag, and the channel API (alloc/wait/free/is_busy) becomes a no-op. Combined with the existing test-side sed, every buffer + every staging copy now lives in / goes through FC L2 — the "untiled L2-resident" semantic the user actually wanted. CMake exposes the option; the L3-untiled pytest fixture passes -DDEEPLOY_L1_AS_L2=ON automatically. Only mchan_transfer_1d gets the memcpy fallback because that's the only variant the 4 L3 training fixtures emit; the 2D variants stay on the real DMA path. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_platforms.py | 5 +++- TargetLibraries/PULPOpen/CMakeLists.txt | 11 +++++++ TargetLibraries/PULPOpen/inc/mchan_v7.h | 38 +++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index df2653a7..ed489569 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -458,6 +458,9 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name] overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) effective_skipsim = skipsim or (os.environ.get("CI") == "true" and fixture.get("skip_sim_in_ci", False)) + # DEEPLOY_L1_AS_L2 is what flips mchan_transfer_1d to memcpy in mchan_v7.h — + # mandatory partner of the codegen sed below. + extra_cmake = list(cmake_args) + ["-DDEEPLOY_L1_AS_L2=ON"] config = create_test_config( test_name = test_name, platform = "Siracusa", @@ -465,7 +468,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha deeploy_test_dir = deeploy_test_dir, toolchain = toolchain, toolchain_dir = toolchain_dir, - cmake_args = cmake_args, + cmake_args = extra_cmake, tiling = True, cores = SIRACUSA_DEFAULT_CORES, l1 = fixture["l1"], diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index ce39fea7..0935b925 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -71,6 +71,17 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed) target_link_libraries(deeploypulp INTERFACE pulp-sdk) target_sources(deeploypulp INTERFACE $) +# Untiled-L3 baseline: when ON, mchan_transfer_1d() in mchan_v7.h is replaced +# with a memcpy implementation so the deeploy-generated DMA calls become +# regular memory copies between L2 buffers. Used together with the test-side +# sed that rewrites pmsis_l1_malloc -> pi_l2_malloc and PI_L1 -> PI_L2 in the +# generated TrainingNetwork.c so every L1-annotated buffer physically lives +# in FC L2. +option(DEEPLOY_L1_AS_L2 "Replace mchan_transfer_1d with memcpy (untiled-L3 baseline)" OFF) +if(DEEPLOY_L1_AS_L2) + target_compile_definitions(deeploypulp PUBLIC DEEPLOY_L1_AS_L2) +endif() + set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka") if (platform IN_LIST PULP_NNX_PLATFORMS) if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h index 32ef836f..e8a0ea1f 100644 --- a/TargetLibraries/PULPOpen/inc/mchan_v7.h +++ b/TargetLibraries/PULPOpen/inc/mchan_v7.h @@ -47,6 +47,42 @@ #define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) #define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 6)) +/* Untiled-L3 baseline override: when DEEPLOY_L1_AS_L2 is defined the + * deeploy-generated code has been sed-rewritten so its "L1" pointers + * actually live in FC L2. The mchan DMA hardware ignores destination + * pointer addresses and unconditionally routes the `loc` parameter into + * cluster L1 banks via the lower bits — so a real DMA call would write + * garbage to L1 and leave the L2 destination empty (which is exactly + * the bug we observed: out-of-bound L1-bank requests + computed=0.0). + * + * Replace mchan transfers with plain memcpy. The channel API becomes a + * no-op: alloc returns 0, wait/free do nothing, is_busy reports idle. + * Only the 1D variant is provided — none of the L3 training fixtures + * emit 2D transfers; if a future model does, add the equivalent loop + * here. */ +#ifdef DEEPLOY_L1_AS_L2 + +#include + +static inline void mchan_transfer_1d(uint32_t cmd, void *loc, void *ext) { + uint32_t size = cmd & ((1u << MCHAN_TRANSFER_LEN_SIZE) - 1); + if (cmd & MCHAN_CMD_FLAG_DIRECTION_EXT2LOC) { + memcpy(loc, ext, size); + } else { + memcpy(ext, loc, size); + } +} + +static inline uint32_t mchan_channel_alloc() { return 0; } +static inline void mchan_channel_free(uint32_t channel_id) { (void)channel_id; } +static inline uint32_t mchan_channel_is_busy(uint32_t channel_id) { + (void)channel_id; + return 0; +} +static inline void mchan_channel_wait(uint32_t channel_id) { (void)channel_id; } + +#else + static volatile uint32_t *const cmd_ptr = (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x0); static volatile uint32_t *const status_ptr = @@ -117,4 +153,6 @@ static void mchan_channel_wait(uint32_t channel_id) { #endif } +#endif /* DEEPLOY_L1_AS_L2 */ + #endif // __MCHAN_V7_H__ From 734134f6da6e01e48a9d6b67dd5d9dc74fc76e04 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 22:01:10 +0000 Subject: [PATCH 12/18] fix(untiled): skip MobileNetV1 sim in CI (known FC LSU crash) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3 of 4 L3-untiled fixtures produced clean cycle counts in run 25640520241 — CCT, CCT_LoRA, ResNet8. MobileNetV1 sim crashed at "update 1/4 accum 1/1 (mini-batch 0)" with: /chip/soc/fc/lsu] Invalid access (pc: 0x1c010034, offset: 0xbf851e33, size: 0x1, is_write: 0) The bad offset 0xbf851e33 is the float32 bit pattern of -1.039984, which is testData_mb0_buf0[1] — i.e. some float value is being dereferenced as a pointer. Likely one of the FC-side helper macros (l3_aware_copy, IS_L2, ram_write) loads a void* from a buffer that's been overwritten with float data, but only MobileNet's specific L2 footprint triggers the misalignment. Defer sim and ship the 3 working fixtures; bisect the FC harness in a follow-up. --- DeeployTest/test_siracusa_tiled_config.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 2dd8f2ba..3ef530c8 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -229,7 +229,16 @@ "Models/Training/MobileNetV1/mobilenetv1_train": { "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, - "skip_sim_in_ci": False, + # KNOWN ISSUE: sim crashes during update 1/4 with FC LSU + # "Invalid access (pc: 0x1c010034, offset: 0xbf851e33)" — the + # bad address 0xbf851e33 happens to be the float32 bit pattern + # of -1.039984, which is testData_mb0_buf0[1]. Signature of a + # float-value being dereferenced as a pointer somewhere in the + # FC harness, surfaced only by MobileNet's larger L2 footprint + # under the sed+memcpy untiled mode. The other 3 L3 fixtures + # (CCT / CCT_LoRA / ResNet8) all produce clean cycle counts. + # Sim deferred until the root cause is bisected. + "skip_sim_in_ci": True, }, } From 23f1a67e800729cd65171d404e2e47a457d7a3f7 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 22:04:30 +0000 Subject: [PATCH 13/18] fix(untiled): cap MobileNet to 1-step/1-accum to fit FC L2 heap CCT/CCT_LoRA/ResNet8 untiled L3 produced cycles cleanly in run 25640520241. MobileNetV1 sim crashed in update 1 with a float-as-pointer deref. Hypothesis: testinputs.h's 4-batch data (~2.8 MB compiled into the FC L2 .data section) plus the 1042 KB post-sed L1+L2 working buffer exhausts the FC L2 heap (~1.94 MB usable), causing a downstream pi_l2_malloc to land in invalid memory. Capping MobileNet to n_steps=1, n_accum=1 shrinks testinputs.h ~4x and should free enough heap for the post-sed dynamic alloc to succeed. The per-step train_cycles measurement remains valid since the loop work per step is identical. Plumbed via two new optional fixture fields (n_steps, n_accum) that turn into --n-steps / --n-accum gen_args. Other fixtures unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_platforms.py | 9 +++++++++ DeeployTest/test_siracusa_tiled_config.py | 20 ++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index ed489569..71f993e9 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -461,6 +461,14 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha # DEEPLOY_L1_AS_L2 is what flips mchan_transfer_1d to memcpy in mchan_v7.h — # mandatory partner of the codegen sed below. extra_cmake = list(cmake_args) + ["-DDEEPLOY_L1_AS_L2=ON"] + # Optional per-fixture training-step caps. Some untiled-L3 models hit FC + # L2 heap limits when testinputs.h carries 4-batch data; capping reduces + # the .data footprint while keeping per-step cycle measurement valid. + extra_gen = [] + if "n_steps" in fixture: + extra_gen.append(f"--n-steps={fixture['n_steps']}") + if "n_accum" in fixture: + extra_gen.append(f"--n-accum={fixture['n_accum']}") config = create_test_config( test_name = test_name, platform = "Siracusa", @@ -478,6 +486,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha training = True, training_num_data_inputs = overrides.get("num_data_inputs"), training_tolerance = overrides.get("tolerance"), + gen_args = extra_gen, ) # Inline the test runner stages so we can sed between codegen and build. diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 3ef530c8..ab39185a 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -229,16 +229,16 @@ "Models/Training/MobileNetV1/mobilenetv1_train": { "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, - # KNOWN ISSUE: sim crashes during update 1/4 with FC LSU - # "Invalid access (pc: 0x1c010034, offset: 0xbf851e33)" — the - # bad address 0xbf851e33 happens to be the float32 bit pattern - # of -1.039984, which is testData_mb0_buf0[1]. Signature of a - # float-value being dereferenced as a pointer somewhere in the - # FC harness, surfaced only by MobileNet's larger L2 footprint - # under the sed+memcpy untiled mode. The other 3 L3 fixtures - # (CCT / CCT_LoRA / ResNet8) all produce clean cycle counts. - # Sim deferred until the root cause is bisected. - "skip_sim_in_ci": True, + # MobileNet's 4-batch testinputs.h is ~2.8 MB of static .data — at + # the limit of FC L2 heap when combined with the post-sed L1+L2 + # working buffer (1042 KB). Cutting to 1 train step × 1 accum + # step shrinks testinputs.h ~4x and frees enough heap for the + # remaining pi_l2_malloc calls to land in valid memory. Cycle + # numbers are still meaningful: per-step train cycles are what + # we want to compare against the tiled L3 baseline. + "n_steps": 1, + "n_accum": 1, + "skip_sim_in_ci": False, }, } From 13c4f674a2fa8d7bedf09d668908944d104270be Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 22:18:12 +0000 Subject: [PATCH 14/18] fix(untiled): force MobileNet num_data_inputs=1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CCT/CCT_LoRA pass with num_data_inputs=1 (from MODEL_OVERRIDES); MobileNet auto-detects 2 and crashes. Forcing 1 gets us a single-input training step, comparable to the existing tiled L3 cycle baseline. Adds fixture-level num_data_inputs that overrides the global MODEL_OVERRIDES value — needed only for fixtures whose multi-input default surfaces a codegen bug under the sed+memcpy untiled mode. --- DeeployTest/test_platforms.py | 6 +++++- DeeployTest/test_siracusa_tiled_config.py | 14 +++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 71f993e9..b814bb89 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -469,6 +469,10 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha extra_gen.append(f"--n-steps={fixture['n_steps']}") if "n_accum" in fixture: extra_gen.append(f"--n-accum={fixture['n_accum']}") + # Per-fixture num_data_inputs override (lets a fixture force the value + # the model overrides don't set globally — needed when a multi-input + # model triggers a code-path bug only with NUM_DATA_INPUTS > 1). + fixture_num_data = fixture.get("num_data_inputs", overrides.get("num_data_inputs")) config = create_test_config( test_name = test_name, platform = "Siracusa", @@ -484,7 +488,7 @@ def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolcha default_mem_level = "L3", double_buffer = False, training = True, - training_num_data_inputs = overrides.get("num_data_inputs"), + training_num_data_inputs = fixture_num_data, training_tolerance = overrides.get("tolerance"), gen_args = extra_gen, ) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index ab39185a..4b9ccc33 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -229,15 +229,15 @@ "Models/Training/MobileNetV1/mobilenetv1_train": { "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA "l2": 2_000_000, - # MobileNet's 4-batch testinputs.h is ~2.8 MB of static .data — at - # the limit of FC L2 heap when combined with the post-sed L1+L2 - # working buffer (1042 KB). Cutting to 1 train step × 1 accum - # step shrinks testinputs.h ~4x and frees enough heap for the - # remaining pi_l2_malloc calls to land in valid memory. Cycle - # numbers are still meaningful: per-step train cycles are what - # we want to compare against the tiled L3 baseline. + # Cap training schedule (testinputs.h shrinks ~4x) AND force 1 + # data input. CCT/CCT_LoRA's MODEL_OVERRIDES has num_data_inputs=1 + # and they pass; MobileNet's default DATA_INPUTS=2 may surface a + # second-input handling bug that's masked when only one input is + # consumed. A 1-step + 1-input run is still apples-to-apples for + # per-step train_cycles vs tiled L3. "n_steps": 1, "n_accum": 1, + "num_data_inputs": 1, "skip_sim_in_ci": False, }, } From ca3de155dd4b1e000708e06b00cc6a0fe075128a Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 23:16:55 +0000 Subject: [PATCH 15/18] fix(untiled): big CCT (img_size=32, emb_dim=128) needs --l1=800K MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CCT fixture now points at the big-CCT ONNX (devel #23 — 1.16 MB inputs.npz, 4.66 MB L3 storage). Old --l1=64K was sized for the toy 8x8 / dim=32 CCT (peak L1 = 16 KB) and produces 20 distinct tile shapes on the new model. --l1=800K is the smallest value that reaches the near-untiled shape (3 tile shapes, peak L1 = 524 KB) — the values in between (200K-400K) trip the SBTiler "Keys should be the same while generating DMA transfer for tensor 'data_in'/'data_out'" assert. Add the same n_steps=1 / n_accum=1 / num_data_inputs=1 caps as MobileNet to keep testinputs.h's .data footprint inside the FC L2 heap. --- DeeployTest/test_siracusa_tiled_config.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 4b9ccc33..b91deece 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -209,8 +209,18 @@ # or a beefier runner. L3_UNTILED_TRAINING_MODELS = { "Models/Training/CCT/cct_train": { - "l1": 64_000, + # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak + # L1 working = 524 KB > physical L1 (256 KB). Same regime as + # ResNet8/MobileNet now. --l1=200K..400K trip a codegen assert + # ("Keys should be the same while generating DMA transfer for + # tensor 'data_in'/'data_out'"); 800K is the smallest value that + # gets through to a clean schedule. + "l1": 800_000, "l2": 2_000_000, + # Cap to 1 step so testinputs.h doesn't blow .data section. + "n_steps": 1, + "n_accum": 1, + "num_data_inputs": 1, "skip_sim_in_ci": False, }, "Models/Training/CCT_LoRA/cct_lora_train": { From 0e279464536aa01228395823babe7bbc6f533547 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 23:19:54 +0000 Subject: [PATCH 16/18] =?UTF-8?q?ci(untiled):=20isolate=20big-CCT=20only?= =?UTF-8?q?=20=E2=80=94=20disable=20other=203=20fixtures=20temporarily?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comment out CCT_LoRA / ResNet8 / MobileNetV1 entries in L3_UNTILED_TRAINING_MODELS so this CI run measures only the new big CCT (img_size=32, embedding_dim=128) untiled cycle. Restore before merging. --- DeeployTest/test_siracusa_tiled_config.py | 49 ++++++++++------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index b91deece..ccc6cc90 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -223,33 +223,28 @@ "num_data_inputs": 1, "skip_sim_in_ci": False, }, - "Models/Training/CCT_LoRA/cct_lora_train": { - "l1": 64_000, - "l2": 2_000_000, - "skip_sim_in_ci": False, - }, - "Models/Training/ResNet8/resnet8_train": { - # 800 KB is the smallest --l1 that yields the minimal-tile shape - # (peak L1 working = 739 KB). Larger values inflate MiniMalloc's - # RAM appetite past CI's ceiling. - "l1": 800_000, - "l2": 2_000_000, - "skip_sim_in_ci": False, - }, - "Models/Training/MobileNetV1/mobilenetv1_train": { - "l1": 800_000, # below 800K codegen asserts on accum_buffer DMA - "l2": 2_000_000, - # Cap training schedule (testinputs.h shrinks ~4x) AND force 1 - # data input. CCT/CCT_LoRA's MODEL_OVERRIDES has num_data_inputs=1 - # and they pass; MobileNet's default DATA_INPUTS=2 may surface a - # second-input handling bug that's masked when only one input is - # consumed. A 1-step + 1-input run is still apples-to-apples for - # per-step train_cycles vs tiled L3. - "n_steps": 1, - "n_accum": 1, - "num_data_inputs": 1, - "skip_sim_in_ci": False, - }, + # Other 3 fixtures (CCT_LoRA, ResNet8, MobileNetV1) temporarily + # disabled so this CI run isolates the big-CCT untiled measurement. + # Restore the entries below before merging. + # + # "Models/Training/CCT_LoRA/cct_lora_train": { + # "l1": 64_000, + # "l2": 2_000_000, + # "skip_sim_in_ci": False, + # }, + # "Models/Training/ResNet8/resnet8_train": { + # "l1": 800_000, + # "l2": 2_000_000, + # "skip_sim_in_ci": False, + # }, + # "Models/Training/MobileNetV1/mobilenetv1_train": { + # "l1": 800_000, + # "l2": 2_000_000, + # "n_steps": 1, + # "n_accum": 1, + # "num_data_inputs": 1, + # "skip_sim_in_ci": False, + # }, } # Per-model overrides for training tests. From 5c2ab9f4bcd4e014fefdbb70a7df0d185abbaa2a Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 23:46:21 +0000 Subject: [PATCH 17/18] ci(untiled): re-enable L3-singlebuffer to capture big-CCT tiled cycle Restores the siracusa-training-tiled-l3-singlebuffer job so we get a fresh tiled measurement for the big CCT (img_size=32, embedding_dim=128) that landed in devel #23. L2 singlebuffer stays commented out (other L2 numbers from the existing benchmark figure are still valid). --- .../workflows/ci-platform-siracusa-tiled.yml | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index 5536fda5..f443d2b1 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -29,10 +29,9 @@ jobs: with: docker_image_deeploy: ${{ inputs.docker_image_deeploy }} - # NOTE: All other Siracusa-tiled jobs are temporarily disabled on this - # branch so the L3-untiled job runs in isolation while we collect - # cycle-count data. Restore the L2 / L3 singlebuffer entries below - # before merging. + # NOTE: L2 singlebuffer still commented out — only need fresh L3 + # singlebuffer numbers for the big-CCT tiled cycle (other 3 already + # measured). Restore the L2 entry below before merging. # # # Training tests - L2 singlebuffer # siracusa-training-tiled-l2-singlebuffer: @@ -42,15 +41,15 @@ jobs: # runner: ${{ needs.select-env.outputs.runner }} # docker-image: ${{ needs.select-env.outputs.image }} # pytest-marker: "training and l2 and singlebuffer" - # - # # Training tests - L3 singlebuffer - # siracusa-training-tiled-l3-singlebuffer: - # needs: select-env - # uses: ./.github/workflows/_runner-siracusa-tiled.yml - # with: - # runner: ${{ needs.select-env.outputs.runner }} - # docker-image: ${{ needs.select-env.outputs.image }} - # pytest-marker: "training and l3 and singlebuffer" + + # Training tests - L3 singlebuffer (re-run to pick up big-CCT tiled). + siracusa-training-tiled-l3-singlebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "training and l3 and singlebuffer" # Training tests - L3 untiled baseline. Codegen post-process rewrites # every L1-annotated buffer to FC L2 so cluster cores access kernel From 7b4038d0fd36db1b35c822933bbf71a4c88d6357 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Mon, 11 May 2026 00:14:06 +0000 Subject: [PATCH 18/18] =?UTF-8?q?ci(untiled):=20drop=20n=5Fsteps=3D1=20cap?= =?UTF-8?q?=20on=20CCT=20=E2=80=94=20use=20default=204-step=20schedule?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DeeployTest/test_siracusa_tiled_config.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index ccc6cc90..30511320 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -165,11 +165,13 @@ # Training-enabled tiled models that need L3 spill (weights/activations don't # fit in L2). Same shape: test path -> list of L1 sizes (bytes). +# TEMPORARY: only big-CCT enabled — the other 3 already have tiled cycle +# data from earlier CI runs. Restore the entries below before merging. L3_SINGLEBUFFER_TRAINING_MODELS = { - "Models/Training/ResNet8/resnet8_train": [128000], - "Models/Training/MobileNetV1/mobilenetv1_train": [128000], + # "Models/Training/ResNet8/resnet8_train": [128000], + # "Models/Training/MobileNetV1/mobilenetv1_train": [128000], "Models/Training/CCT/cct_train": [128000], - "Models/Training/CCT_LoRA/cct_lora_train": [128000], + # "Models/Training/CCT_LoRA/cct_lora_train": [128000], } # Untiled-L3 baseline. Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but @@ -210,16 +212,15 @@ L3_UNTILED_TRAINING_MODELS = { "Models/Training/CCT/cct_train": { # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak - # L1 working = 524 KB > physical L1 (256 KB). Same regime as - # ResNet8/MobileNet now. --l1=200K..400K trip a codegen assert - # ("Keys should be the same while generating DMA transfer for - # tensor 'data_in'/'data_out'"); 800K is the smallest value that - # gets through to a clean schedule. + # L1 working = 524 KB > physical L1 (256 KB). --l1=200K..400K + # trip a codegen assert ("Keys should be the same while generating + # DMA transfer for tensor 'data_in'/'data_out'"); 800K is the + # smallest value that gets through to a clean schedule. "l1": 800_000, "l2": 2_000_000, - # Cap to 1 step so testinputs.h doesn't blow .data section. - "n_steps": 1, - "n_accum": 1, + # Use the default training schedule (n_steps=4 / n_accum=1 from + # inputs.npz) so per-step cycles are computed the same way as the + # tiled L3 baseline (BENCH total / 4). "num_data_inputs": 1, "skip_sim_in_ci": False, },