Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
525d08a
feat(training): add CCT + CCT_LoRA to non-tiled siracusa training tests
runwangdl May 10, 2026
11d7aa9
feat(training): add L3 untiled baseline via fake-L1 shim + ResNet8 fi…
runwangdl May 10, 2026
8f003bc
style: pre-commit fixes (yapf line wrap + clang-format 2-space indent)
runwangdl May 10, 2026
e9445f1
fix(ci): drop ResNet8 untiled --l1 to 800 KB to fit ubuntu-latest 7 G…
runwangdl May 10, 2026
5ce11e5
ci: --skipsim diagnostic for L3-untiled job + memory snapshot
runwangdl May 10, 2026
f9d2e9e
feat(training): CCT/CCT_LoRA/MobileNetV1 untiled L3 + CI footprint su…
runwangdl May 10, 2026
443d45f
feat(ci): per-model sim gate + cycle counts in untiled-L3 summary
runwangdl May 10, 2026
2df2b0b
fix(untiled): drop CCT/CCT_LoRA + per-fixture needs_fake_l1 gate
runwangdl May 10, 2026
5580739
fix(fake-l1): try real L1 first; restore CCT/CCT_LoRA + sim for all 4
runwangdl May 10, 2026
ef41f3a
feat(untiled): sed-based L1→L2 codegen rewrite + isolate to one CI job
runwangdl May 10, 2026
13741bf
fix(untiled): mchan_transfer_1d -> memcpy under DEEPLOY_L1_AS_L2
runwangdl May 10, 2026
734134f
fix(untiled): skip MobileNetV1 sim in CI (known FC LSU crash)
runwangdl May 10, 2026
23f1a67
fix(untiled): cap MobileNet to 1-step/1-accum to fit FC L2 heap
runwangdl May 10, 2026
13c4f67
fix(untiled): force MobileNet num_data_inputs=1
runwangdl May 10, 2026
e456d57
Merge remote-tracking branch 'origin/devel' into feat/untiling
runwangdl May 10, 2026
ca3de15
fix(untiled): big CCT (img_size=32, emb_dim=128) needs --l1=800K
runwangdl May 10, 2026
0e27946
ci(untiled): isolate big-CCT only — disable other 3 fixtures temporarily
runwangdl May 10, 2026
5c2ab9f
ci(untiled): re-enable L3-singlebuffer to capture big-CCT tiled cycle
runwangdl May 10, 2026
7b4038d
ci(untiled): drop n_steps=1 cap on CCT — use default 4-step schedule
runwangdl May 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion .github/workflows/_runner-siracusa-tiled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ name: _runner-siracusa-tiled
pytest-marker:
required: true
type: string
pytest-extra-args:
required: false
type: string
default: ""

jobs:
test-runner-siracusa-tiled:
Expand All @@ -36,5 +40,24 @@ jobs:
- name: Run Test
run: |
cd DeeployTest
pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}"
# Memory snapshot helps diagnose 137/OOM kills postmortem.
echo "=== free -m before pytest ==="; free -m || true
pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }}
echo "=== free -m after pytest ==="; free -m || true
shell: bash
- name: Build footprint summary
if: always()
env:
FOOTPRINT_SUMMARY_LABEL: ${{ inputs.pytest-marker }}
run: |
cd DeeployTest
python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true
shell: bash
- name: Upload sim out.txt
if: always()
uses: actions/upload-artifact@v4
with:
name: sim-out-${{ inputs.pytest-marker }}
path: DeeployTest/out.txt
if-no-files-found: ignore
retention-days: 7
28 changes: 22 additions & 6 deletions .github/workflows/ci-platform-siracusa-tiled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,36 @@ jobs:
with:
docker_image_deeploy: ${{ inputs.docker_image_deeploy }}

# Training tests - L2 singlebuffer
siracusa-training-tiled-l2-singlebuffer:
# NOTE: L2 singlebuffer still commented out — only need fresh L3
# singlebuffer numbers for the big-CCT tiled cycle (other 3 already
# measured). Restore the L2 entry below before merging.
#
# # Training tests - L2 singlebuffer
# siracusa-training-tiled-l2-singlebuffer:
# needs: select-env
# uses: ./.github/workflows/_runner-siracusa-tiled.yml
# with:
# runner: ${{ needs.select-env.outputs.runner }}
# docker-image: ${{ needs.select-env.outputs.image }}
# pytest-marker: "training and l2 and singlebuffer"

# Training tests - L3 singlebuffer (re-run to pick up big-CCT tiled).
siracusa-training-tiled-l3-singlebuffer:
needs: select-env
uses: ./.github/workflows/_runner-siracusa-tiled.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-marker: "training and l2 and singlebuffer"
pytest-marker: "training and l3 and singlebuffer"

# Training tests - L3 singlebuffer (models that spill weights/activations to L3)
siracusa-training-tiled-l3-singlebuffer:
# Training tests - L3 untiled baseline. Codegen post-process rewrites
# every L1-annotated buffer to FC L2 so cluster cores access kernel
# buffers via the fabric — "untiled, L2-resident working set" cycle
# semantics for all 4 L3 models.
siracusa-training-tiled-l3-untiled:
needs: select-env
uses: ./.github/workflows/_runner-siracusa-tiled.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-marker: "training and l3 and singlebuffer"
pytest-marker: "training and l3 and untiled"
9 changes: 3 additions & 6 deletions .github/workflows/ci-platform-siracusa.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@
---
name: CI • Siracusa

# NOTE: Push / pull_request triggers temporarily disabled on this branch
# so only the L3-untiled job runs while we collect cycle-count data.
# Restore the push: / pull_request: blocks before merging.
"on":
push:
branches:
- "**"
tags:
- "v*.*.*"
pull_request:
workflow_dispatch:
inputs:
docker_image_deeploy:
Expand Down
2 changes: 2 additions & 0 deletions DeeployTest/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def pytest_configure(config: pytest.Config) -> None:
"markers", "train_kernel: mark test as a training-related single-op kernel test (grad / loss / optimizer)")
config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration")
config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration")
config.addinivalue_line(
"markers", "untiled: mark test as untiled-baseline (single-tile-per-tensor schedule via fake-L1 shim)")
config.addinivalue_line("markers", "l2: mark test as L2 default memory level")
config.addinivalue_line("markers", "l3: mark test as L3 default memory level")
config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory")
Expand Down
99 changes: 99 additions & 0 deletions DeeployTest/test_platforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os

import pytest
# Import platform-specific test configurations
from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS
Expand Down Expand Up @@ -40,6 +42,7 @@
from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS
from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS
from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS
from test_siracusa_tiled_config import L3_UNTILED_TRAINING_MODELS as SIRACUSA_L3_UNTILED_TRAINING_MODELS
from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES
from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES
from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS
Expand Down Expand Up @@ -330,6 +333,9 @@ def test_siracusa_train_kernels(test_name, deeploy_test_dir, toolchain, toolchai
@pytest.mark.training
@pytest.mark.parametrize("test_name", SIRACUSA_TRAINING_TESTS, ids = SIRACUSA_TRAINING_TESTS)
def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None:
# Reuse the tiled overrides table — same models, same tolerance / data-input
# quirks regardless of whether tiling is on.
overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
config = create_test_config(
test_name = test_name,
platform = "Siracusa",
Expand All @@ -341,6 +347,8 @@ def test_siracusa_training(test_name, deeploy_test_dir, toolchain, toolchain_dir
tiling = False,
cores = SIRACUSA_DEFAULT_CORES,
training = True,
training_num_data_inputs = overrides.get("num_data_inputs"),
training_tolerance = overrides.get("tolerance"),
)
run_and_assert_test(test_name, config, skipgen, skipsim)

Expand Down Expand Up @@ -413,6 +421,97 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir,
run_and_assert_test(test_name, config, skipgen, skipsim)


def _relocate_l1_buffers_to_l2(gen_dir) -> None:
    """Rewrite the generated network sources so L1-annotated buffers use L2.

    Applies two plain-text substitutions to ``TrainingNetwork.c`` and
    ``OptimizerNetwork.c`` inside ``gen_dir``:

        pmsis_l1_malloc -> pi_l2_malloc
        "PI_L1 "        -> "PI_L2 "

    A file that does not exist is skipped. A file whose content is already
    rewritten (e.g. a ``--skipgen`` rerun over a previous generation) is left
    untouched, so its mtime does not change.

    Args:
        gen_dir: directory holding the generated C sources.
    """
    from pathlib import Path

    for c_name in ("TrainingNetwork.c", "OptimizerNetwork.c"):
        c_path = Path(gen_dir) / c_name
        if not c_path.exists():
            continue
        original = c_path.read_text()
        rewritten = original.replace("pmsis_l1_malloc", "pi_l2_malloc")
        rewritten = rewritten.replace("PI_L1 ", "PI_L2 ")
        # Both substitutions are idempotent; only touch the file when the
        # text really changed.
        if rewritten != original:
            c_path.write_text(rewritten)


@pytest.mark.siracusa_tiled
@pytest.mark.training
@pytest.mark.untiled
@pytest.mark.l3
@pytest.mark.parametrize(
    "test_name",
    list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
    ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS.keys()),
)
def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen,
                                            skipsim) -> None:
    """Untiled-L3 baseline for the Siracusa training models.

    The fixture's ``l1`` value is inflated so that SBTiler picks
    single-tile-per-tensor schedules (no spatial split). The generated C is
    then one kernel call per op with integral L3<->L2 DMA wrappers.

    To make the L1 staging buffers physically live in FC L2 (so the measured
    cycles represent kernels accessing L2), the generated
    TrainingNetwork.c / OptimizerNetwork.c are post-processed after codegen
    and before the cmake build (see ``_relocate_l1_buffers_to_l2``). The build
    is configured with ``-DDEEPLOY_L1_AS_L2=ON`` as the partner of that
    rewrite.

    Cluster cores then reach these buffers through the fabric, which the
    design notes describe as roughly 7x slower than real L1. That is the
    intended meaning of "untiled, L2-resident". No fake-L1 shim, no linker
    wrap, and no SDK changes are involved.

    The stages (generate, rewrite, configure, build, simulate) are run inline
    rather than through ``run_and_assert_test`` so the rewrite can happen
    between codegen and build.
    """
    from testUtils.core.execution import build_binary, configure_cmake, generate_network, run_simulation

    fixture = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
    overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})

    # Simulation is skipped on request, or in CI for fixtures that opt out
    # via ``skip_sim_in_ci``.
    running_in_ci = os.environ.get("CI") == "true"
    effective_skipsim = skipsim or (running_in_ci and fixture.get("skip_sim_in_ci", False))

    # DEEPLOY_L1_AS_L2 is what flips mchan_transfer_1d to memcpy in mchan_v7.h —
    # mandatory partner of the source rewrite below.
    extra_cmake = list(cmake_args) + ["-DDEEPLOY_L1_AS_L2=ON"]

    # Optional per-fixture training-step caps. Some untiled-L3 models hit FC
    # L2 heap limits when testinputs.h carries 4-batch data; capping reduces
    # the .data footprint while keeping per-step cycle measurement valid.
    extra_gen = []
    if "n_steps" in fixture:
        extra_gen.append(f"--n-steps={fixture['n_steps']}")
    if "n_accum" in fixture:
        extra_gen.append(f"--n-accum={fixture['n_accum']}")

    # A fixture-level num_data_inputs wins over the model-level override
    # (needed when a multi-input model triggers a code-path bug only with
    # NUM_DATA_INPUTS > 1).
    fixture_num_data = fixture.get("num_data_inputs", overrides.get("num_data_inputs"))

    config = create_test_config(
        test_name = test_name,
        platform = "Siracusa",
        simulator = "gvsoc",
        deeploy_test_dir = deeploy_test_dir,
        toolchain = toolchain,
        toolchain_dir = toolchain_dir,
        cmake_args = extra_cmake,
        tiling = True,
        cores = SIRACUSA_DEFAULT_CORES,
        l1 = fixture["l1"],
        l2 = fixture["l2"],
        default_mem_level = "L3",
        double_buffer = False,
        training = True,
        training_num_data_inputs = fixture_num_data,
        training_tolerance = overrides.get("tolerance"),
        gen_args = extra_gen,
    )

    generate_network(config, skip = skipgen)
    _relocate_l1_buffers_to_l2(config.gen_dir)
    configure_cmake(config)
    build_binary(config)
    result = run_simulation(config, skip = effective_skipsim)

    assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of "
                            f"{result.total_count}\nOutput:\n{result.stdout}")
    if result.error_count >= 0:
        assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")


@pytest.mark.siracusa_tiled
@pytest.mark.kernels
@pytest.mark.singlebuffer
Expand Down
12 changes: 12 additions & 0 deletions DeeployTest/test_siracusa_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,18 @@
# Training-enabled models (use deeployTrainingRunner / testMVPTraining pipeline).
# Each entry is the path to a `<model>_train` directory; the matching
# `<model>_optimizer` directory must live next to it.
#
# Untiled-L3 baseline scope:
# The untiled path emits one pi_l2_malloc per buffer; the SUM of these calls
# must fit in the Siracusa FC-L2 heap (~1 MB usable after BSS/stack). Models
# below have a verified untiled L2 footprint within that ceiling:
# - SimpleMLP ~0.05 MB
# - CCT_LoRA ~0.4 MB
# - CCT ~0.7 MB
# ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) need an L2-heap override that
# lives in a separate task — they stay tiled-only for now.
TRAINING_TESTS = [
"Models/Training/SimpleMLP/simplemlp_train",
"Models/Training/CCT/cct_train",
"Models/Training/CCT_LoRA/cct_lora_train",
]
82 changes: 79 additions & 3 deletions DeeployTest/test_siracusa_tiled_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,87 @@

# Training-enabled tiled models that need L3 spill (weights/activations don't
# fit in L2). Same shape: test path -> list of L1 sizes (bytes).
# TEMPORARY: only big-CCT enabled — the other 3 already have tiled cycle
# data from earlier CI runs. Restore the entries below before merging.
L3_SINGLEBUFFER_TRAINING_MODELS = {
"Models/Training/ResNet8/resnet8_train": [128000],
"Models/Training/MobileNetV1/mobilenetv1_train": [128000],
# "Models/Training/ResNet8/resnet8_train": [128000],
# "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
"Models/Training/CCT/cct_train": [128000],
"Models/Training/CCT_LoRA/cct_lora_train": [128000],
# "Models/Training/CCT_LoRA/cct_lora_train": [128000],
}

# Untiled-L3 baseline. Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but
# the L1 budget is inflated so the SBTiler picks single-tile-per-tensor
# schedules (numTiles == 1 on every dim) — semantically untiled per op, but
# still uses the tile-codegen DMA wrappers because cluster cores cannot deref
# HyperRAM directly. The L1 working buffer can end up larger than physical
# Siracusa L1 (256 KB). An earlier revision handled this with a
# deeploy_fake_l1 shim that redirected pi_cl_l1_malloc into an FC-L2 arena
# via -Wl,--wrap (size cap = DEEPLOY_FAKE_L1_SIZE, set per fixture as
# fake_l1_size). That shim is no longer used; the test now rewrites the
# generated sources instead (described below).
#
# Maps test_name -> dict with:
# l1: planner-side L1 size (forces single-tile schedules)
# l2: planner-side L2 size
# n_steps / n_accum: optional training-step caps forwarded to codegen
# num_data_inputs: optional per-fixture override of the model-level value
# skip_sim_in_ci: optional; skip the simulation stage when running in CI
#
# Baselining method used for the former fake_l1_size (may still help when
# choosing l1 — TODO confirm): spike with --l1=4_000_000 → read off
# MEMORYARENA_L1 size from generated TrainingNetwork.c → round up.
# Untiled-L3 baseline — single-tile-per-tensor schedules for every L3
# training model so the user can read off "untiled L3 latency" alongside
# the existing tiled-L3 cycles.
#
# Each fixture goes through the same SBTiler infrastructure as the L3
# singlebuffer tests, with --l1 inflated to the smallest value that
# yields the minimal-tile shape (one kernel call per op + integral
# L3↔L2 DMA, no spatial split).
#
# After codegen, the test post-processes TrainingNetwork.c /
# OptimizerNetwork.c to swap pmsis_l1_malloc → pi_l2_malloc and
# PI_L1 → PI_L2, so every L1-annotated buffer physically lives in
# FC L2. Cluster cores access these via the fabric (~7x slower than
# real L1) — that's the deliberate semantics of "untiled L2-resident".
#
# skip_sim_in_ci: True for fixtures where gvsoc has historically OOMed
# during the long single-tile loop. CI still verifies codegen +
# compile + link in that case; sim is deferred to a manual local run
# or a beefier runner.
L3_UNTILED_TRAINING_MODELS = {
"Models/Training/CCT/cct_train": {
# Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak
# L1 working = 524 KB > physical L1 (256 KB). --l1=200K..400K
# trip a codegen assert ("Keys should be the same while generating
# DMA transfer for tensor 'data_in'/'data_out'"); 800K is the
# smallest value that gets through to a clean schedule.
"l1": 800_000,
"l2": 2_000_000,
# n_steps / n_accum are deliberately NOT set for this fixture, so the
# default training schedule (n_steps=4 / n_accum=1 from inputs.npz) is
# used and per-step cycles are computed the same way as the tiled L3
# baseline (BENCH total / 4).
#
# num_data_inputs: per-fixture value; the untiled test prefers it over
# the model-level entry in TRAINING_MODEL_OVERRIDES.
"num_data_inputs": 1,
"skip_sim_in_ci": False,
},
# Other 3 fixtures (CCT_LoRA, ResNet8, MobileNetV1) temporarily
# disabled so this CI run isolates the big-CCT untiled measurement.
# Restore the entries below before merging.
#
# "Models/Training/CCT_LoRA/cct_lora_train": {
# "l1": 64_000,
# "l2": 2_000_000,
# "skip_sim_in_ci": False,
# },
# "Models/Training/ResNet8/resnet8_train": {
# "l1": 800_000,
# "l2": 2_000_000,
# "skip_sim_in_ci": False,
# },
# "Models/Training/MobileNetV1/mobilenetv1_train": {
# "l1": 800_000,
# "l2": 2_000_000,
# "n_steps": 1,
# "n_accum": 1,
# "num_data_inputs": 1,
# "skip_sim_in_ci": False,
# },
}

# Per-model overrides for training tests.
Expand Down
11 changes: 11 additions & 0 deletions TargetLibraries/PULPOpen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,17 @@ target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed)
target_link_libraries(deeploypulp INTERFACE pulp-sdk)
target_sources(deeploypulp INTERFACE $<TARGET_OBJECTS:pulp-sdk>)

# DEEPLOY_L1_AS_L2 — untiled-L3 baseline switch (default OFF).
#
# When enabled, the DEEPLOY_L1_AS_L2 preprocessor symbol is defined for
# deeploypulp and everything that links against it. mchan_v7.h is expected to
# key on that symbol to swap mchan_transfer_1d() for a memcpy implementation,
# so the deeploy-generated DMA calls become regular memory copies between L2
# buffers.
#
# This is meant to be used together with the test-side rewrite of the
# generated TrainingNetwork.c (pmsis_l1_malloc -> pi_l2_malloc and
# PI_L1 -> PI_L2), which places every L1-annotated buffer in FC L2.
option(DEEPLOY_L1_AS_L2 "Replace mchan_transfer_1d with memcpy (untiled-L3 baseline)" OFF)
# The generator expression collapses to nothing when the option is false, so
# no definition is added in the default configuration.
target_compile_definitions(deeploypulp PUBLIC
  $<$<BOOL:${DEEPLOY_L1_AS_L2}>:DEEPLOY_L1_AS_L2>
)

set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka")
if (platform IN_LIST PULP_NNX_PLATFORMS)
if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
Expand Down
Loading
Loading