diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index cc09f234..16047857 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -17,6 +17,10 @@ name: _runner-siracusa-tiled pytest-marker: required: true type: string + pytest-extra-args: + required: false + type: string + default: "" jobs: test-runner-siracusa-tiled: @@ -36,5 +40,24 @@ jobs: - name: Run Test run: | cd DeeployTest - pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" + # Memory snapshot helps diagnose 137/OOM kills postmortem. + echo "=== free -m before pytest ==="; free -m || true + pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" ${{ inputs.pytest-extra-args }} + echo "=== free -m after pytest ==="; free -m || true shell: bash + - name: Build footprint summary + if: always() + env: + FOOTPRINT_SUMMARY_LABEL: ${{ inputs.pytest-marker }} + run: | + cd DeeployTest + python3 ../scripts/ci_footprint_summary.py TEST_SIRACUSA || true + shell: bash + - name: Upload sim out.txt + if: always() + uses: actions/upload-artifact@v4 + with: + name: sim-out-${{ inputs.pytest-marker }} + path: DeeployTest/out.txt + if-no-files-found: ignore + retention-days: 7 diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index b65cbb75..f443d2b1 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -29,20 +29,36 @@ jobs: with: docker_image_deeploy: ${{ inputs.docker_image_deeploy }} - # Training tests - L2 singlebuffer - siracusa-training-tiled-l2-singlebuffer: + # NOTE: L2 singlebuffer still commented out — only need fresh L3 + # singlebuffer numbers for the big-CCT tiled cycle (other 3 already + # measured). Restore the L2 entry below before merging. 
+ # + # # Training tests - L2 singlebuffer + # siracusa-training-tiled-l2-singlebuffer: + # needs: select-env + # uses: ./.github/workflows/_runner-siracusa-tiled.yml + # with: + # runner: ${{ needs.select-env.outputs.runner }} + # docker-image: ${{ needs.select-env.outputs.image }} + # pytest-marker: "training and l2 and singlebuffer" + + # Training tests - L3 singlebuffer (re-run to pick up big-CCT tiled). + siracusa-training-tiled-l3-singlebuffer: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l2 and singlebuffer" + pytest-marker: "training and l3 and singlebuffer" - # Training tests - L3 singlebuffer (models that spill weights/activations to L3) - siracusa-training-tiled-l3-singlebuffer: + # Training tests - L3 untiled baseline. Codegen post-process rewrites + # every L1-annotated buffer to FC L2 so cluster cores access kernel + # buffers via the fabric — "untiled, L2-resident working set" cycle + # semantics for all 4 L3 models. + siracusa-training-tiled-l3-untiled: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l3 and singlebuffer" + pytest-marker: "training and l3 and untiled" diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7a4f415e..c1e7db5d 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -5,13 +5,10 @@ --- name: CI • Siracusa +# NOTE: Push / pull_request triggers temporarily disabled on this branch +# so only the L3-untiled job runs while we collect cycle-count data. +# Restore the push: / pull_request: blocks before merging. 
"on": - push: - branches: - - "**" - tags: - - "v*.*.*" - pull_request: workflow_dispatch: inputs: docker_image_deeploy: diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index f29891bf..b13a4cfe 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -75,6 +75,8 @@ def pytest_configure(config: pytest.Config) -> None: "markers", "train_kernel: mark test as a training-related single-op kernel test (grad / loss / optimizer)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration") + config.addinivalue_line( + "markers", "untiled: mark test as untiled-baseline (single-tile-per-tensor schedule via fake-L1 shim)") config.addinivalue_line("markers", "l2: mark test as L2 default memory level") config.addinivalue_line("markers", "l3: mark test as L3 default memory level") config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 7eee2085..b814bb89 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import os + import pytest # Import platform-specific test configurations from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS @@ -40,6 +42,7 @@ from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS +from test_siracusa_tiled_config import L3_UNTILED_TRAINING_MODELS as SIRACUSA_L3_UNTILED_TRAINING_MODELS from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES from test_snitch_config import DEFAULT_NUM_CORES as 
@pytest.mark.siracusa_tiled
@pytest.mark.training
@pytest.mark.untiled
@pytest.mark.l3
@pytest.mark.parametrize(
    "test_name",
    list(SIRACUSA_L3_UNTILED_TRAINING_MODELS),
    ids = list(SIRACUSA_L3_UNTILED_TRAINING_MODELS),
)
def test_siracusa_tiled_training_l3_untiled(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen,
                                            skipsim) -> None:
    """Build and simulate one untiled-L3 training fixture on Siracusa (gvsoc).

    The ``l1`` / ``l2`` budgets are taken from
    ``SIRACUSA_L3_UNTILED_TRAINING_MODELS``; tiling stays enabled, L3 is the
    default memory level and double buffering is off. Tolerance and (unless
    the fixture overrides it) the number of data inputs come from
    ``SIRACUSA_TRAINING_MODEL_OVERRIDES``.

    The runner stages are invoked one by one so the generated sources can be
    patched between code generation and the CMake configure step: in
    ``TrainingNetwork.c`` and ``OptimizerNetwork.c`` (whichever exist)
    ``pmsis_l1_malloc`` is replaced by ``pi_l2_malloc`` and ``PI_L1 `` by
    ``PI_L2 ``. The build is configured with ``-DDEEPLOY_L1_AS_L2=ON``, the
    companion switch of that rewrite.

    Simulation is skipped when ``skipsim`` is set, or when the ``CI``
    environment variable equals ``"true"`` and the fixture sets
    ``skip_sim_in_ci``.
    """
    from pathlib import Path

    from testUtils.core.execution import build_binary, configure_cmake, generate_network, run_simulation

    model_cfg = SIRACUSA_L3_UNTILED_TRAINING_MODELS[test_name]
    model_overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})

    # A fixture may opt out of simulation, but only on CI runs.
    running_in_ci = os.environ.get("CI") == "true"
    effective_skipsim = skipsim or (running_in_ci and model_cfg.get("skip_sim_in_ci", False))

    # The compile definition must accompany the source rewrite done below.
    cmake_flags = [*cmake_args, "-DDEEPLOY_L1_AS_L2=ON"]

    # Optional per-fixture caps on the training schedule, forwarded to the
    # generator in a fixed order (n_steps first, then n_accum).
    generator_flags = [
        f"--{key.replace('_', '-')}={model_cfg[key]}" for key in ("n_steps", "n_accum") if key in model_cfg
    ]

    # A fixture-level num_data_inputs takes precedence over the per-model
    # override table.
    if "num_data_inputs" in model_cfg:
        num_data_inputs = model_cfg["num_data_inputs"]
    else:
        num_data_inputs = model_overrides.get("num_data_inputs")

    config = create_test_config(
        test_name = test_name,
        platform = "Siracusa",
        simulator = "gvsoc",
        deeploy_test_dir = deeploy_test_dir,
        toolchain = toolchain,
        toolchain_dir = toolchain_dir,
        cmake_args = cmake_flags,
        tiling = True,
        cores = SIRACUSA_DEFAULT_CORES,
        l1 = model_cfg["l1"],
        l2 = model_cfg["l2"],
        default_mem_level = "L3",
        double_buffer = False,
        training = True,
        training_num_data_inputs = num_data_inputs,
        training_tolerance = model_overrides.get("tolerance"),
        gen_args = generator_flags,
    )

    # Stage 1: code generation.
    generate_network(config, skip = skipgen)

    # Stage 2: patch the generated sources in place before they are built.
    substitutions = (("pmsis_l1_malloc", "pi_l2_malloc"), ("PI_L1 ", "PI_L2 "))
    for file_name in ("TrainingNetwork.c", "OptimizerNetwork.c"):
        generated = Path(config.gen_dir) / file_name
        if generated.exists():
            patched = generated.read_text()
            for old, new in substitutions:
                patched = patched.replace(old, new)
            generated.write_text(patched)

    # Stage 3: configure, build, simulate.
    configure_cmake(config)
    build_binary(config)
    result = run_simulation(config, skip = effective_skipsim)

    assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of "
                            f"{result.total_count}\nOutput:\n{result.stdout}")
    if result.error_count >= 0:
        assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")
+# +# Untiled-L3 baseline scope: +# The untiled path emits one pi_l2_malloc per buffer; the SUM of these calls +# must fit in the Siracusa FC-L2 heap (~1 MB usable after BSS/stack). Models +# below have a verified untiled L2 footprint within that ceiling: +# - SimpleMLP ~0.05 MB +# - CCT_LoRA ~0.4 MB +# - CCT ~0.7 MB +# ResNet8 (~9.3 MB) and MobileNetV1 (~17 MB) need an L2-heap override that +# lives in a separate task — they stay tiled-only for now. TRAINING_TESTS = [ "Models/Training/SimpleMLP/simplemlp_train", + "Models/Training/CCT/cct_train", + "Models/Training/CCT_LoRA/cct_lora_train", ] diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index bafa6635..30511320 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -165,11 +165,87 @@ # Training-enabled tiled models that need L3 spill (weights/activations don't # fit in L2). Same shape: test path -> list of L1 sizes (bytes). +# TEMPORARY: only big-CCT enabled — the other 3 already have tiled cycle +# data from earlier CI runs. Restore the entries below before merging. L3_SINGLEBUFFER_TRAINING_MODELS = { - "Models/Training/ResNet8/resnet8_train": [128000], - "Models/Training/MobileNetV1/mobilenetv1_train": [128000], + # "Models/Training/ResNet8/resnet8_train": [128000], + # "Models/Training/MobileNetV1/mobilenetv1_train": [128000], "Models/Training/CCT/cct_train": [128000], - "Models/Training/CCT_LoRA/cct_lora_train": [128000], + # "Models/Training/CCT_LoRA/cct_lora_train": [128000], +} + +# Untiled-L3 baseline. Same fixtures as L3_SINGLEBUFFER_TRAINING_MODELS but +# the L1 budget is inflated so the SBTiler picks single-tile-per-tensor +# schedules (numTiles == 1 on every dim) — semantically untiled per op, but +# still uses the tile-codegen DMA wrappers because cluster cores cannot deref +# HyperRAM directly. 
# Untiled-L3 baseline fixtures, consumed by
# test_siracusa_tiled_training_l3_untiled in test_platforms.py.
#
# Each fixture goes through the same SBTiler infrastructure as the L3
# singlebuffer tests, with --l1 inflated to the smallest value that yields
# the minimal-tile shape (one kernel call per op + integral L3<->L2 DMA, no
# spatial split). The resulting L1 working buffer can be larger than the
# physical Siracusa L1 (256 KB).
#
# After codegen, the test post-processes TrainingNetwork.c /
# OptimizerNetwork.c to swap pmsis_l1_malloc -> pi_l2_malloc and
# PI_L1 -> PI_L2, so every L1-annotated buffer physically lives in FC L2.
# Cluster cores access these via the fabric (~7x slower than real L1) —
# that's the deliberate semantics of "untiled L2-resident".
#
# NOTE(review): an earlier revision of this comment described a
# deeploy_fake_l1 shim (-Wl,--wrap of pi_cl_l1_malloc, sized by
# DEEPLOY_FAKE_L1_SIZE) and a per-fixture `fake_l1_size` key. No fixture
# below defines that key and the untiled test does not read it, so that
# description is stale and has been replaced by the key list below.
#
# Maps test_name -> dict with:
#   l1:              planner-side L1 size (forces single-tile schedules)
#   l2:              planner-side L2 size
#   n_steps:         optional; forwarded to the generator as --n-steps
#   n_accum:         optional; forwarded to the generator as --n-accum
#   num_data_inputs: optional; takes precedence over the value in
#                    TRAINING_MODEL_OVERRIDES for the untiled test
#   skip_sim_in_ci:  True for fixtures where gvsoc has historically OOMed
#                    during the long single-tile loop. When set and the CI
#                    environment variable is "true", the simulation is
#                    skipped; CI still verifies codegen + compile + link,
#                    and sim is deferred to a manual local run or a beefier
#                    runner.
L3_UNTILED_TRAINING_MODELS = {
    "Models/Training/CCT/cct_train": {
        # Big-CCT (img_size=32, embedding_dim=128, n_conv_layers=2) — peak
        # L1 working = 524 KB > physical L1 (256 KB). --l1=200K..400K
        # trip a codegen assert ("Keys should be the same while generating
        # DMA transfer for tensor 'data_in'/'data_out'"); 800K is the
        # smallest value that gets through to a clean schedule.
        "l1": 800_000,
        "l2": 2_000_000,
        # No n_steps / n_accum cap here: the default training schedule
        # (n_steps=4 / n_accum=1 from inputs.npz) is used so per-step
        # cycles are computed the same way as the tiled L3 baseline
        # (BENCH total / 4).
        #
        # Fixture-level num_data_inputs; the untiled test prefers it over
        # the TRAINING_MODEL_OVERRIDES entry.
        "num_data_inputs": 1,
        "skip_sim_in_ci": False,
    },
    # Other 3 fixtures (CCT_LoRA, ResNet8, MobileNetV1) temporarily
    # disabled so this CI run isolates the big-CCT untiled measurement.
    # Restore the entries below before merging.
    #
    # "Models/Training/CCT_LoRA/cct_lora_train": {
    #     "l1": 64_000,
    #     "l2": 2_000_000,
    #     "skip_sim_in_ci": False,
    # },
    # "Models/Training/ResNet8/resnet8_train": {
    #     "l1": 800_000,
    #     "l2": 2_000_000,
    #     "skip_sim_in_ci": False,
    # },
    # "Models/Training/MobileNetV1/mobilenetv1_train": {
    #     "l1": 800_000,
    #     "l2": 2_000_000,
    #     "n_steps": 1,
    #     "n_accum": 1,
    #     "num_data_inputs": 1,
    #     "skip_sim_in_ci": False,
    # },
}

# Per-model overrides for training tests.
+option(DEEPLOY_L1_AS_L2 "Replace mchan_transfer_1d with memcpy (untiled-L3 baseline)" OFF) +if(DEEPLOY_L1_AS_L2) + target_compile_definitions(deeploypulp PUBLIC DEEPLOY_L1_AS_L2) +endif() + set(PULP_NNX_PLATFORMS "Siracusa;Siracusa_w_neureka") if (platform IN_LIST PULP_NNX_PLATFORMS) if (platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h index 32ef836f..e8a0ea1f 100644 --- a/TargetLibraries/PULPOpen/inc/mchan_v7.h +++ b/TargetLibraries/PULPOpen/inc/mchan_v7.h @@ -47,6 +47,42 @@ #define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) #define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 6)) +/* Untiled-L3 baseline override: when DEEPLOY_L1_AS_L2 is defined the + * deeploy-generated code has been sed-rewritten so its "L1" pointers + * actually live in FC L2. The mchan DMA hardware ignores destination + * pointer addresses and unconditionally routes the `loc` parameter into + * cluster L1 banks via the lower bits — so a real DMA call would write + * garbage to L1 and leave the L2 destination empty (which is exactly + * the bug we observed: out-of-bound L1-bank requests + computed=0.0). + * + * Replace mchan transfers with plain memcpy. The channel API becomes a + * no-op: alloc returns 0, wait/free do nothing, is_busy reports idle. + * Only the 1D variant is provided — none of the L3 training fixtures + * emit 2D transfers; if a future model does, add the equivalent loop + * here. 
*/ +#ifdef DEEPLOY_L1_AS_L2 + +#include <string.h> + +static inline void mchan_transfer_1d(uint32_t cmd, void *loc, void *ext) { + uint32_t size = cmd & ((1u << MCHAN_TRANSFER_LEN_SIZE) - 1); + if (cmd & MCHAN_CMD_FLAG_DIRECTION_EXT2LOC) { + memcpy(loc, ext, size); + } else { + memcpy(ext, loc, size); + } +} + +static inline uint32_t mchan_channel_alloc() { return 0; } +static inline void mchan_channel_free(uint32_t channel_id) { (void)channel_id; } +static inline uint32_t mchan_channel_is_busy(uint32_t channel_id) { + (void)channel_id; + return 0; +} +static inline void mchan_channel_wait(uint32_t channel_id) { (void)channel_id; } + +#else + static volatile uint32_t *const cmd_ptr = (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x0); static volatile uint32_t *const status_ptr = @@ -117,4 +153,6 @@ static void mchan_channel_wait(uint32_t channel_id) { #endif } +#endif /* DEEPLOY_L1_AS_L2 */ + #endif // __MCHAN_V7_H__ diff --git a/scripts/ci_footprint_summary.py b/scripts/ci_footprint_summary.py new file mode 100644 index 00000000..b8330082 --- /dev/null +++ b/scripts/ci_footprint_summary.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# SPDX-License-Identifier: Apache-2.0 +""" +Emit a per-test footprint + cycle summary to GITHUB_STEP_SUMMARY. + +Two passes: + +1. **Build footprint** — walks `DeeployTest/TEST_SIRACUSA/` for generated + `TrainingNetwork.c` files and reports per fixture: MEMORYARENA_L1/L2/L3 + sizes (peak working sets + L3 storage) and distinct numTiles shapes. +2. **Cycle counts** — parses `DeeployTest/out.txt` (where the test runner + appends every sim's stdout) for `BENCH train_cycles=… opt_cycles=… + weight_sram=…` lines, correlating each line to its preceding `Testing + <test_dir>` banner. Skipped fixtures contribute no cycle row. + +Used in the siracusa-tiled CI workflow. Safe to run with no matching +files (just emits an empty section).
import os
import re
import sys
from pathlib import Path

# `MEMORYARENA_Lx = ... * <bytes>`: captures the memory level and the last
# `* <integer>` factor on the line.
ARENA_RE = re.compile(r"MEMORYARENA_(L1|L2|L3)\s*=.*\*\s*(\d+)")
# One `numTiles[N] = {...}` initializer; the whole match text is the "shape".
TILES_RE = re.compile(r"numTiles\[\d+\]\s*=\s*\{[^}]+\}")
# Banner that opens a test section in out.txt; captures the test name.
TESTING_RE = re.compile(r"Testing\s+(\S+)\s+on\s+\S+\s+Platform")
# Benchmark line printed by the simulation.
BENCH_RE = re.compile(r"BENCH\s+train_cycles=(\d+)\s+opt_cycles=(\d+)\s+weight_sram=(\d+)")


def parse_one(c_path: Path) -> dict:
    """Extract arena sizes and the number of distinct tile shapes from one file.

    Args:
        c_path: Path to a generated ``TrainingNetwork.c``.

    Returns:
        ``{"arenas": {"L1": int, "L2": int, "L3": int}, "tile_shapes": int}``.
        Each arena value is the largest size matched for that level (0 when
        the level never appears); ``tile_shapes`` counts distinct
        ``numTiles[...] = {...}`` initializer texts.
    """
    arenas = {"L1": 0, "L2": 0, "L3": 0}
    tile_shapes = set()
    # Decode as UTF-8 regardless of locale; undecodable bytes are replaced,
    # which cannot affect the ASCII-only patterns above.
    for line in c_path.read_text(encoding="utf-8", errors="replace").splitlines():
        m = ARENA_RE.search(line)
        if m:
            arenas[m.group(1)] = max(arenas[m.group(1)], int(m.group(2)))
        tile_shapes.update(TILES_RE.findall(line))
    return {"arenas": arenas, "tile_shapes": len(tile_shapes)}


def fmt_kb(n: int) -> str:
    """Format a byte count as KB with one decimal; 0 renders as an em dash."""
    if n == 0:
        return "—"
    return f"{n / 1024:.1f} KB"


def fmt_cycles(n: int) -> str:
    """Format a cycle count compactly (``1.50M``, ``2.5K``, ``999``).

    A count of 0 renders as an em dash.
    """
    if n == 0:
        return "—"
    if n >= 1_000_000:
        return f"{n / 1e6:.2f}M"
    if n >= 1_000:
        return f"{n / 1e3:.1f}K"
    return str(n)


def parse_cycles(out_txt: Path) -> dict:
    """Collect BENCH cycle counts per test from the simulator log.

    Each ``Testing <test_dir> on <platform> Platform`` banner opens a
    section. Every ``BENCH ...`` line inside a section is recorded under that
    section's test name, so when a name sees several BENCH lines (within one
    section or across repeated sections) the last one wins. BENCH lines
    before the first banner are ignored, and sections without a BENCH line
    get no entry.

    Args:
        out_txt: Path to the log file; a missing file yields ``{}``.

    Returns:
        ``{test_dir: {"train_cycles": int, "opt_cycles": int,
        "weight_sram": int}}``.
    """
    if not out_txt.is_file():
        return {}
    out: dict = {}
    current = None
    for line in out_txt.read_text(encoding="utf-8", errors="replace").splitlines():
        m = TESTING_RE.search(line)
        if m:
            current = m.group(1)
            continue
        m = BENCH_RE.search(line)
        if m and current is not None:
            out[current] = {
                "train_cycles": int(m.group(1)),
                "opt_cycles": int(m.group(2)),
                "weight_sram": int(m.group(3)),
            }
    return out


def _match_cycle_key(fixture: str, cycles: dict):
    """Return the first banner name in *cycles* that refers to *fixture*.

    A banner matches when it ends with the fixture's relative path, or when
    the fixture path ends with the banner's basename. Banners with an empty
    basename are never matched by basename, because ``str.endswith("")`` is
    true for every string. Returns ``None`` when nothing matches.
    """
    for key in cycles:
        if key.endswith(fixture):
            return key
        base = Path(key).name
        if base and fixture.endswith(base):
            return key
    return None


def _footprint_table(rows: list, label: str) -> list:
    """Render the build-footprint section as markdown lines."""
    lines = [
        f"### Build footprint — `{label}`",
        "",
        "| Fixture | L1 working | L2 working | L3 storage | Distinct tile shapes |",
        "|---|--:|--:|--:|--:|",
    ]
    for path, info in rows:
        a = info["arenas"]
        lines.append(
            f"| `{path}` | {fmt_kb(a['L1'])} | {fmt_kb(a['L2'])} | {fmt_kb(a['L3'])} | {info['tile_shapes']} |")
    if not rows:
        lines.append("| _(no TrainingNetwork.c found)_ | | | | |")
    lines.append("")
    return lines


def _cycle_table(rows: list, cycles: dict, label: str) -> list:
    """Render the cycle-count section as markdown lines.

    Only fixtures that have both a generated ``TrainingNetwork.c`` and a
    matching BENCH entry get a row; otherwise a placeholder row is emitted.
    """
    lines = [
        f"### Cycle counts (gvsoc) — `{label}`",
        "",
        "| Fixture | train_cycles | opt_cycles | weight_sram |",
        "|---|--:|--:|--:|",
    ]
    matched = []
    for path, _info in rows:
        key = _match_cycle_key(path, cycles)
        if key is not None:
            matched.append((path, cycles[key]))
    if matched:
        for path, c in matched:
            lines.append(
                f"| `{path}` | {fmt_cycles(c['train_cycles'])} | "
                f"{fmt_cycles(c['opt_cycles'])} | {fmt_kb(c['weight_sram'])} |")
    else:
        lines.append(
            "| _(no BENCH lines in out.txt — sim was --skipsim'd or crashed)_ | | | |")
    lines.append("")
    return lines


def main() -> int:
    """Write the footprint and cycle summary; always return exit status 0.

    The test root is ``sys.argv[1]`` (default ``DeeployTest/TEST_SIRACUSA``)
    and ``out.txt`` is read from the test root's parent directory. The
    section label comes from ``FOOTPRINT_SUMMARY_LABEL`` (default
    ``training``). Output is appended to the file named by
    ``GITHUB_STEP_SUMMARY`` when that variable is set, and printed to stdout
    otherwise. A missing test root is reported on stderr and skipped.
    """
    test_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("DeeployTest/TEST_SIRACUSA")
    if not test_root.is_dir():
        print(f"[footprint-summary] {test_root} not a directory; skipping", file=sys.stderr)
        return 0

    rows = [(str(c_path.relative_to(test_root).parent), parse_one(c_path))
            for c_path in sorted(test_root.rglob("TrainingNetwork.c"))]
    cycles = parse_cycles(test_root.parent / "out.txt")

    # Pick the pytest marker label (passed by the workflow) for the section title.
    label = os.environ.get("FOOTPRINT_SUMMARY_LABEL", "training")

    out_lines = _footprint_table(rows, label) + _cycle_table(rows, cycles, label)

    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary_path:
        # The tables contain non-ASCII characters (em dashes); write UTF-8
        # explicitly so the result does not depend on the runner's locale.
        with open(summary_path, "a", encoding="utf-8") as f:
            f.write("\n".join(out_lines) + "\n")
        print(f"[footprint-summary] wrote {len(rows)} rows to {summary_path}", file=sys.stderr)
    else:
        # Local invocation: print to stdout for visibility.
        print("\n".join(out_lines))
    return 0


if __name__ == "__main__":
    sys.exit(main())