From 7b587004912b63584f652ea3bbf89ee5a3ea0abc Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 24 Jun 2026 21:29:28 +0900
Subject: [PATCH] [CI] Make wrapper2/wrapper3 exercise small systolic-array
 configs

The wrapper2 test job passed vpu_num_lanes and vpu_spad_size_kb_per_lane
as docker env vars, but since the unified-config refactor (9db0f2ce) the
frontend reads these values only from the TOGSim YAML, never from the
environment. The env vars were therefore silently ignored and wrapper2
ran with the default 128x128 config, making it an exact duplicate of
wrapper1.

Switch the reusable workflow to select a config YAML via TOGSIM_CONFIG so
that a single file drives both codegen and the TOGSim cycle model, in
line with the unified-config design. vpu_num_lanes also sets the systolic
array dimension, so each config exercises a different array size:

- configs: add systolic_ws_32x32_c1_simple_noc_tpuv3.yml (32x32 array,
  32 KB/lane SPAD) and systolic_ws_8x8_c1_simple_noc_tpuv3.yml (8x8
  array, 32 KB/lane SPAD); both identical to the tpuv3 default otherwise
- pytorchsim_test.yml: replace vector_lane/spad_size inputs with
  togsim_config (string) and run_accuracy (bool); every job now sets
  -e TOGSIM_CONFIG instead of the dead vpu_* env vars
- docker-image.yml: wrapper1 -> 128x128 config + run_accuracy true,
  wrapper2 -> 32x32 config, new wrapper3 job -> 8x8 config (run_accuracy
  false for both wrapper2 and wrapper3)
---
 .github/workflows/docker-image.yml            |  16 +-
 .github/workflows/pytorchsim_test.yml         | 161 +++++++-----------
 .../systolic_ws_32x32_c1_simple_noc_tpuv3.yml |  28 +++
 .../systolic_ws_8x8_c1_simple_noc_tpuv3.yml   |  28 +++
 4 files changed, 125 insertions(+), 108 deletions(-)
 create mode 100644 configs/systolic_ws_32x32_c1_simple_noc_tpuv3.yml
 create mode 100644 configs/systolic_ws_8x8_c1_simple_noc_tpuv3.yml

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 11f9dbb1..dca06709 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -141,13 +141,21 @@ jobs:
     uses: ./.github/workflows/pytorchsim_test.yml
     with:
       image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      vector_lane: 128
-      spad_size: 128
+      togsim_config: /workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+      run_accuracy: true
 
   test-pytorchsim-wrapper2:
     needs: build-and-test
     uses: ./.github/workflows/pytorchsim_test.yml
     with:
       image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      vector_lane: 32
-      spad_size: 32
+      togsim_config: /workspace/PyTorchSim/configs/systolic_ws_32x32_c1_simple_noc_tpuv3.yml
+      run_accuracy: false
+
+  test-pytorchsim-wrapper3:  # reusable test workflow run against the 8x8 config
+    needs: build-and-test
+    uses: ./.github/workflows/pytorchsim_test.yml
+    with:
+      image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      togsim_config: /workspace/PyTorchSim/configs/systolic_ws_8x8_c1_simple_noc_tpuv3.yml
+      run_accuracy: false  # accuracy + speedup job is only meaningful for the 128x128 config
diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 33e279fe..345c716e 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -6,14 +6,15 @@ on:
       image_name:
         required: true
         type: string
-      vector_lane:
-        description: "Vector lane size (use empty string for server TPU)"
+      togsim_config:
+        description: "TOGSim hardware config YAML (single source of truth; drives both codegen and the cycle sim)"
         required: true
-        type: number
-      spad_size:
-        description: "SPAD size (use empty string for server TPU)"
-        required: true
-        type: number
+        type: string
+      run_accuracy:
+        description: "Run the accuracy + speedup artifact job (only meaningful for the 128x128 config)"
+        required: false
+        default: false
+        type: boolean
 
 # Runner policy: the CPU-only CI image is small enough to pull on GitHub-hosted
 # runners, so op and model tests run on ubuntu-latest. The memory/time-intensive
@@ -35,8 +36,7 @@ jobs:
         run: |
           echo "Running test_add.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_add.py
 
   test_transcendental:
@@ -54,8 +54,7 @@ jobs:
         run: |
           echo "Running test_transcendental.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_transcendental.py
 
   test_activation:
@@ -73,8 +72,7 @@ jobs:
         run: |
           echo "Running test_activation.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_activation.py
 
   test_batchnorm:
@@ -92,8 +90,7 @@ jobs:
         run: |
           echo "Running test_batchnorm.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_batchnorm.py
 
   test_bmm:
@@ -111,8 +108,7 @@ jobs:
         run: |
           echo "Running test_bmm.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/gemm/test_bmm.py
 
   test_cnn:
@@ -130,8 +126,7 @@ jobs:
         run: |
           echo "Running test_cnn.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/conv/test_cnn.py
 
   test_conv2d:
@@ -149,8 +144,7 @@ jobs:
         run: |
           echo "Running test_conv2d.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/conv/test_conv2d.py
 
   test_cat:
@@ -168,8 +162,7 @@ jobs:
         run: |
           echo "Running test_cat.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_cat.py
 
   test_floormod_axis_split:
@@ -187,8 +180,7 @@ jobs:
         run: |
           echo "Running test_floormod_axis_split.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_floormod_axis_split.py
 
   test_widen_dtype:
@@ -206,8 +198,7 @@ jobs:
         run: |
           echo "Running test_widen_dtype.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/misc/test_widen_dtype.py
 
   test_matmul:
@@ -225,8 +216,7 @@ jobs:
         run: |
           echo "Running test_matmul.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/gemm/test_matmul.py
 
   test_reduce:
@@ -244,8 +234,7 @@ jobs:
         run: |
           echo "Running test_reduce.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_reduce.py
 
   test_softmax:
@@ -263,8 +252,7 @@ jobs:
         run: |
           echo "Running test_softmax.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_softmax.py
 
   test_transpose2D:
@@ -282,8 +270,7 @@ jobs:
         run: |
           echo "Running test_transpose2D.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_transpose2D.py
 
   test_view3D_2D:
@@ -301,8 +288,7 @@ jobs:
         run: |
           echo "Running test_view3D_2D.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_view3D_2D.py
 
   test_layernorm:
@@ -320,8 +306,7 @@ jobs:
         run: |
           echo "Running test_layernorm.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_layernorm.py
 
   test_mlp:
@@ -339,8 +324,7 @@ jobs:
         run: |
           echo "Running test_mlp.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_mlp.py
 
   test_resnet:
@@ -358,16 +342,14 @@ jobs:
         run: |
           echo "Running test_resnet.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_resnet.py
 
       - name: Run test_resnet50.py
         run: |
           echo "Running test_resnet.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_resnet.py --model_type resnet50
 
   test_mobilenet:
@@ -385,8 +367,7 @@ jobs:
         run: |
           echo "Running test_mobilenet.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/MobileNet/test_mobilenet.py
 
   test_transformer:
@@ -404,8 +385,7 @@ jobs:
         run: |
           echo "Running test_transformer.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_transformer.py
 
   test_transpose3D:
@@ -423,8 +403,7 @@ jobs:
         run: |
           echo "Running test_transpose3D.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_transpose3D.py
 
   test_sparsity:
@@ -442,8 +421,7 @@ jobs:
         run: |
           echo "Running test_sparsity.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/sparsity/test_sparsity.py
 
   test_pool:
@@ -461,8 +439,7 @@ jobs:
         run: |
           echo "Running test_pool.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/conv/test_pool.py
 
   test_perceptron:
@@ -480,8 +457,7 @@ jobs:
         run: |
           echo "Running test_single_perceptron.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_single_perceptron.py
 
   test_fusion:
@@ -499,80 +475,70 @@ jobs:
         run: |
           echo "Running test_addmm_residual.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_addmm_residual.py
 
       - name: Run test_matmul_activation.py
         run: |
           echo "Running test_matmul_activation.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_activation.py
 
       - name: Run test_matmul_scalar.py
         run: |
           echo "Running test_matmul_scalar.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_scalar.py
 
       - name: Run test_matmul_reduction.py
         run: |
           echo "Running test_matmul_reduction.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_reduction.py
 
       - name: Run test_bmm_reduction.py
         run: |
           echo "Running test_bmm_reduction.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_bmm_reduction.py
 
       - name: Run test_prologue_fusion.py
         run: |
           echo "Running test_prologue_fusion.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_prologue_fusion.py
 
       - name: Run test_transformer_fusion.py
         run: |
           echo "Running test_transformer_fusion.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_transformer_fusion.py
 
       - name: Run test_conv_fusion.py
         run: |
           echo "Running test_conv_fusion.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_conv_fusion.py
 
       - name: Run test_attention_fusion.py
         run: |
           echo "Running test_attention_fusion.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_attention_fusion.py
 
       - name: Run test_matmul_vector.py
         run: |
           echo "Running test_matmul_vector.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_vector.py
 
   test_moe:
@@ -590,8 +556,7 @@ jobs:
         run: |
           echo "Running test_moe.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/MoE/test_moe.py
 
   test_mistral:
@@ -609,8 +574,7 @@ jobs:
         run: |
           echo "Running test_mistral.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Mixtral8x7B/test_attention.py
 
   test_vit:
@@ -628,8 +592,7 @@ jobs:
         run: |
           echo "Running test_vit.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_vit.py
 
   test_diffusion:
@@ -649,8 +612,7 @@ jobs:
         run: |
           echo "Running test_diffusion.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Diffusion/test_diffusion.py
 
   test_indirect:
@@ -668,8 +630,7 @@ jobs:
         run: |
           echo "Running test_indirect.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/misc/test_indirect_access.py
 
   test_scheduler:
@@ -687,8 +648,7 @@ jobs:
         run: |
           echo "Running test_scheduler.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/system/test_scheduler.py
 
   test_llama:
@@ -706,8 +666,7 @@ jobs:
         run: |
           echo "Running test_llama.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Llama/test_llama.py
 
   test_yolov5:
@@ -725,8 +684,7 @@ jobs:
         run: |
           echo "Running test_yolov5.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Yolov5/test_yolov5.py
 
   test_deepseek:
@@ -746,8 +704,7 @@ jobs:
         run: |
           echo "Running test_deepseek_v3_base.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/models/DeepSeek/test_deepseek_v3_base.py
 
   test_eager:
@@ -765,8 +722,7 @@ jobs:
         run: |
           echo "Running test_eager.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/system/test_eager.py
 
   test_exponent:
@@ -784,8 +740,7 @@ jobs:
         run: |
           echo "Running test_exponent.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_exponent.py
 
   test_sort:
@@ -803,8 +758,7 @@ jobs:
         run: |
           echo "Running test_sort.py"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/sort/test_sort.py
 
   test_accuracy:
@@ -812,7 +766,7 @@ jobs:
     # Accuracy + speedup runs many model simulations end to end; it is the most
     # time- and memory-intensive job, so keep it on a self-hosted runner.
     runs-on: self-hosted
-    if: inputs.vector_lane == 128
+    if: inputs.run_accuracy
     steps:
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3
@@ -830,8 +784,7 @@ jobs:
           set -o pipefail
           mkdir -p "$ART_DIR"
           docker run --rm \
-            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
-            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            -e TOGSIM_CONFIG="${{ inputs.togsim_config }}" \
             -e SKIP_ILS=1 \
             -e SPEEDUP_ITERS=2 \
             -v "$ART_DIR:/artifacts" \
diff --git a/configs/systolic_ws_32x32_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_32x32_c1_simple_noc_tpuv3.yml
new file mode 100644
index 00000000..7bcbd763
--- /dev/null
+++ b/configs/systolic_ws_32x32_c1_simple_noc_tpuv3.yml
@@ -0,0 +1,28 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 32  # also sets the systolic array dimension (32x32 array)
+vpu_spad_size_kb_per_lane: 32  # SPAD size per lane, in KB
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_8x8_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_8x8_c1_simple_noc_tpuv3.yml
new file mode 100644
index 00000000..0353ef45
--- /dev/null
+++ b/configs/systolic_ws_8x8_c1_simple_noc_tpuv3.yml
@@ -0,0 +1,28 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 8  # also sets the systolic array dimension (8x8 array)
+vpu_spad_size_kb_per_lane: 32  # SPAD size per lane, in KB
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all