Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2d1f430
[Frontend] MLIR Python bindings: in-process passes and lowering
YWHyuk Jun 17, 2026
3da3a7c
[Frontend] decompose togsim.transfer to <=4D dma_start
YWHyuk Jun 17, 2026
1d0c888
[Frontend] axis-split: remove aligned floor/mod at the scheduling layer
YWHyuk Jun 17, 2026
5ee55e4
[Frontend] graph-copy: relayout operands on incompatible / cross-axis…
YWHyuk Jun 17, 2026
e39bfc0
[Frontend] build_tog: port test-tile-operation-graph to Python
YWHyuk Jun 17, 2026
3c56c6b
[Test] floor/mod axis-split + graph-copy coverage; deterministic deep…
YWHyuk Jun 17, 2026
a6b7ebb
[Frontend] decompose-transfer: affine.for peel with lane-banked physi…
YWHyuk Jun 17, 2026
04e4256
[Frontend] Unify all DMA codegen on togsim.transfer
YWHyuk Jun 17, 2026
80633c7
[Docs] axis-split: rank guard removed, >4D peel via affine.for
YWHyuk Jun 17, 2026
ab0beaa
[Frontend] dma-fine-grained: port the C++ pass to Python (MLIR bindings)
YWHyuk Jun 17, 2026
41a365c
[Frontend] pytorchsim-to-vcix: port the C++ pass to Python (MLIR bind…
YWHyuk Jun 17, 2026
e5cc6a5
[Docs] lower_to_llvm: only test-loop-padding remains in mlir-opt
YWHyuk Jun 17, 2026
190dd04
[Frontend] Retire dead floor/mod recompile branches in codegen
YWHyuk Jun 17, 2026
82d47f9
[Frontend] Retire implicit_dim_ops tile-forcing (redundant under axis…
YWHyuk Jun 17, 2026
093a591
[Docs] padding model + test-loop-padding plan; TPU layout/padding report
YWHyuk Jun 18, 2026
af79689
[Frontend] Fix floor/mod guard + decompose unit-dim/vlane-axis edge c…
YWHyuk Jun 18, 2026
a6ef5f8
[Frontend] graph-copy per-dim ranges + vcix C++-parity guards (review)
YWHyuk Jun 18, 2026
417e4f2
[Frontend] vcix: lower fused matmul whose operand is vector_store'd, …
YWHyuk Jun 18, 2026
716654b
[Frontend] floor/mod: make axis-split + graph-copy always-on, drop de…
YWHyuk Jun 18, 2026
0161c64
[Frontend] Unify in-process MLIR passes under one phase-list driver
YWHyuk Jun 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/pytorchsim_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,25 @@ jobs:
-e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_cat.py

test_floormod_axis_split:
name: Run test_floormod_axis_split.py
runs-on: ubuntu-latest
steps:
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Run test_floormod_axis_split.py
run: |
echo "Running test_floormod_axis_split.py"
docker run --rm \
-e vpu_num_lanes="${{ inputs.vector_lane }}" \
-e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_floormod_axis_split.py

test_matmul:
name: Run test_matmul.py
runs-on: ubuntu-latest
Expand Down
5 changes: 5 additions & 0 deletions Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/P

# Store RISC-V LLVM for TorchSim
ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin
# MLIR Python bindings shipped inside the LLVM release artifact (built by the
# llvm-project CI with -DMLIR_ENABLE_BINDINGS_PYTHON=ON). Lets PyTorchSim load
# mlir.ir / dialects for Python-side MLIR passes. The artifact must be built
# against this image's Python (3.11) or `import mlir` fails on ABI mismatch.
ENV PYTHONPATH=/riscv-llvm/python_packages/mlir_core:$PYTHONPATH
ENV TORCHSIM_DIR=/workspace/PyTorchSim

# Download Spike simulator
Expand Down
86 changes: 43 additions & 43 deletions PyTorchSimFrontend/extension_codecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,29 +38,16 @@ def dump_metadata(args, arg_attributes, path):
return

def mlir_compile_command(filename, vectorlane_size, vlen=256):
# The C++ -dma-fine-grained and -test-pytorchsim-to-vcix passes are ported to
# Python (passes/dma_fine_grained.py, lower_to_vcix.py), run in-process between
# loop-padding and the standard lowering. So mlir-opt now runs only loop-padding
# (-> _padded.mlir); the Python fine-grained + vcix passes produce _custom.mlir.
return [re.sub(r"[ \n]+", " ",
f"""
{extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
-test-loop-padding \
-dma-fine-grained='systolic-array-size={vectorlane_size}' \
-global-idx='vlen={vlen}' \
-test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
-test-memref-to-gemmini="vectorlane={vectorlane_size}" \
-convert-linalg-to-loops \
-convert-vector-to-scf='full-unroll' \
-lower-affine \
-finalize-memref-to-llvm \
-lower-vector-multi-reduction \
-convert-vector-to-llvm \
-convert-arith-to-llvm \
-convert-math-to-llvm \
-convert-scf-to-cf \
-convert-cf-to-llvm \
-convert-func-to-llvm \
-convert-index-to-llvm \
-reconcile-unrealized-casts \
{'--mlir-print-ir-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
{filename}.mlir -o {filename}_llvm.mlir
{filename}.mlir -o {filename}_padded.mlir
""",
).strip(),
re.sub(r"[ \n]+", " ",
Expand Down Expand Up @@ -88,30 +75,14 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
).strip()]

def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, vlen=256):
# See mlir_compile_command: -dma-fine-grained and -test-pytorchsim-to-vcix are
# Python passes run in-process; mlir-opt runs only loop-padding here.
return [re.sub(r"[ \n]+", " ",
f"""
{extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
-test-loop-padding='timing_mode=1' \
-dma-fine-grained='systolic-array-size={vectorlane_size}' \
-global-idx='vlen={vlen}' \
-test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
-test-tile-operation-graph='vectorlane={vectorlane_size} sample-mode={extension_config.CONFIG_TLS_MODE}' \
-test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
-convert-linalg-to-loops \
-convert-vector-to-scf='full-unroll' \
-lower-affine \
-finalize-memref-to-llvm \
-lower-vector-multi-reduction \
-convert-vector-to-llvm \
-convert-arith-to-llvm \
-convert-math-to-llvm \
-convert-scf-to-cf \
-convert-cf-to-llvm \
-convert-func-to-llvm \
-convert-index-to-llvm \
-reconcile-unrealized-casts \
{'--mlir-print-ir-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
{filename}.mlir -o {sample_filename}_llvm.mlir
{filename}.mlir -o {sample_filename}_padded.mlir
""",
).strip(),
re.sub(r"[ \n]+", " ",
Expand Down Expand Up @@ -158,6 +129,14 @@ def load(cls, source_code,
vlenb = vlen // 8
write_path = get_write_path(source_code)
key, input_path = write(source_code, "mlir", specified_dir=write_path)
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
from PyTorchSimFrontend.mlir.passes import (
run_python_passes, run_module_passes, POST_OPT_PASSES,
run_standard_lowering, run_tog,
)
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
tog_path = os.path.join(write_path, "tile_graph.onnx")
Expand All @@ -178,13 +157,21 @@ def load(cls, source_code,
# Use custom malloc to avoid size error
new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
opt_cmd = shlex.split(cmds[0])
opt_pad_cmd = shlex.split(cmds[0])
translate_cmd = shlex.split(cmds[1])
llc_cmd = shlex.split(cmds[2])
llc_asm_cmd = shlex.split(cmds[3])
with lock:
try:
subprocess.check_call(opt_cmd)
# loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
subprocess.check_call(opt_pad_cmd)
run_module_passes(new_input_path + "_padded.mlir",
new_input_path + "_custom.mlir",
POST_OPT_PASSES, vectorlane=vectorlane_size, vlen=vlen)
# Standard MLIR -> LLVM-dialect lowering (registered upstream
# passes) runs in-process via the bindings PassManager, picking
# up after the custom mlir-opt passes (memref-to-gemmini).
run_standard_lowering(new_input_path + "_custom.mlir", new_input_path + "_llvm.mlir")
subprocess.check_call(translate_cmd)
subprocess.check_call(llc_cmd)
subprocess.check_call(llc_asm_cmd)
Expand Down Expand Up @@ -213,16 +200,29 @@ def load(cls, source_code,
return key

# Launch tile graph generator
gem5_sample_cmd = shlex.split(gem5_cmds[0])
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])

lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
with lock:
try:
result = subprocess.check_output(gem5_sample_cmd)
with open(raw_tog_path, "wb") as file:
file.write(result)
# mlir-opt now runs only loop-padding (timing mode) and writes _padded.mlir;
# the Python dma-fine-grained + pytorchsim-to-vcix passes then write the
# post-vcix IR (_postvcix.mlir). The tile-operation-graph pass is ported
# to Python: run_tog reads that IR, writes the TOG (_tog.py) and the
# mutated IR (_custom.mlir: sample-mode step rewrite + compute markers),
# replacing the C++ -test-tile-operation-graph pass.
# loop-padding(timing, mlir-opt) -> Python fine-grained + vcix (one parse/print)
subprocess.check_call(gem5_pad_cmd)
run_module_passes(sample_mlir_path + "_padded.mlir",
sample_mlir_path + "_postvcix.mlir",
POST_OPT_PASSES, vectorlane=vectorlane_size, vlen=vlen)
run_tog(sample_mlir_path + "_postvcix.mlir", raw_tog_path,
sample_mlir_path + "_custom.mlir",
sample_mode=extension_config.CONFIG_TLS_MODE,
vectorlane=vectorlane_size)
# Standard MLIR -> LLVM-dialect lowering in-process (see functional path).
run_standard_lowering(sample_mlir_path + "_custom.mlir", sample_mlir_path + "_llvm.mlir", timing=True)
subprocess.check_call(gem5_translate_cmd)
subprocess.check_call(gem5_llc_cmd)
except subprocess.CalledProcessError as e:
Expand Down
Loading
Loading