Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions AsmParser/tog_generator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
# current pipeline does not break; to be retired once the trace pipeline (P3+)
# stabilizes. See docs/design/togsim_cpp_trace.md.
import os
import sys
import importlib.util
Expand Down
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi
Located under `configs/*.yml`:

- `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core`
- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.)
- `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits`
- `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path`
- `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path`
Expand Down
129 changes: 90 additions & 39 deletions PyTorchSimFrontend/extension_codecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch

from PyTorchSimFrontend import extension_config
from torch._inductor.codecache import get_hash, write
from torch._inductor.codecache import get_hash, write, write_atomic
from torch._inductor.async_compile import AsyncCompile
from AsmParser.tog_generator import tog_generator
from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
Expand All @@ -23,6 +23,13 @@ def get_write_path(src_code):
return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip())))


_HEADER_BY_HASH = {}
def store_header(src_code, spike_header, gem5_header):
_HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header)
def get_header(src_code):
return _HEADER_BY_HASH.get(get_hash(src_code.strip()))


def get_lock_path(write_path):
    """Build the path of the compile lock file located inside write_path.

    The lock file is named ".compile.lock"; since it lives under the given
    write_path, it acts as a per-source_code lock.
    """
    lock_file_name = ".compile.lock"
    return os.path.join(write_path, lock_file_name)
Expand Down Expand Up @@ -128,40 +135,52 @@ def load(cls, source_code,
vlen = kwargs['vlen']
vlenb = vlen // 8
write_path = get_write_path(source_code)
key, input_path = write(source_code, "mlir", specified_dir=write_path)
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
os.makedirs(write_path, exist_ok=True)
global_var_header = kwargs.get("global_var_header")
if global_var_header is not None:
write_atomic(os.path.join(write_path, "global_var.h"), global_var_header)
gem5_global_var_header = kwargs.get("gem5_global_var_header")
if gem5_global_var_header is not None:
write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header)
# The compile rewrites the kernel .mlir in place (run_python_passes) and reads
# it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
# candidate and the final kernel -- share a write_path, so hold the per-path
# lock across the whole build to keep them from interleaving, and skip the
# rebuild when a prior build already finished (its tile_graph.onnx exists).
from filelock import FileLock
from PyTorchSimFrontend.mlir.passes import (
run_python_passes, run_module_passes, POST_OPT_PASSES,
run_standard_lowering, run_tog,
)
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
tog_path = os.path.join(write_path, "tile_graph.onnx")
sample_mlir_path = new_input_path + "_sample"
validation_binary_path = os.path.join(write_path, validation_binary_name)
gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)

from filelock import FileLock
os.makedirs(write_path, exist_ok=True)
lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)

if spad_info is not None:
link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
else:
link_option = ""
# Generate LLVM kernel caller and binary for validation
if extension_config.pytorchsim_functional_mode:
# Use custom malloc to avoid size error
new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
opt_pad_cmd = shlex.split(cmds[0])
translate_cmd = shlex.split(cmds[1])
llc_cmd = shlex.split(cmds[2])
llc_asm_cmd = shlex.split(cmds[3])
with lock:
with lock:
key, input_path = write(source_code, "mlir", specified_dir=write_path)
if os.path.isfile(tog_path):
return key
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
sample_mlir_path = new_input_path + "_sample"
validation_binary_path = os.path.join(write_path, validation_binary_name)
gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)

if spad_info is not None:
link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
else:
link_option = ""
# Generate LLVM kernel caller and binary for validation
if extension_config.pytorchsim_functional_mode:
# Use custom malloc to avoid size error
new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
opt_pad_cmd = shlex.split(cmds[0])
translate_cmd = shlex.split(cmds[1])
llc_cmd = shlex.split(cmds[2])
llc_asm_cmd = shlex.split(cmds[3])
try:
# loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
subprocess.check_call(opt_pad_cmd)
Expand Down Expand Up @@ -195,17 +214,11 @@ def load(cls, source_code,
)
raise SpadOverflowError()

# Skip if TOG file already exists
if os.path.isfile(tog_path):
return key

# Launch tile graph generator
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])
# Launch tile graph generator
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])

lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
with lock:
try:
# mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix
# and writes the post-vcix IR. The tile-operation-graph pass is ported
Expand Down Expand Up @@ -241,8 +254,19 @@ def load(cls, source_code,
# Run cyclesim
cyclesim = CycleSimulator()
cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
# Snapshot for the P3-trace hook below: generate_tile_graph consumes
# cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
cycle_list_for_trace = list(cycle_list)

# Create TOG
# DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
# tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
# is being superseded by the C++ trace pipeline (build_skeleton +
# lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
# per-tile cycle_list / x_offset / w_offset computed here are exactly
# what cycle_table.build_cycle_table will reuse, so both paths stay
# cycle-consistent during the transition. Kept live (pipeline must not
# break); to be retired once the trace pipeline (P3+) stabilizes.
w_offset, x_offset = vectorlane_size, vectorlane_size
if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
x_offset = kwargs['loop_size'][-3]
Expand All @@ -258,6 +282,33 @@ def load(cls, source_code,
w_offset=w_offset, # FIXME.
vector_lane=vectorlane_size
)

# Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
# cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
# is the default simulation path (the C++ TOG); the legacy ONNX TOG is
# DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the
# .so is unused so skip emitting it. Best-effort: never breaks the compile.
if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
try:
import mlir.ir as ir
from PyTorchSimFrontend.mlir.passes import (
build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
pv = sample_mlir_path + "_postvcix.mlir"
_ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
with _ctx:
_mod = ir.Module.parse(open(pv).read(), _ctx)
_bs.build_skeleton(_mod)
_ntiles = len(_ct._compute_types(_mod))
# align lengths: gem5 gives one numCycles per compute node;
# pad with the last value / truncate if it disagrees.
_cl = list(cycle_list_for_trace)
if _cl and len(_cl) != _ntiles:
_cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
_tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
_ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
_l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
except Exception as e:
logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
return key

class CustomAsyncCompile(AsyncCompile):
Expand Down
7 changes: 5 additions & 2 deletions PyTorchSimFrontend/mlir/mlir_autotune.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __str__(self) -> str:
def make_run_fn(
self, input_tensors: torch.Tensor, output_tensors: torch.Tensor
) -> Callable[[], None]:
from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile
from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile, get_header
custom_async_compile = CustomAsyncCompile()

# Check already cached result.
Expand All @@ -80,12 +80,15 @@ def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs):
return cached_run_fn

# Run a candidate code
_headers = get_header(self.source_code)
_header_kwargs = {} if _headers is None else {
"global_var_header": _headers[0], "gem5_global_var_header": _headers[1]}
run_method = custom_async_compile.mlir(
self.source_code, vectorlane_size=self.extra_args["vector_lane"],
loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"],
vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
origins=self.extra_args["origins"], silent_mode=True,
autotune=self.extra_args['autotune'])
autotune=self.extra_args['autotune'], **_header_kwargs)

args = [
tensor
Expand Down
24 changes: 9 additions & 15 deletions PyTorchSimFrontend/mlir/mlir_codegen_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from torch._inductor.codegen import cpp, wrapper, common, memory_planning
from torch._inductor.ir import GraphPartitionSignature
from torch._inductor.virtualized import V, _ops as ops
from torch._inductor.codecache import write_atomic
from torch._inductor.utils import (
IndentedBuffer,
is_welford_reduction,
Expand Down Expand Up @@ -1120,28 +1119,23 @@ def codegen_nodes(self, nodes, kernel_name):
src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
self._prepare_simulator_headers(src_code)
if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode:
optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2]
# Use temporaries: autotune returns [None, None, None] when it cannot
# autotune (e.g. a size-1 pointwise kernel with ranges == [1]), and
# unpacking into meta_code would clobber the valid arg_attributes that
# the fall-through below returns.
optimal_src_code, optimal_meta_code = self.autotune(nodes, kernel_name)[:2]
if optimal_src_code is not None:
return optimal_src_code, meta_code
return optimal_src_code, optimal_meta_code
return src_code, meta_code

def _prepare_simulator_headers(self, src_code):
from filelock import FileLock

write_path = extension_codecache.get_write_path(src_code)
os.makedirs(write_path, exist_ok=True)

spike_write_path = os.path.join(write_path, "global_var.h")
gem5_write_path = os.path.join(write_path, "gem5_global_var.h")

spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n"
spad_section_end_symbol = (
f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
)
lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
with lock:
write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
write_atomic(gem5_write_path, self.gem5_header.getvalue())
spike_content = self.header.getvalue() + spad_end_symbol + spad_section_end_symbol
gem5_content = self.gem5_header.getvalue()
extension_codecache.store_header(src_code, spike_content, gem5_content)

def get_arg_info(self, name):
arg_info = dict()
Expand Down
5 changes: 5 additions & 0 deletions PyTorchSimFrontend/mlir/mlir_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import operator
from sympy import symbols, sympify
from PyTorchSimFrontend import extension_config
from PyTorchSimFrontend import extension_codecache
from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel

from torch.utils._ordered_set import OrderedSet
Expand Down Expand Up @@ -333,6 +334,10 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info
codecache_def.writeline(f"spad_info={spad_info},")
codecache_def.writeline(f"origins={origins},")
codecache_def.writeline(f"arg_attributes={meta_code},")
headers = extension_codecache.get_header(src_code)
if headers is not None:
codecache_def.writeline(f"global_var_header='''{headers[0]}''',")
codecache_def.writeline(f"gem5_global_var_header='''{headers[1]}''',")
codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})")
wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False)
return kernel_name
Expand Down
18 changes: 3 additions & 15 deletions PyTorchSimFrontend/mlir/mlir_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from torch._inductor.autotune_process import TensorMeta
from torch._inductor.virtualized import V, NullHandler, _ops as ops
from torch._inductor.utils import IndentedBuffer
from torch._inductor.codecache import write_atomic

import PyTorchSimFrontend.extension_codecache as extension_codecache
from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
Expand Down Expand Up @@ -613,22 +612,11 @@ def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes,
return src_code, meta_code

def _prepare_simulator_headers(self, src_code):
from filelock import FileLock

spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"

write_path = extension_codecache.get_write_path(src_code)
os.makedirs(write_path, exist_ok=True)
spike_write_path = os.path.join(write_path, "global_var.h")
gem5_write_path = os.path.join(write_path, "gem5_global_var.h")

lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
with lock:
if not os.path.exists(spike_write_path):
write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol)
if not os.path.exists(gem5_write_path):
write_atomic(gem5_write_path, self.gem5_header.getvalue())
spike_content = self.header.getvalue()+spad_end_symbol+spad_section_end_symbol
gem5_content = self.gem5_header.getvalue()
extension_codecache.store_header(src_code, spike_content, gem5_content)

def codegen_prologue_body(self):
body = IndentedBuffer()
Expand Down
8 changes: 6 additions & 2 deletions PyTorchSimFrontend/mlir/passes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,12 @@ def run_module_passes(in_path, out_path, passes, **opts):
p.run(module, **opts)
out = str(module)

with open(out_path, "w") as f:
f.write(out)
# Atomic write: run_python_passes rewrites the kernel .mlir in place outside
# load()'s FileLock, so a concurrent compile of the same source must never see a
# truncated file -- mlir-opt would parse it to an empty module and silently drop
# the kernel (-> undefined reference to wrapper_kernel at link).
from torch._inductor.codecache import write_atomic
write_atomic(out_path, out)
return True


Expand Down
Loading
Loading