diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py index a12460e3..0de76246 100644 --- a/AsmParser/tog_generator.py +++ b/AsmParser/tog_generator.py @@ -1,3 +1,9 @@ +# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds +# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by +# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py + +# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the +# current pipeline does not break; to be retired once the trace pipeline (P3+) +# stabilizes. See docs/design/togsim_cpp_trace.md. import os import sys import importlib.util diff --git a/CLAUDE.md b/CLAUDE.md index 12d48082..5a3a47cd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -85,6 +85,7 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi Located under `configs/*.yml`: - `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core` +- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.) 
- `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits` - `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path` - `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path` diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 492133a3..785a3d95 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -5,7 +5,7 @@ import torch from PyTorchSimFrontend import extension_config -from torch._inductor.codecache import get_hash, write +from torch._inductor.codecache import get_hash, write, write_atomic from torch._inductor.async_compile import AsyncCompile from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen @@ -23,6 +23,13 @@ def get_write_path(src_code): return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip()))) +_HEADER_BY_HASH = {} +def store_header(src_code, spike_header, gem5_header): + _HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header) +def get_header(src_code): + return _HEADER_BY_HASH.get(get_hash(src_code.strip())) + + def get_lock_path(write_path): """Return lock file path for the given write_path (per-source_code lock).""" return os.path.join(write_path, ".compile.lock") @@ -128,40 +135,52 @@ def load(cls, source_code, vlen = kwargs['vlen'] vlenb = vlen // 8 write_path = get_write_path(source_code) - key, input_path = write(source_code, "mlir", specified_dir=write_path) - # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel - # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx - # (replaces the old C++ -global-idx pass); add more in passes/__init__.py. 
+ os.makedirs(write_path, exist_ok=True) + global_var_header = kwargs.get("global_var_header") + if global_var_header is not None: + write_atomic(os.path.join(write_path, "global_var.h"), global_var_header) + gem5_global_var_header = kwargs.get("gem5_global_var_header") + if gem5_global_var_header is not None: + write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header) + # The compile rewrites the kernel .mlir in place (run_python_passes) and reads + # it back (mlir-opt). Two compiles of the same source -- the autotune's chosen + # candidate and the final kernel -- share a write_path, so hold the per-path + # lock across the whole build to keep them from interleaving, and skip the + # rebuild when a prior build already finished (its tile_graph.onnx exists). + from filelock import FileLock from PyTorchSimFrontend.mlir.passes import ( run_python_passes, run_module_passes, POST_OPT_PASSES, run_standard_lowering, run_tog, ) - run_python_passes(input_path, vectorlane=vectorlane_size) - new_input_path = os.path.splitext(input_path)[0] - raw_tog_path = new_input_path + "_tog.py" tog_path = os.path.join(write_path, "tile_graph.onnx") - sample_mlir_path = new_input_path + "_sample" - validation_binary_path = os.path.join(write_path, validation_binary_name) - gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size) - - from filelock import FileLock - os.makedirs(write_path, exist_ok=True) lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT) - - if spad_info is not None: - link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}" - else: - link_option = "" - # Generate LLVM kernel calller and binary for validation - if extension_config.pytorchsim_functional_mode: - # Use custom malloc to avoid size error - new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" - cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) - opt_pad_cmd = 
shlex.split(cmds[0]) - translate_cmd = shlex.split(cmds[1]) - llc_cmd = shlex.split(cmds[2]) - llc_asm_cmd = shlex.split(cmds[3]) - with lock: + with lock: + key, input_path = write(source_code, "mlir", specified_dir=write_path) + if os.path.isfile(tog_path): + return key + # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel + # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx + # (replaces the old C++ -global-idx pass); add more in passes/__init__.py. + run_python_passes(input_path, vectorlane=vectorlane_size) + new_input_path = os.path.splitext(input_path)[0] + raw_tog_path = new_input_path + "_tog.py" + sample_mlir_path = new_input_path + "_sample" + validation_binary_path = os.path.join(write_path, validation_binary_name) + gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size) + + if spad_info is not None: + link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}" + else: + link_option = "" + # Generate LLVM kernel calller and binary for validation + if extension_config.pytorchsim_functional_mode: + # Use custom malloc to avoid size error + new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" + cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) + opt_pad_cmd = shlex.split(cmds[0]) + translate_cmd = shlex.split(cmds[1]) + llc_cmd = shlex.split(cmds[2]) + llc_asm_cmd = shlex.split(cmds[3]) try: # loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print) subprocess.check_call(opt_pad_cmd) @@ -195,17 +214,11 @@ def load(cls, source_code, ) raise SpadOverflowError() - # Skip if TOG file already exists - if os.path.isfile(tog_path): - return key - - # Launch tile graph generator - gem5_pad_cmd = shlex.split(gem5_cmds[0]) - gem5_translate_cmd = shlex.split(gem5_cmds[1]) - gem5_llc_cmd = shlex.split(gem5_cmds[2]) + # Launch tile graph generator + gem5_pad_cmd = shlex.split(gem5_cmds[0]) + gem5_translate_cmd = 
shlex.split(gem5_cmds[1]) + gem5_llc_cmd = shlex.split(gem5_cmds[2]) - lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT) - with lock: try: # mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix # and writes the post-vcix IR. The tile-operation-graph pass is ported @@ -241,8 +254,19 @@ def load(cls, source_code, # Run cyclesim cyclesim = CycleSimulator() cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode) + # Snapshot for the P3-trace hook below: generate_tile_graph consumes + # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty. + cycle_list_for_trace = list(cycle_list) # Create TOG + # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog -> + # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser -- + # is being superseded by the C++ trace pipeline (build_skeleton + + # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The + # per-tile cycle_list / x_offset / w_offset computed here are exactly + # what cycle_table.build_cycle_table will reuse, so both paths stay + # cycle-consistent during the transition. Kept live (pipeline must not + # break); to be retired once the trace pipeline (P3+) stabilizes. w_offset, x_offset = vectorlane_size, vectorlane_size if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size: x_offset = kwargs['loop_size'][-3] @@ -258,6 +282,33 @@ def load(cls, source_code, w_offset=w_offset, # FIXME. vector_lane=vectorlane_size ) + + # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the + # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This + # is the default simulation path (the C++ TOG); the legacy ONNX TOG is + # DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the + # .so is unused so skip emitting it. Best-effort: never breaks the compile. 
+ if os.environ.get("TORCHSIM_LEGACY_TOG") != "1": + try: + import mlir.ir as ir + from PyTorchSimFrontend.mlir.passes import ( + build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e) + pv = sample_mlir_path + "_postvcix.mlir" + _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True + with _ctx: + _mod = ir.Module.parse(open(pv).read(), _ctx) + _bs.build_skeleton(_mod) + _ntiles = len(_ct._compute_types(_mod)) + # align lengths: gem5 gives one numCycles per compute node; + # pad with the last value / truncate if it disagrees. + _cl = list(cycle_list_for_trace) + if _cl and len(_cl) != _ntiles: + _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles] + _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset) + _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv")) + _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so")) + except Exception as e: + logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}") return key class CustomAsyncCompile(AsyncCompile): diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 396396f3..e4876b5b 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -54,7 +54,7 @@ def __str__(self) -> str: def make_run_fn( self, input_tensors: torch.Tensor, output_tensors: torch.Tensor ) -> Callable[[], None]: - from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile + from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile, get_header custom_async_compile = CustomAsyncCompile() # Check already cached result. 
@@ -80,12 +80,15 @@ def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs): return cached_run_fn # Run a candidate code + _headers = get_header(self.source_code) + _header_kwargs = {} if _headers is None else { + "global_var_header": _headers[0], "gem5_global_var_header": _headers[1]} run_method = custom_async_compile.mlir( self.source_code, vectorlane_size=self.extra_args["vector_lane"], loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"], origins=self.extra_args["origins"], silent_mode=True, - autotune=self.extra_args['autotune']) + autotune=self.extra_args['autotune'], **_header_kwargs) args = [ tensor diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 725e0dc6..8f695395 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -17,7 +17,6 @@ from torch._inductor.codegen import cpp, wrapper, common, memory_planning from torch._inductor.ir import GraphPartitionSignature from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( IndentedBuffer, is_welford_reduction, @@ -1120,28 +1119,23 @@ def codegen_nodes(self, nodes, kernel_name): src_code, meta_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode: - optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2] + # Use temporaries: autotune returns [None, None, None] when it cannot + # autotune (e.g. a size-1 pointwise kernel with ranges == [1]), and + # unpacking into meta_code would clobber the valid arg_attributes that + # the fall-through below returns. 
+ optimal_src_code, optimal_meta_code = self.autotune(nodes, kernel_name)[:2] if optimal_src_code is not None: - return optimal_src_code, meta_code + return optimal_src_code, optimal_meta_code return src_code, meta_code def _prepare_simulator_headers(self, src_code): - from filelock import FileLock - - write_path = extension_codecache.get_write_path(src_code) - os.makedirs(write_path, exist_ok=True) - - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n" spad_section_end_symbol = ( f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" ) - lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT) - with lock: - write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol) - write_atomic(gem5_write_path, self.gem5_header.getvalue()) + spike_content = self.header.getvalue() + spad_end_symbol + spad_section_end_symbol + gem5_content = self.gem5_header.getvalue() + extension_codecache.store_header(src_code, spike_content, gem5_content) def get_arg_info(self, name): arg_info = dict() diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 41ec61af..8520596c 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -5,6 +5,7 @@ import operator from sympy import symbols, sympify from PyTorchSimFrontend import extension_config +from PyTorchSimFrontend import extension_codecache from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel from torch.utils._ordered_set import OrderedSet @@ -333,6 +334,10 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info codecache_def.writeline(f"spad_info={spad_info},") 
codecache_def.writeline(f"origins={origins},") codecache_def.writeline(f"arg_attributes={meta_code},") + headers = extension_codecache.get_header(src_code) + if headers is not None: + codecache_def.writeline(f"global_var_header='''{headers[0]}''',") + codecache_def.writeline(f"gem5_global_var_header='''{headers[1]}''',") codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})") wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False) return kernel_name diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 529a49b5..2b8a0676 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -21,7 +21,6 @@ from torch._inductor.autotune_process import TensorMeta from torch._inductor.virtualized import V, NullHandler, _ops as ops from torch._inductor.utils import IndentedBuffer -from torch._inductor.codecache import write_atomic import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest @@ -613,22 +612,11 @@ def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, return src_code, meta_code def _prepare_simulator_headers(self, src_code): - from filelock import FileLock - spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" - - write_path = extension_codecache.get_write_path(src_code) - os.makedirs(write_path, exist_ok=True) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - - lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT) - with lock: - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, 
self.header.getvalue()+spad_end_symbol+spad_section_end_symbol) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, self.gem5_header.getvalue()) + spike_content = self.header.getvalue()+spad_end_symbol+spad_section_end_symbol + gem5_content = self.gem5_header.getvalue() + extension_codecache.store_header(src_code, spike_content, gem5_content) def codegen_prologue_body(self): body = IndentedBuffer() diff --git a/PyTorchSimFrontend/mlir/passes/__init__.py b/PyTorchSimFrontend/mlir/passes/__init__.py index 82cadc2f..ab3cdcd3 100644 --- a/PyTorchSimFrontend/mlir/passes/__init__.py +++ b/PyTorchSimFrontend/mlir/passes/__init__.py @@ -76,8 +76,12 @@ def run_module_passes(in_path, out_path, passes, **opts): p.run(module, **opts) out = str(module) - with open(out_path, "w") as f: - f.write(out) + # Atomic write: run_python_passes rewrites the kernel .mlir in place outside + # load()'s FileLock, so a concurrent compile of the same source must never see a + # truncated file -- mlir-opt would parse it to an empty module and silently drop + # the kernel (-> undefined reference to wrapper_kernel at link). + from torch._inductor.codecache import write_atomic + write_atomic(out_path, out) return True diff --git a/PyTorchSimFrontend/mlir/passes/_mlir_util.py b/PyTorchSimFrontend/mlir/passes/_mlir_util.py new file mode 100644 index 00000000..e39f9d6f --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/_mlir_util.py @@ -0,0 +1,87 @@ +"""Small, dependency-light helpers shared across the MLIR passes. + +Every pass had its own copy of the same op-walk generator (named variously +`_iter_ops` / `_walk` / `_walk_ops`) and the same one-line attribute builders +(`_i32` / `_i64` / ...). This module is the single source for both. + +Import-safety: `walk_ops` is pure block/op attribute access and needs no MLIR +bindings, so this module does NOT import `mlir.ir` at top level -- some passes +(e.g. 
lower_vlane_idx, decompose_transfer) are deliberately importable without +the bindings present and only touch `mlir.ir` inside their run functions. The +attribute builders therefore import `mlir.ir` lazily; they require an active +MLIR context (the caller's `with ctx:`), exactly as the per-pass copies did. +""" + + +def walk_ops(block): + """Yield every op under `block` in program order, recursing into regions. + + Snapshots each block's operation list, so a caller may erase ops while + iterating (the strictest of the former copies; a superset of the rest).""" + for op in list(block.operations): + yield op + for region in op.operation.regions: + for b in region.blocks: + yield from walk_ops(b) + + +def _ir(): + import mlir.ir as ir + return ir + + +def i32(v): + """`i32` IntegerAttr for `v` (uses the active MLIR context).""" + ir = _ir() + return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), int(v)) + + +def i64(v): + """`i64` IntegerAttr for `v`.""" + ir = _ir() + return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), int(v)) + + +def i64_array(vals): + """ArrayAttr of `i64` IntegerAttrs for `vals`.""" + ir = _ir() + i = ir.IntegerType.get_signless(64) + return ir.ArrayAttr.get([ir.IntegerAttr.get(i, int(v)) for v in vals]) + + +def str_attr(v): + """StringAttr of `str(v)`.""" + ir = _ir() + return ir.StringAttr.get(str(v)) + + +# --------------------------------------------------------------------------- +# attribute readers -- accept an OpView or an Operation; `default` is returned +# when `key` is absent (callers that want the strict "must be present" behaviour +# simply never pass an absent key). 
+# --------------------------------------------------------------------------- +def _attrs(op): + return getattr(op, "operation", op).attributes + + +def attr_int(op, key, default=None): + """Integer value of `op`'s `key` attribute, or `default` if absent.""" + ir = _ir() + a = _attrs(op) + return ir.IntegerAttr(a[key]).value if key in a else default + + +def attr_bool(op, key, default=False): + """Bool value of `op`'s `key` attribute, or `default` if absent.""" + ir = _ir() + a = _attrs(op) + return bool(ir.BoolAttr(a[key]).value) if key in a else default + + +def attr_i64_array(op, key, default=None): + """`op`'s `key` ArrayAttr of integers as a Python list, or `default` if + absent (pass `default=[]` for the "missing -> empty" convention).""" + ir = _ir() + a = _attrs(op) + return ([ir.IntegerAttr(x).value for x in ir.ArrayAttr(a[key])] + if key in a else default) diff --git a/PyTorchSimFrontend/mlir/passes/build_skeleton.py b/PyTorchSimFrontend/mlir/passes/build_skeleton.py new file mode 100644 index 00000000..4c3d89cb --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/build_skeleton.py @@ -0,0 +1,566 @@ +"""build_skeleton pass (C2): reduce a kernel's post-vcix MLIR to the +*skeleton + API* form, in place. + +The trace pipeline (docs/design/togsim_cpp_trace.md) compiles a kernel to a +shape-parametric C++ trace producer. The producer is just the kernel's loop +skeleton with the data computation replaced by calls to the event-based runtime +API. This pass performs that reduction at the MLIR level: + + * `memref.dma_start` -> `togsim.dma(...) {tag_id, is_async, ...}` carrying the + runtime tag index operand (`%tag[%idx]`). + * `memref.dma_wait` -> `togsim.memory_barrier(tag_idx) {tag_id, write_bufs}`, + the explicit async-DMA sync. 
It pairs with its dma by + the RUNTIME tag slot (tag_id + the tag index), not a + compile-time id: one static dma op runs once per loop + iteration with a different `%tag[%idx]`, so only the + runtime slot can pair iteration i's dma with its wait. + * each compute node -> a single `togsim.compute {tile_id, compute_type}` + * everything else -> removed by a use-based DCE, keeping the loops and the + index/address arithmetic the survivors depend on. + +It reuses build_tog's traversal (`TogBuilder` / `_build`): loops, DMAs and +compute blocks are already identified there, each with a back-pointer to its +MLIR op(s), so this pass only adds the *rewrite*. Keeping a single traversal +guarantees the skeleton and the legacy TOG see the same structure. + +Counterpart to `build_tog.build_tog_and_mutate`. + +The DCE is safe by construction: it never erases an op whose results still have +uses, so at worst it leaves extra ops in the dump (visible for diagnosis) rather +than producing invalid IR. + +Requires the MLIR Python bindings (importing `build_tog` pulls in `mlir.ir`). +""" + +from . import togsim_ops as ts +from ._mlir_util import walk_ops, i32, i64, i64_array, str_attr +from .build_tog import ( + ir, + TogBuilder, + _build, + _reset_ids, + _find_kernel, + _value_key, + TOGDMANode, + TOGDMAWaitNode, + _COMPUTE_TYPE_NAME, +) + +#: Marker op names for the passes/__init__ fast-path (skip parsing if absent). +MARKERS = ("memref.dma_start", "memref.dma_wait") + +#: Ops the DCE must never remove (loops, terminators, our API ops). 
+_KEEP = { + "affine.for", "scf.for", "scf.while", + "affine.yield", "scf.yield", "func.return", + ts.DMA, ts.COMPUTE, ts.MEMORY_BAR, +} + + +def _kernel_block(module): + func_op = _find_kernel(module) + if func_op is None: + return None + return func_op.regions[0].blocks[0] + + +# --------------------------------------------------------------------------- +# op construction +# --------------------------------------------------------------------------- +def _arg_id_of(base_addr): + """Tensor func-arg ordinal from a build_tog base name ("arg3" -> 3); -1 if + it is not a plain block-arg base.""" + s = str(base_addr) + return int(s[3:]) if s.startswith("arg") and s[3:].isdigit() else -1 + + +def _emit_dma(ctx, dma_node, tag_id, dram_index, tag_index, read_bufs, write_bufs): + """Insert a `togsim.dma` before the original `memref.dma_start`. + + `tag_id` is the identity of this DMA's tag memref. An async DMA pairs with + its `togsim.memory_barrier` (the original dma_wait) by the RUNTIME tag slot + -- (tag_id, tag_index) -- not a compile-time identifier: one static dma op runs + once per loop iteration, each with a different runtime `%tag[%idx]` slot, so + only a runtime key can pair iteration i's dma with iteration i's wait. + + `dram_index` is the original linear DRAM index Value (the `affine.apply` + result that indexed the tensor in the `memref.dma_start`) -- carried as an + operand so the DCE keeps the address arithmetic live and the C4 lowering can + compute the real `base_addr = base[arg_id] + index*elem` (P3, approach A). + + `tag_index` is the original SRAM tag index Value (`%tag[%idx]`), carried as a + second operand: the runtime tag slot, used both to pair with the barrier and + for the double-buffer / SRAM-capacity (WAR) model. 
+ Operand order: [dram_index, tag_index] (each omitted if absent).""" + op = dma_node.op + attrs = { + ts.ATTR_DIR: i32(ts.DIR_STORE if dma_node.is_write else ts.DIR_LOAD), + ts.ATTR_DIMS: i64_array(dma_node.tile_size), + ts.ATTR_STRIDES: i64_array(dma_node.tile_stride), + ts.ATTR_ELEM_BITS: i32(dma_node.element_size), + ts.ATTR_IS_ASYNC: ir.BoolAttr.get(bool(dma_node.is_async)), + ts.ATTR_TAG_ID: i32(tag_id), + ts.ATTR_ARG_ID: i32(_arg_id_of(dma_node.base_addr)), + "base": str_attr(dma_node.base_addr), + # SRAM spad this DMA touches (load writes it, store reads it) -- sec 10. + ts.ATTR_READ_BUFS: i64_array(read_bufs), + ts.ATTR_WRITE_BUFS: i64_array(write_bufs), + } + operands = [v for v in (dram_index, tag_index) if v is not None] + ir.Operation.create( + ts.DMA, + results=[], + operands=operands, + attributes=attrs, + loc=ir.Location.unknown(ctx), + ip=ir.InsertionPoint(op), + ) + + +def _emit_memory_bar(ctx, anchor_op, tag_id, tag_index, write_bufs): + """Insert a `togsim.memory_barrier` before `anchor_op` -- the explicit + async-DMA sync that was the original `memref.dma_wait`. 
It pairs with its + async `togsim.dma` by the RUNTIME tag slot (tag_id + tag_index), and carries + the SRAM buffer that dma loaded so consumers gate on data-arrival, not on the + async dma's issue-complete.""" + attrs = { + ts.ATTR_TAG_ID: i32(tag_id), + ts.ATTR_WRITE_BUFS: i64_array(write_bufs), + } + operands = [tag_index] if tag_index is not None else [] + ir.Operation.create( + ts.MEMORY_BAR, results=[], operands=operands, attributes=attrs, + loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(anchor_op)) + + +def _flatten_add(expr): + """Top-level additive summands of an AffineExpr (`.lhs`/`.rhs` come back typed + as the base AffineExpr, so use the `isinstance`/cast pattern, not Python + isinstance).""" + if ir.AffineAddExpr.isinstance(expr): + a = ir.AffineAddExpr(expr) + return _flatten_add(a.lhs) + _flatten_add(a.rhs) + return [expr] + + +def _neg_coeff_dim(summand): + """If `summand` is `dim * c` with a negative constant `c`, return that dim's + position; else None. lower_to_vcix tags each accumulation (reduction) loop var + with coefficient -1 in the dma_wait tag index -- a SENTINEL marking the + reduction axis, not an arithmetic offset (legacy TileGraphParser skips stride + -1 for the same reason).""" + if not ir.AffineMulExpr.isinstance(summand): + return None + mul = ir.AffineMulExpr(summand) + l, r = mul.lhs, mul.rhs + dim = l if ir.AffineDimExpr.isinstance(l) else (r if ir.AffineDimExpr.isinstance(r) else None) + con = l if ir.AffineConstantExpr.isinstance(l) else (r if ir.AffineConstantExpr.isinstance(r) else None) + if dim is None or con is None or ir.AffineConstantExpr(con).value >= 0: + return None + return ir.AffineDimExpr(dim).position + + +def _strip_accum_terms(ctx, tag_index, anchor_op): + """Return a tag-index Value with the accumulation-marked (-1 coefficient) terms + dropped, so a memory_barrier waits on the SAME subtile slot its async load + wrote. 
+ + The wait tag index built by lower_to_vcix carries `-acc_iv` for each reduction + loop var; the matching load index (dma_fine_grained) is subtile-only. Without + this, at reduction iteration > 0 the producer EVALUATES `-acc_iv` to a negative + slot, so the recorded barrier slot diverges from the load slot and the runtime + tag pairing fails (TOGSim aborts with "Key does not exist in ... tag table"). + Dropping the -1 terms mirrors legacy TileGraphParser.cc, which skips stride -1 + and routes the reduction axis to a separate accum tag component; here the + per-iteration tag alloc (dma_fine_grained) already separates the reductions, so + the barrier only needs the subtile slot. + + Falls through (returns `tag_index` unchanged) for anything that is not an + affine.apply whose single result carries such a term -- e.g. the single-tile + case, whose index has no reduction term.""" + if tag_index is None: + return None + try: + apply_op = tag_index.owner + if apply_op.name != "affine.apply": + return tag_index + amap = ir.AffineMapAttr(apply_op.attributes["map"]).value + except Exception: + return tag_index + if amap.n_dims == 0 or amap.n_symbols != 0 or len(amap.results) != 1: + return tag_index + expr = amap.results[0] + dropped = sorted({p for p in (_neg_coeff_dim(s) for s in _flatten_add(expr)) + if p is not None}) + if not dropped: + return tag_index + n = amap.n_dims + kept = [i for i in range(n) if i not in dropped] + new_pos = {old: i for i, old in enumerate(kept)} + # compose the original expr with a selector that sends each dropped dim to 0 + # and renumbers the kept dims 0..k-1. 
+ sel = [ir.AffineConstantExpr.get(0) if i in dropped + else ir.AffineDimExpr.get(new_pos[i]) for i in range(n)] + new_expr = expr.compose(ir.AffineMap.get(len(kept), 0, sel)) + new_map = ir.AffineMap.get(len(kept), 0, [new_expr]) + operands = list(apply_op.operands) + new_operands = [operands[i] for i in kept] + new_apply = ir.Operation.create( + "affine.apply", + results=[ir.IndexType.get(ctx)], + operands=new_operands, + attributes={"map": ir.AffineMapAttr.get(new_map)}, + loc=ir.Location.unknown(ctx), + ip=ir.InsertionPoint(anchor_op), + ) + return new_apply.results[0] + + +def _emit_compute(ctx, compute_node, tile_id, read_bufs, write_bufs): + front = compute_node.operations[0] + attrs = { + ts.ATTR_TILE_ID: i64(tile_id), + # int code (0 vector / 1 matmul / 2 preload) consumed by the C4 lowering; + # maps directly to the Core compute-unit enum. Keep the readable name too. + ts.ATTR_COMPUTE_TYPE: i32(int(compute_node.compute_type)), + "compute_type_name": str_attr(_COMPUTE_TYPE_NAME[compute_node.compute_type]), + # SRAM buffer ids read/written (sec 10 dataflow); the bridge builds the + # dependency DAG by last-writer per buffer. + ts.ATTR_READ_BUFS: i64_array(read_bufs), + ts.ATTR_WRITE_BUFS: i64_array(write_bufs), + } + ir.Operation.create( + ts.COMPUTE, + results=[], + operands=[], + attributes=attrs, + loc=ir.Location.unknown(ctx), + ip=ir.InsertionPoint(front), + ) + + +# --------------------------------------------------------------------------- +# DCE +# --------------------------------------------------------------------------- +def _has_nonempty_region(op): + for region in op.operation.regions: + for b in region.blocks: + if len(list(b.operations)) > 0: + return True + return False + + +def _results_unused(op): + for r in op.operation.results: + if len(list(r.uses)) > 0: + return False + return True + + +def _strip_loop_iter_args(block): + """Drop loop-carried values (iter_args) from every affine.for/scf.for. 
+ + The skeleton only needs the loop STRUCTURE (iteration counts) and the + togsim.* markers -- not the data flowing through the loop. Reduction kernels + carry a *vector* accumulator as an iter_arg; EmitC/C++ cannot represent a + loop carrying a vector, so the trace .so emission fails. Since the trace is + timing-only (values come from the recorded run), we rebuild each loop without + iter_args: body uses of an iter_arg become its init value, the loop result + becomes its init, and the now-orphaned accumulate ops are removed by _dce. + """ + # Only strip a loop whose RESULTS are unused (dead for the trace): the carried + # value goes nowhere live, so dropping it is safe. A loop whose result still + # feeds a kept op (e.g. an index accumulator consumed by a togsim.dma address) + # is left untouched. Run after _dce so the result store is already gone; then + # nested reductions free up inner results round by round (outer stripped first). + while True: + tgt = None + for op in walk_ops(block): + n = op.operation.name + if (n in ("affine.for", "scf.for") and len(op.operation.results) > 0 + and _results_unused(op)): + tgt = op + break + if tgt is None: + return + _rebuild_loop_no_iter(tgt) + + +def _rebuild_loop_no_iter(op): + o = op.operation + nres = len(o.results) + n_in = len(o.operands) + inits = [o.operands[n_in - nres + i] for i in range(nres)] + keep_operands = [o.operands[i] for i in range(n_in - nres)] # bound operands only + old_block = o.regions[0].blocks[0] + oargs = list(old_block.arguments) # [iv, *iter_args] + + attrs = {na.name: na.attr for na in o.attributes} + # affine.for tags its operand groups; zero the iter-arg group (last entry). 
+ if "operandSegmentSizes" in attrs: + seg = [int(x) for x in str(attrs["operandSegmentSizes"]).split(":")[1].strip(" >").split(",")] + seg[-1] = 0 + attrs["operandSegmentSizes"] = ir.Attribute.parse( + "array") + + loc = ir.Location.unknown(o.context) + with loc: # default loc for new block args + new = ir.Operation.create(o.name, results=[], operands=keep_operands, + attributes=attrs, regions=1, loc=loc, + ip=ir.InsertionPoint(o)) + nb = new.regions[0].blocks.append(oargs[0].type) # block with the iv arg only + + oargs[0].replace_all_uses_with(nb.arguments[0]) # iv + for ba, ini in zip(oargs[1:], inits): # iter-arg uses -> init + ba.replace_all_uses_with(ini) + for res, ini in zip(o.results, inits): # loop result -> init + res.replace_all_uses_with(ini) + + term_name = "affine.yield" if o.name == "affine.for" else "scf.yield" + with ir.InsertionPoint(nb): + ir.Operation.create(term_name, results=[], operands=[], loc=loc) + new_term = list(nb.operations)[0] + for bop in list(old_block.operations)[:-1]: # move body (drop old yield) + bop.operation.move_before(new_term) + o.erase() + + +def _dce(block): + """Erase non-kept ops with no used results, to a fixed point. Safe: an op + with live SSA uses is never touched.""" + changed = True + while changed: + changed = False + victims = [] + for op in walk_ops(block): + name = op.operation.name + if name in _KEEP: + continue + if _has_nonempty_region(op): + continue + if _results_unused(op): + victims.append(op) + for op in victims: + try: + op.operation.erase() + changed = True + except Exception: + # Still referenced via something we will erase next round; retry. 
+ pass + + +# --------------------------------------------------------------------------- +# driver +# --------------------------------------------------------------------------- +def _collect_dma_nodes(builder): + """Map op-identity -> DMA/DMAWait node, by walking the built tree.""" + by_op = {} + seen = set() + + def visit(n): + if id(n) in seen: + return + seen.add(id(n)) + if isinstance(n, (TOGDMANode, TOGDMAWaitNode)) and n.op is not None: + by_op[id(n.op.operation)] = n + for c in n.children: + visit(c) + + for ln in builder.loop_nodes: + visit(ln) + return by_op + + +class _BufferIds: + """Assigns each SRAM buffer name a stable small int id, shared by DMA and + compute so the bridge can match a reader to its buffer's writer (sec 10). + The virtual SA_WEIGHTS buffer (preload -> matmul) is numbered here too, on + first sight. `None` (a non-buffer base) is -1.""" + + def __init__(self): + self._ids = {} + + def of(self, name): + if name is None: + return -1 + return self._ids.setdefault(name, len(self._ids)) + + +class _TagIds: + """Identity of a DMA's tag memref -> stable small int, plus the SRAM buffer + that tag's async DMA loads. An async dma and its memory_barrier (the original + dma_wait) share a tag memref; this assigns it a tag_id (so the runtime can + pair them by the runtime tag slot) and remembers the loaded buffer so the + barrier can release it to consumers. 
Pairing is by tag, never a static id.""" + + def __init__(self): + self._ids = {} # tag value-key -> tag_id + self._buf = {} # tag value-key -> SRAM buffer id the dma loads + + def bind(self, key, buf): + tag_id = self._ids.setdefault(key, len(self._ids)) + self._buf[key] = buf + return tag_id + + def lookup(self, key): + """(tag_id, buffer) for a tag memref, or None if no dma used it.""" + if key not in self._ids: + return None + return self._ids[key], self._buf[key] + + +def _emit_computes(ctx, builder, bufs): + """Step 1: each compute node -> one togsim.compute carrying its tile_id and + the ids of the SRAM buffers it reads/writes. Returns the count.""" + from . import dep_analysis as dep # lazy: dep_analysis imports build_skeleton + n = 0 + for tile_id, cn in enumerate(builder.compute_nodes): + if not cn.operations: + continue + reads, writes = dep.compute_buffers(cn) + _emit_compute(ctx, cn, tile_id, + sorted(bufs.of(b) for b in reads), + sorted(bufs.of(b) for b in writes)) + n += 1 + return n + + +def _emit_one_dma(ctx, op, node, builder, bufs, tags): + """Rewrite one memref.dma_start as togsim.dma. A load reads DRAM and writes + its SRAM spad; a store reads the spad and writes DRAM -- which sets the + read/write buffer that drives the dependency edge (sec 10). The tag memref is + bound to a tag_id (with its loaded buffer) so the paired memory_barrier finds + it by the runtime tag slot.""" + from . import dep_analysis as dep # lazy: dep_analysis imports build_skeleton + f = builder._dma_start_fields(op) + dram_indices = f["dst_indices"] if node.is_write else f["src_indices"] + dram_index = dram_indices[0] if dram_indices else None + tag_indices = f["tag_indices"] + tag_index = tag_indices[0] if tag_indices else None + # the spad is the SRAM side of the copy: dst for a load, src for a store. 
+ spad_id = bufs.of(dep._global_of(f["src"] if node.is_write else f["dst"])) + read_bufs = [spad_id] if node.is_write else [] + write_bufs = [] if node.is_write else [spad_id] + tag_id = tags.bind(_value_key(f["tag"]), spad_id) + _emit_dma(ctx, node, tag_id, dram_index, tag_index, read_bufs, write_bufs) + + +def _emit_one_wait(ctx, op, tags): + """Rewrite one memref.dma_wait as togsim.memory_barrier -- the explicit + async-DMA sync already in the IR. Paired with its dma by the tag memref + (tag_id) and the runtime tag index; carries the buffer the dma loaded. + Returns True iff emitted (a wait whose tag no dma used is dropped).""" + operands = list(op.operation.operands) + tag = operands[0] + tag_index = operands[1] if len(operands) >= 2 else None + binding = tags.lookup(_value_key(tag)) + if binding is None: + return False + tag_id, buf = binding + # honor lower_to_vcix's -1 accumulation marker: strip the reduction terms so + # the barrier slot equals the subtile slot the paired async load wrote. + tag_index = _strip_accum_terms(ctx, tag_index, op) + _emit_memory_bar(ctx, op, tag_id, tag_index, [buf]) + return True + + +def _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs): + """Step 2: rewrite memref.dma_start -> togsim.dma and memref.dma_wait -> + togsim.memory_barrier in program order. An async dma and its barrier are + paired by the RUNTIME tag slot (tag_id + tag index), not a compile-time id: + one static dma op runs per loop iteration with a different `%tag[%idx]`, so + only the runtime slot can pair iteration i's dma with iteration i's wait. 
+ Returns the original ops to erase and the (dma, wait) counts.""" + tags = _TagIds() + originals = [] + n_dma = n_wait = 0 + for op in list(walk_ops(block)): + name = op.operation.name + if name == "memref.dma_start": + node = dma_by_op.get(id(op.operation)) + if node is None: + continue + _emit_one_dma(ctx, op, node, builder, bufs, tags) + originals.append(op) + n_dma += 1 + elif name == "memref.dma_wait": + if _emit_one_wait(ctx, op, tags): + n_wait += 1 + originals.append(op) + return originals, n_dma, n_wait + + +def build_skeleton(module): + """Reduce `func.func @kernel` in `module` to the skeleton+API form, in place. + + Four steps: analyze the kernel into loop/compute/DMA nodes, emit a + togsim.compute per compute node, rewrite the DMAs/waits to togsim.dma/wait, + then DCE the leftover data computation. Returns a short text report (counts). + """ + _reset_ids() + builder = TogBuilder() + _build(module, builder) # populates loop/compute nodes + op back-pointers + + block = _kernel_block(module) + if block is None: + return "no @kernel found" + ctx = module.context + dma_by_op = _collect_dma_nodes(builder) + bufs = _BufferIds() + + n_compute = _emit_computes(ctx, builder, bufs) + originals, n_dma, n_wait = _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs) + + # erase the now-replaced originals (result-less -> safe), then strip the + # leftover data computation. + for op in originals: + try: + op.operation.erase() + except Exception: + pass + _dce(block) # drop dead consumers (e.g. 
the result store) first, + _strip_loop_iter_args(block) # so a now-unused loop result lets us strip its iter_args + _dce(block) # then clean the orphaned accumulate ops + + return ("skeleton: compute=%d dma=%d wait=%d (unpaired waits dropped)" + % (n_compute, n_dma, n_wait)) + + +def run(module, vectorlane=128): + """passes/__init__ pass protocol entry (vectorlane unused; kept for parity).""" + build_skeleton(module) + + +def run_skeleton(in_path, out_path=None): + """Read post-vcix MLIR at `in_path`, reduce to skeleton+API, write it out. + + Requires the MLIR bindings. + """ + if out_path is None: + out_path = in_path + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(in_path).read(), ctx) + report = build_skeleton(module) + with open(out_path, "w") as fh: + fh.write(str(module)) + return report + + +def main(argv): + import argparse + + parser = argparse.ArgumentParser(prog="build_skeleton.py") + parser.add_argument("input") + parser.add_argument("--out", default=None) + args = parser.parse_args(argv[1:]) + report = run_skeleton(args.input, args.out) + import sys + sys.stderr.write(report + "\n") + return 0 + + +if __name__ == "__main__": + import sys + sys.exit(main(sys.argv)) diff --git a/PyTorchSimFrontend/mlir/passes/cycle_table.py b/PyTorchSimFrontend/mlir/passes/cycle_table.py new file mode 100644 index 00000000..40dd3459 --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/cycle_table.py @@ -0,0 +1,103 @@ +"""cycle_table (C3): the precomputed tile_id -> (cycle, overlapping_cycle) table +the C++ trace pipeline looks up at runtime (docs/design/togsim_cpp_trace.md sec +6, sec 9.8 task 4). + +A `togsim.compute(tile_id=...)` in the trace says *which* tile to compute, not +how long it takes. Because tiles are fixed size, each tile's cost is invariant +(only the trip count varies with shape), so it is sampled once and stored here, +keyed by `tile_id`. 
Two numbers per tile, mirroring the legacy TOG: + + * `cycle` -- full compute latency, sampled by gem5 sample-mode + (the existing measurement: `_rewrite_loop_steps` + + `_insert_compute_markers` in build_tog, run through + CycleSimulator -> the per-tile `cycle_list`). + * `overlapping_cycle` -- the portion that overlaps the previous instruction in + the systolic pipeline; the timing core uses it as + `finish = prev.finish + cycle - overlapped` (Core.cc). + Derived exactly as the legacy path does + (tog_generator.generate_tile_graph): + type 0 (VectorCompute) -> 0 + type 1 (MatmulCompute) -> max(cycle - x_offset, 0) + type 2 (MatmulPreload) -> max(cycle - w_offset, 0) + +This module only *builds/serializes* the table from a cycle_list; obtaining the +cycle_list reuses the existing sample-mode + gem5 path (wired in P3 task 5). The +`tile_id` order matches build_skeleton's `compute_nodes` order, which matches the +legacy TOG, so the same sampling keys both paths. + +Requires the MLIR Python bindings (to read the skeleton's togsim.compute ops). +""" + +import json + +from . import togsim_ops as ts +from ._mlir_util import walk_ops +from .build_tog import ( + ir, + VECTOR_COMPUTE, + MATMUL_COMPUTE, # noqa: F401 (documents the type enum used by the formula) + MATMUL_PRELOAD, +) + + +def overlapping_cycle(cycle, compute_type, x_offset, w_offset): + """Hideable (pipeline-overlapped) portion of `cycle`. 
Mirrors + tog_generator.generate_tile_graph.""" + if compute_type <= VECTOR_COMPUTE: # VectorCompute: no systolic overlap + return 0 + offset = w_offset if compute_type == MATMUL_PRELOAD else x_offset + return max(int(cycle) - int(offset), 0) + + +def _compute_types(skeleton_module): + """tile_id-ordered list of compute_type ints, from the skeleton's + togsim.compute ops.""" + items = [] + for op in walk_ops(skeleton_module.body): + if op.operation.name != ts.COMPUTE: + continue + tid = ir.IntegerAttr(op.operation.attributes[ts.ATTR_TILE_ID]).value + ct = ir.IntegerAttr(op.operation.attributes[ts.ATTR_COMPUTE_TYPE]).value + items.append((tid, ct)) + items.sort() + return [t for _, t in items] + + +def build_cycle_table(skeleton_module, cycle_list, x_offset, w_offset): + """Return `[(cycle, overlapping_cycle), ...]` indexed by tile_id. + + `cycle_list` is the per-tile gem5 measurement (compute_nodes order == + tile_id order). `x_offset`/`w_offset` are the systolic-fill offsets the + legacy path computes from the vector-lane size / loop size.""" + types = _compute_types(skeleton_module) + if len(cycle_list) != len(types): + raise ValueError( + "cycle_list (%d) does not match #compute tiles (%d)" + % (len(cycle_list), len(types))) + return [(int(c), overlapping_cycle(c, t, x_offset, w_offset)) + for c, t in zip(cycle_list, types)] + + +def dump_cycle_table(table, path, x_offset=None, w_offset=None): + """Serialize the table as a sidecar JSON next to the trace `.so`. 
The P3 C6 + loader reads it and sets compute_cycle + overlapping_cycle on each emitted + Instruction.""" + with open(path, "w") as fh: + json.dump({"x_offset": x_offset, "w_offset": w_offset, + "table": [list(e) for e in table]}, fh) + return path + + +def load_cycle_table(path): + with open(path) as fh: + return json.load(fh) + + +def dump_cycle_table_tsv(table, path): + """Plain `cycleoverlapping` per line, in tile_id order -- the trivial + format the C++ `--cycle_table` loader (main.cc, P3 trace pipeline) reads with + ifstream (no JSON dependency in TOGSim).""" + with open(path, "w") as fh: + for cycle, overlapping in table: + fh.write("%d\t%d\n" % (int(cycle), int(overlapping))) + return path diff --git a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py index c0e82b66..10b2edfb 100644 --- a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py +++ b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py @@ -32,13 +32,7 @@ OP_NAME = "togsim.transfer" MARKERS = (OP_NAME,) - -def _iter_ops(block): - for op in list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) +from ._mlir_util import walk_ops def _int_array(attr): @@ -92,7 +86,7 @@ def run(module, vectorlane=128, **_): targets = [] for region in module.operation.regions: for b in region.blocks: - for op in _iter_ops(b): + for op in walk_ops(b): if op.operation.name == OP_NAME: targets.append(op.operation) diff --git a/PyTorchSimFrontend/mlir/passes/dep_analysis.py b/PyTorchSimFrontend/mlir/passes/dep_analysis.py new file mode 100644 index 00000000..06d8270d --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/dep_analysis.py @@ -0,0 +1,234 @@ +"""dep_analysis.py -- dependency-edge analysis for the C++ trace pipeline (P3, sec 10). + +The current TOG pass does NO dependency analysis (it emits a lexical loop tree + +runtime tags). 
This module derives the producer->consumer edges that the explicit +dataflow trace needs, from two sources available on the post-vcix IR (before +build_skeleton collapses the compute regions): + + 1. SRAM access: each DMA/compute's read/write SRAM buffer(s), recovered by + following SSA (a vcix.iv's input vector -> its vector.transfer_read -> the + memref -> @global), and the DMA's spad operand. Edge: a reader depends on + the last node that wrote the same buffer. + 2. vcix preload/matmul pairing: a matmul (vcix opcode 0) consumes the weights a + preceding preload (opcode 1) loaded into the systolic array -- an SA-internal + dependency NOT visible as a memref access, so it comes from the opcode order. + +This is a node-level analysis (one node per build_tog compute/DMA node); the loops +replay the nodes, so loop-carried edges (the Y_spad accumulator) are materialized +per iteration downstream. First cut: buffer granularity (slot-level value matching +is a later refinement). Output is an edge list for validation / to drive emit. +""" +import sys +import os + +from .build_tog import TogBuilder, ir, _reset_ids +from . import build_skeleton as _bs + + +def _global_of(memref_val): + """memref SSA value -> @global symbol name (e.g. 'X_spad'), or None.""" + owner = memref_val.owner + op = owner if isinstance(owner, ir.Operation) else getattr(owner, "operation", None) + if op is None: + return None + if op.name == "memref.get_global": + return str(op.attributes["name"]).strip('@" ') + # walk through view-like ops (subview/cast) to their source + if op.operands: + try: + return _global_of(op.operands[0]) + except Exception: + return None + return None + + +# Ops that touch SRAM-buffer DATA, by category. A view op (subview/reinterpret_cast) +# instead PRODUCES a memref -- pure address computation, skipped here; the real access +# is the load/store using it, whose memref operand _global_of traces back through the +# view to the @global. 
Anything else carrying a memref operand raises, so a NEW fusion +# pattern is caught at compile time rather than as a silent runtime deadlock. +_LOAD_OPS = {"vector.transfer_read", "affine.vector_load", "vector.load", + "memref.load", "affine.load"} +_STORE_OPS = {"vector.transfer_write", "affine.vector_store", "vector.store", + "memref.store", "affine.store"} +_IGNORE_OPS = {"memref.dealloc"} # lifetime, not a data access + + +def _is_memref(v): + try: + return ir.MemRefType.isinstance(v.type) + except Exception: + return False + + +def _walk_compute_ops(cn): + """Every op in the compute node, recursing into nested regions (loop bodies). A + fused epilogue (BatchNorm/ReLU) keeps its ops inside an un-unrolled affine.for, so + a top-level-only scan (cn.operations) sees just the loop and misses every access.""" + for top in cn.operations: + stack = [top] + while stack: + op = stack.pop() + yield op + for region in op.operation.regions: + for block in region.blocks: + stack.extend(block.operations) + + +def _rw_buffers_of_compute(cn): + """(reads, writes): the @global SRAM buffers a compute node reads/writes, walking + nested regions and classifying each op that touches a memref.""" + reads, writes = set(), set() + def rd(v): + b = _global_of(v) + if b: + reads.add(b) + def wr(v): + b = _global_of(v) + if b: + writes.add(b) + for op in _walk_compute_ops(cn): + if any(_is_memref(r) for r in op.results): + continue # view/cast/alloc -- address only + mrefs = [v for v in op.operands if _is_memref(v)] + if not mrefs: + continue + name = op.name + if name in _LOAD_OPS: + for v in mrefs: + rd(v) + elif name in _STORE_OPS: + for v in mrefs: + wr(v) # the store target memref + elif name == "memref.copy": + rd(mrefs[0]) + wr(mrefs[-1]) + elif name.startswith("linalg."): # DPS: ins read, outs read+write + for v in op.inputs: + if _is_memref(v): + rd(v) + for v in op.outputs: + if _is_memref(v): + rd(v) + wr(v) + elif name in _IGNORE_OPS: + continue + else: + raise RuntimeError( 
+ f"dep_analysis: unclassified memref op '{name}' in a compute node -- " + f"it touches an SRAM buffer; classify it in _LOAD_OPS/_STORE_OPS") + return reads, writes + + +def _dma_buffer(builder, dma_node): + """The SRAM spad buffer a DMA touches (dst for load, src for store).""" + try: + f = builder._dma_start_fields(dma_node.op) + except Exception: + return None + val = f["dst"] if not dma_node.is_write else f["src"] + return _global_of(val) + + +# Virtual buffer for the systolic-array weight registers: a preload writes it, +# the following matmul reads it. This folds the SA-internal preload->matmul +# dependency (not a memref access) into the uniform "last-writer per buffer" rule. +SA_WEIGHTS = "__SA_WEIGHTS__" + + +def compute_buffers(cn): + """(read_buffers, write_buffers) for one compute node, including the virtual + SA_WEIGHTS edge (preload writes it, matmul reads it).""" + reads, writes = _rw_buffers_of_compute(cn) + if cn.compute_type == 1: # MATMUL consumes the preloaded weights + reads.add(SA_WEIGHTS) + elif cn.compute_type == 2: # PRELOAD loads them + writes.add(SA_WEIGHTS) + return reads, writes + + +def analyze(module): + """Return (nodes, edges). nodes: list of dicts; edges: list of (consumer_idx, + producer_idx, reason).""" + _reset_ids() + builder = TogBuilder() + _bs._build(module, builder) + + nodes = [] + # DMA nodes only (the map also contains TOGDMAWaitNode; keep real DMAs). 
+ dma_nodes = [dn for dn in dict.fromkeys(_bs._collect_dma_nodes(builder).values()) + if hasattr(dn, "is_write")] + for dn in dma_nodes: + buf = _dma_buffer(builder, dn) + nodes.append({ + "kind": "STORE" if dn.is_write else "LOAD", + "buf": buf, "arg": str(dn.base_addr), + "reads": {buf} if dn.is_write else set(), + "writes": {buf} if not dn.is_write else set(), + "node": dn, + }) + for cn in builder.compute_nodes: + if not cn.operations: + continue + ct = {0: "VECTOR", 1: "MATMUL", 2: "PRELOAD"}.get(cn.compute_type, f"c{cn.compute_type}") + creads, cwrites = _rw_buffers_of_compute(cn) + nodes.append({ + "kind": ct, + "reads": creads, + "writes": cwrites, + "node": cn, + "compute_type": cn.compute_type, + }) + + # Order nodes by program position (last-writer needs program order: e.g. the + # store reads Y_spad written by the matmul, which lexically precedes it). + pos = {} + idx = [0] + def _index(op): + pos[op] = idx[0]; idx[0] += 1 + for r in op.regions: + for b in r.blocks: + for o in b.operations: + _index(o) + _index(module.operation) + def _key(n): + node = n["node"] + op = getattr(node, "op", None) or (node.operations[0] if getattr(node, "operations", None) else None) + return pos.get(op, 1 << 30) + nodes.sort(key=_key) + + # Edges: (1) buffer last-writer, (2) preload->matmul. 
+ edges = [] + last_writer = {} # buffer -> node idx + prev_preload = None + for i, n in enumerate(nodes): + for b in sorted(n["reads"]): + if b in last_writer: + edges.append((i, last_writer[b], f"reads {b}")) + if n["kind"] == "MATMUL" and prev_preload is not None: + edges.append((i, prev_preload, "uses preloaded weights (vcix op1->op0)")) + for b in n["writes"]: + last_writer[b] = i + if n["kind"] == "PRELOAD": + prev_preload = i + return nodes, edges + + +def _main(): + path = sys.argv[1] + ctx = ir.Context(); ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(path).read(), ctx) + nodes, edges = analyze(module) + print("=== nodes ===") + for i, n in enumerate(nodes): + r = ",".join(sorted(n["reads"])) or "-" + w = ",".join(sorted(n["writes"])) or "-" + print(f" #{i:<2} {n['kind']:<8} reads[{r}] writes[{w}]") + print("=== edges (consumer -> producer) ===") + for c, p, why in edges: + print(f" #{c} ({nodes[c]['kind']}) -> #{p} ({nodes[p]['kind']}) [{why}]") + + +if __name__ == "__main__": + _main() diff --git a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py index 3f583ef2..d7571d2b 100644 --- a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py +++ b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py @@ -21,6 +21,7 @@ Pipeline entry point: run_fine_grained(in_path, out_path, vectorlane). 
""" +import itertools import os import sys @@ -30,6 +31,8 @@ import mlir.ir as ir # noqa: E402 +from ._mlir_util import walk_ops, attr_i64_array + MARKERS = ("subtile_size",) # only subtile DMAs are split MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3 @@ -54,12 +57,6 @@ def _const_int(value, default=-1): return default -def _int_array_attr(op, key): - if key not in op.attributes: - return [] - return [ir.IntegerAttr(a).value for a in ir.ArrayAttr(op.attributes[key])] - - def _is_block_arg(v): return isinstance(v, ir.BlockArgument) @@ -106,13 +103,13 @@ def tile_shape(self): return list(mt.shape) def subtile_size(self): - return _int_array_attr(self.op, "subtile_size") + return attr_i64_array(self.op, "subtile_size", default=[]) def sram_stride(self): - return _int_array_attr(self.op, "sram_stride") + return attr_i64_array(self.op, "sram_stride", default=[]) def dram_stride(self): - return _int_array_attr(self.op, "dram_stride") + return attr_i64_array(self.op, "dram_stride", default=[]) def is_async(self): a = self.op.attributes @@ -244,6 +241,27 @@ def _const_index(v, ip): ir.IntegerAttr.get(ir.IndexType.get(), v), ip=ip).result +def _fresh_tag(dma): + """Give this DMA a fresh tag memref.alloc right BEFORE the (pre-split) coarse + dma_start, and rewire every use of the old tag -- the dma_start re-emitted + below AND its dma_wait -- to it. The coarse dma sits at the reduction-loop body + level (it has not been wrapped in a subtile load nest yet), so the alloc there + dominates both the load nest fine-grained is about to build and the sibling + wait nest. Each reduction iteration thus allocates its own tag -> successive + iterations are distinct (multi-tile-K / conv) and the per-iteration tag + semantics is in the IR, not reconstructed downstream. 
Old alloc becomes dead.""" + old = dma.tag + new_tag = ir.Operation.create("memref.alloc", results=[old.type], + operands=[], ip=ir.InsertionPoint(dma.op)).results[0] + old.replace_all_uses_with(new_tag) + dma.tag = new_tag + # the old (func-entry, per-tensor unique) alloc is now dead -- erase it. + try: + old.owner.erase() + except Exception: + pass + + # --------------------------------------------------------------------------- # Loop-nest construction # --------------------------------------------------------------------------- @@ -293,20 +311,12 @@ def _reaches(value, target): # --------------------------------------------------------------------------- # Pass driver # --------------------------------------------------------------------------- -def _iter_ops(block): - for op in list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) - - def _run_func(func, vectorlane): from mlir.dialects import linalg # First matmul only. matmul = None dmas = [] - for op in _iter_ops(func.regions[0].blocks[0]): + for op in walk_ops(func.regions[0].blocks[0]): name = op.operation.name if name == "linalg.matmul" and matmul is None: matmul = op @@ -363,16 +373,30 @@ def _run_func(func, vectorlane): for d, f in enumerate(fuse["w_to_fused"]): bounds[f] = w_counts[d] + # Give each load a fresh per-iteration tag alloc just before its coarse dma + # (rewiring its dma_wait via the old tag's uses), so the tag is distinct per + # reduction iteration -- positioned to match the per-iteration tag semantics. + _fresh_tag(mvin_input) + _fresh_tag(mvin_weight) + # Insert the fused nest at the weight DMA (the later of the two): both DMAs' # original DRAM base indices (src_idx[0], computed in the enclosing loops) must # dominate the nest. Codegen emits input before weight, matching the C++ pass # which fuses after the weight subtile loop. 
ip = ir.InsertionPoint(mvin_weight.op) - fused_ivs, body_ip = _build_for_nest(bounds, ip) - in_ivs = [fused_ivs[fuse["in_to_fused"][d]] for d in range(rank)] - w_ivs = [fused_ivs[fuse["w_to_fused"][d]] for d in range(rank)] - _emit_dma(mvin_input, in_ivs, vectorlane, body_ip) - _emit_dma(mvin_weight, w_ivs, vectorlane, body_ip) + # Unroll the fused nest, emitting each distinct input/weight subtile ONCE (a load + # is invariant to the other operand's dims, so the cross-product re-emits it + # identically). Dedup by the operand's own coords; keep the fused issue order. + seen_in, seen_w = set(), set() + for it in itertools.product(*[range(b) for b in bounds]): + in_key = tuple(it[fuse["in_to_fused"][d]] for d in range(rank)) + if in_key not in seen_in: + seen_in.add(in_key) + _emit_dma(mvin_input, [_const_index(c, ip) for c in in_key], vectorlane, ip) + w_key = tuple(it[fuse["w_to_fused"][d]] for d in range(rank)) + if w_key not in seen_w: + seen_w.add(w_key) + _emit_dma(mvin_weight, [_const_index(c, ip) for c in w_key], vectorlane, ip) mvin_input.op.erase() mvin_weight.op.erase() diff --git a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py index f5b841bb..998a6db5 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py +++ b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py @@ -22,6 +22,8 @@ WAIT_NAME = "memref.dma_wait" MARKERS = (OP_NAME, WAIT_NAME) +from ._mlir_util import attr_i64_array + # func7 instruction codes (CustomDMAAttribute.h) CONFIG, CONFIG2, CONFIG3, CONFIG4 = 0, 4, 5, 6 MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3 @@ -124,8 +126,8 @@ def elem_addr_i64(memref_val, indices, mtype, elem_bytes): tile_shape = _subtile(op) if tile_shape is None: tile_shape = list(dst_ty.shape) if is_mvin else list(src_ty.shape) - dram_strides = _int_array(op, "dram_stride") - spad_strides = _int_array(op, "sram_stride") + dram_strides = attr_i64_array(op, "dram_stride") + spad_strides = 
attr_i64_array(op, "sram_stride") assert len(tile_shape) == len(dram_strides) == len(spad_strides), \ f"shape/stride rank mismatch: {tile_shape} {dram_strides} {spad_strides}" @@ -180,11 +182,6 @@ def _subtile(op): return [IntegerAttr(a).value for a in ArrayAttr(op.attributes["subtile_size"])] -def _int_array(op, name): - from mlir.ir import ArrayAttr, IntegerAttr - return [IntegerAttr(a).value for a in ArrayAttr(op.attributes[name])] - - def _elem_bytes(elem_type): from mlir.ir import IntegerType, FloatType bits = (IntegerType(elem_type).width if IntegerType.isinstance(elem_type) diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py new file mode 100644 index 00000000..3d1f7cde --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py @@ -0,0 +1,607 @@ +"""lower_to_emitc pass (C4): skeleton+API MLIR -> EmitC -> C++ -> trace `.so`. + +Second stage of the C++ trace pipeline (docs/design/togsim_cpp_trace.md, sec +5-7). Takes the skeleton+API module from `build_skeleton` (loop nest + +`togsim.*` ops) and produces an EmitC module whose single entry function + + extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n) + +mirrors the loop skeleton, with every `togsim.*` op as an `emitc.call_opaque` +to the matching `togsim_runtime.h` free function (`togsim_ops.EMITC_CALLEE`). +`mlir-translate --mlir-to-cpp` renders it to C++, compiled to a `.so` that +exports `togsim_kernel` and leaves `togsim_dma/wait/compute/signal` undefined for +the TOGSim loader to resolve at `dlopen`. + +How the lowering is done -- it drives the *upstream* EmitC conversion passes and +adds only the glue they cannot do: + + 1. (python) Rewrite the unregistered `togsim.*` ops to `emitc.call_opaque`. + Unregistered ops have no registered conversion patterns, so this must be a + custom rewrite (design sec 8). 
Also rewrite the kernel's signature to the + ABI form (drop the memref tensor args -- the trace producer never touches + tensor data; base addresses are deferred to P3) and drop the aux + globals / wrapper func. + 2. (upstream passes, in-process PassManager) + func.func(lower-affine) -> convert-scf-to-emitc + -> convert-arith-to-emitc -> convert-func-to-emitc + This is the EmitC infrastructure: it lowers the affine/scf loop nest to + `emitc.for`, the index/arith (loop bounds, and in P3 the address + arithmetic) to EmitC, and the func to `emitc.func`. + 3. (python) Two small fixups the passes leave behind in this LLVM 20 build: + * `convert-scf-to-emitc` emits `emitc.for` with `index`-typed bounds, so + `convert-arith-to-emitc` (which makes constants `!emitc.size_t`) leaves + `builtin.unrealized_conversion_cast` on the bounds that nothing folds + and `mlir-to-cpp` cannot print (design sec 8 "EmitC coverage" risk). + `_fold_for_bound_casts` rewrites those casts away. + * add the `extern "C"` specifier so `dlsym` finds the entry unmangled. + +Requires the MLIR Python bindings (incl. `mlir.passmanager`); the .cpp/.so +steps additionally require `mlir-translate` (TORCHSIM_LLVM_PATH) and a host C++ +compiler. +""" + +import os +import subprocess + +from mlir.passmanager import PassManager + +from . import togsim_ops as ts +from ._mlir_util import walk_ops, i32, i64, attr_int, attr_i64_array +from .build_tog import ir, _find_kernel + +#: emitted entry symbol (== ts.ENTRY_SYMBOL == "togsim_kernel"). +ENTRY = ts.ENTRY_SYMBOL + +#: EmitC type of the opaque EmitCtx* threaded through every call. +CTX_TYPE = '!emitc.ptr>' + +#: upstream EmitC conversion pipeline (the infrastructure this pass drives). 
+_PIPELINE = ("builtin.module(" + "convert-vector-to-scf{full-unroll=true}," + "func.func(lower-affine)," + "func.func(lower-vector-multi-reduction)," + "convert-scf-to-emitc," + "convert-arith-to-emitc," + "convert-func-to-emitc)") + +#: prepended to the mlir-to-cpp output; pulls in size_t/intN_t and the ABI. +_PRELUDE = ( + "#include <cstddef>\n" + "#include <cstdint>\n" + "using std::size_t;\n" + '#include "togsim_runtime.h"\n' +) + + +# --------------------------------------------------------------------------- +# attribute builders / readers +# --------------------------------------------------------------------------- +def _idx(v): + return ir.IntegerAttr.get(ir.IndexType.get(), int(v)) + + +def _opaque(ctx, text): + return ir.Attribute.parse('#emitc.opaque<"%s">' % text, ctx) + + +def _arr(ctx, vals): + """A C compound-literal `(const int64_t[]){...}` arg, or `nullptr` if empty + (the call site decays it to a `const int64_t*`).""" + vals = list(vals) + if not vals: + return _opaque(ctx, "nullptr") + return _opaque(ctx, "(const int64_t[]){%s}" % ", ".join(str(int(v)) for v in vals)) + + +def _attr_bool(op, key): + return 1 if ir.BoolAttr(op.operation.attributes[key]).value else 0 + + +# --------------------------------------------------------------------------- +# step 1: rewrite signature + togsim.* ops (the unregistered-op glue) +# --------------------------------------------------------------------------- +def _strip_aux(module): + """Erase memref.global decls and every func except @kernel (the wrapper).""" + victims = [] + for op in module.body.operations: + name = op.operation.name + if name == "memref.global": + victims.append(op) + elif name == "func.func": + if ir.StringAttr(op.operation.attributes["sym_name"]).value != "kernel": + victims.append(op) + for op in victims: + op.operation.erase() + + +def _rewrite_signature(kernel, ctx): + """Replace @kernel's memref tensor args with the ABI args + (EmitCtx*, int64_t* shape_args, int32_t n) and rename it to togsim_kernel.
+ Returns the ctx Value.""" + block = kernel.regions[0].blocks[0] + for arg in block.arguments: + if len(list(arg.uses)) > 0: + raise ValueError( + "kernel arg still used after build_skeleton; cannot drop it " + "(expected the DCE to have removed all tensor-data ops)") + # erase existing (memref) args high-to-low, then append the ABI args. + for i in reversed(range(len(block.arguments))): + block.erase_argument(i) + ptr = ir.Type.parse(CTX_TYPE, ctx) + i64ptr = ir.Type.parse("!emitc.ptr<i64>", ctx) + i32 = ir.IntegerType.get_signless(32) + loc = ir.Location.unknown(ctx) + block.add_argument(ptr, loc) + block.add_argument(i64ptr, loc) + block.add_argument(i32, loc) + kernel.operation.attributes["function_type"] = ir.TypeAttr.get( + ir.FunctionType.get([ptr, i64ptr, i32], [])) + kernel.operation.attributes["sym_name"] = ir.StringAttr.get(ENTRY) + return block.arguments[0] + + +def _call(ctx, ctx_val, op, callee, arg_attrs): + """Insert emitc.call_opaque <callee>(ctx) {args=[0:index, ...]} before `op`. + The leading `0 : index` references operand 0 (ctx); other entries are + literal C args (integer attr -> literal, #emitc.opaque -> verbatim).""" + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val], + attributes={"callee": ir.StringAttr.get(callee), + "args": ir.ArrayAttr.get([_idx(0)] + arg_attrs)}, + loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(op)) + + +def _innermost_outer_loop(block): + """Deepest `affine.for {outer_loop=true}` (the PARALLEL/ACCUMULATION + boundary).
Returns the op or None if the kernel has no parallel loop.""" + found = [None] + + def is_outer(op): + a = op.operation.attributes + return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value + + def walk(b): + for op in b.operations: + if op.operation.name == "affine.for" and is_outer(op): + found[0] = op # nested outer loops overwrite -> deepest wins + for r in op.operation.regions: + for bb in r.blocks: + walk(bb) + + walk(block) + return found[0] + + +def _is_outer(forop): + a = forop.operation.attributes + return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value + + +def _parallel_loop_chain(block): + """The nested chain of `affine.for {outer_loop}` from `block` inward (one + work-item's parallel indices). Empty if the kernel has no parallel loop.""" + chain = [] + cur = block + while True: + nxt = None + for op in cur.operations: + if op.operation.name == "affine.for" and _is_outer(op): + nxt = op + break + if nxt is None: + break + chain.append(nxt) + cur = nxt.operation.regions[0].blocks[0] + return chain + + +def _const_op(value): + """The defining arith/emitc constant Operation if `value` is a constant + result, else None (block args / other ops).""" + owner = value.owner + if isinstance(owner, ir.Block): + return None + return owner if owner.name in ("arith.constant", "emitc.constant") else None + + +def _outline_work_item(ctx, kernel, ctx_val): + """Outline the innermost parallel work-item body into a uniform + `togsim_kernel_tile(ctx, iv, n)` func, replacing it with a + `togsim_dispatch(ctx, togsim_kernel_tile, iv, n)` call (sec 9.3). The + work-item SCOPE becomes the function body; the runtime wrapper owns the + core-alloc + the TILE_BEGIN/TILE_END boundary (a decorator). One uniform tile + signature -> a single general dispatcher serves every kernel. + + Runs after `_rewrite_togsim_ops`, so the moved body holds emitc.call_opaque + (not togsim.* ops). 
The only values captured from outside the body are ctx, + the enclosing parallel induction vars, and constants -- threaded via the iv + array (parallel IVs) / cloned (constants); anything else is unsupported + (dynamic shape -> P4).""" + kblk = kernel.regions[0].blocks[0] + chain = _parallel_loop_chain(kblk) + if chain: + L = chain[-1] + Lbody = L.operation.regions[0].blocks[0] + ivs = [c.operation.regions[0].blocks[0].arguments[0] for c in chain] + else: # no parallel loop -> the whole kernel body is one work-item + L = None + Lbody = kblk + ivs = [] + + i64 = ir.IntegerType.get_signless(64) + i32 = ir.IntegerType.get_signless(32) + idxty = ir.IndexType.get() + ctxty = ir.Type.parse(CTX_TYPE, ctx) + i64ptr = ir.Type.parse("!emitc.ptr<i64>", ctx) + loc = ir.Location.unknown(ctx) + + # --- the outlined tile function (before the kernel so C defines it first) --- + tile = ir.Operation.create( + "func.func", results=[], regions=1, + attributes={ + "function_type": ir.TypeAttr.get(ir.FunctionType.get([ctxty, i64ptr, i32], [])), + "sym_name": ir.StringAttr.get(ts.TILE_SYMBOL), + "sym_visibility": ir.StringAttr.get("private")}, + loc=loc, ip=ir.InsertionPoint(kernel)) + with loc: + tblk = tile.regions[0].blocks.append(ctxty, i64ptr, i32) + ctx2, iv2, _n2 = tblk.arguments + with ir.InsertionPoint(tblk): + tret = ir.Operation.create("func.return", results=[], operands=[], loc=loc) + + # in the tile fn: recover each parallel index = index_cast(iv[k]). + idx_vals = [] + with ir.InsertionPoint(tret): + for k in range(len(ivs)): + kc = ir.Operation.create("emitc.constant", results=[i64], + attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0] + elem = ir.Operation.create("emitc.subscript", results=[i64], + operands=[iv2, kc], loc=loc).results[0] + idx_vals.append(ir.Operation.create("arith.index_cast", results=[idxty], + operands=[elem], loc=loc).results[0]) + + # move the work-item body into the tile fn (terminators stay behind).
+ for op in [o for o in Lbody.operations + if o.operation.name not in ("affine.yield", "func.return")]: + op.operation.move_before(tret) + + # remap captures (Value `==` is identity): ctx -> ctx2, each parallel IV -> + # its index_cast, each external constant -> a clone inside the tile fn. A + # constant defined inside the tile fn (moved/read) is internal -> left alone. + caps = [(ctx_val, ctx2)] + list(zip(ivs, idx_vals)) + internal_consts = [] + def _collect_internal(block): + for op in block.operations: + c = _const_op(op.operation.results[0]) if len(op.operation.results) == 1 else None + if c is not None: + internal_consts.append(op.operation.results[0]) + for rg in op.operation.regions: + for b in rg.blocks: + _collect_internal(b) + _collect_internal(tblk) + const_clones = [] + ext_consts = [] + def _find_ext_consts(block): + for op in block.operations: + for opnd in op.operation.operands: + if _const_op(opnd) is None: + continue + if any(opnd == ic for ic in internal_consts): + continue + if any(opnd == e for e in ext_consts): + continue + ext_consts.append(opnd) + for rg in op.operation.regions: + for b in rg.blocks: + _find_ext_consts(b) + _find_ext_consts(tblk) + top = ir.InsertionPoint(tblk.operations[0]) + for e in ext_consts: + c = _const_op(e) + clone = ir.Operation.create(c.name, results=[e.type], + attributes={"value": c.attributes["value"]}, loc=loc, ip=top).results[0] + const_clones.append((e, clone)) + + allcaps = caps + const_clones + def _remap(block): + for op in block.operations: + for i in range(len(op.operation.operands)): + cur = op.operation.operands[i] + for orig, new in allcaps: + if cur == orig: + op.operation.operands[i] = new + break + for rg in op.operation.regions: + for b in rg.blocks: + _remap(b) + _remap(tblk) + + # --- the dispatcher: marshal the IVs and hand the tile fn to togsim_dispatch --- + term = [o for o in Lbody.operations + if o.operation.name in ("affine.yield", "func.return")][0] + fn_ref = _opaque(ctx, 
ts.TILE_SYMBOL) # function name -> verbatim pointer in C + with ir.InsertionPoint(term): + if ivs: + arrty = ir.Type.parse("!emitc.array<%dxi64>" % len(ivs), ctx) + arr = ir.Operation.create("emitc.variable", results=[arrty], + attributes={"value": _opaque(ctx, "")}, loc=loc).results[0] + for k, iv in enumerate(ivs): + kc = ir.Operation.create("emitc.constant", results=[i64], + attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0] + v64 = ir.Operation.create("arith.index_cast", results=[i64], + operands=[iv], loc=loc).results[0] + sub = ir.Operation.create("emitc.subscript", results=[i64], + operands=[arr, kc], loc=loc).results[0] + # emitc.assign operands are (lvalue dest, value). + ir.Operation.create("emitc.assign", results=[], operands=[sub, v64], loc=loc) + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val, arr], + attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE), + "args": ir.ArrayAttr.get( + [_idx(0), fn_ref, _idx(1), ir.IntegerAttr.get(i32, len(ivs))])}, + loc=loc) + else: + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val], + attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE), + "args": ir.ArrayAttr.get( + [_idx(0), fn_ref, _opaque(ctx, "nullptr"), ir.IntegerAttr.get(i32, 0)])}, + loc=loc) + + +def _rewrite_togsim_ops(ctx, kernel, ctx_val): + block = kernel.regions[0].blocks[0] + victims = [] + for op in walk_ops(block): + name = op.operation.name + ipo = ir.InsertionPoint(op) + if name == ts.DMA: + dims = attr_i64_array(op, ts.ATTR_DIMS) + # The DRAM element offset is the togsim.dma operand (the original + # affine index, kept live by build_skeleton); pass it as a call + # operand so convert-arith-to-emitc lowers the address arithmetic + # into the producer (P3 approach A). The runtime adds the tensor base. + # Operands carried by build_skeleton: [dram_index, tag_index] (each + # optional). 
Pass each as a call operand so convert-arith-to-emitc + # lowers it; reference it from `args` by its operand position. offset + # -> DRAM byte address (runtime adds the tensor base); tag_slot -> the + # SRAM tile slot (runtime uses it for double-buffer/SRAM-capacity). + ins = list(op.operation.operands) + dram_operand = ins[0] if len(ins) >= 1 else None + tag_operand = ins[1] if len(ins) >= 2 else None + operands = [ctx_val] + offset_arg = i64(0) + tag_arg = i64(0) + if dram_operand is not None: + operands.append(dram_operand) + offset_arg = _idx(len(operands) - 1) + if tag_operand is not None: + operands.append(tag_operand) + tag_arg = _idx(len(operands) - 1) + args = [_idx(0), + i32(attr_int(op, ts.ATTR_DIR)), + i32(attr_int(op, ts.ATTR_ARG_ID)), + offset_arg, + i32(len(dims)), + _arr(ctx, dims), + _arr(ctx, attr_i64_array(op, ts.ATTR_STRIDES)), + i32(attr_int(op, ts.ATTR_ELEM_BITS)), + i32(_attr_bool(op, ts.ATTR_IS_ASYNC)), + i32(attr_int(op, ts.ATTR_TAG_ID)), + tag_arg] + _rb = attr_i64_array(op, ts.ATTR_READ_BUFS) + _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS) + args += [_arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))] + # togsim_dma is void: the dma is paired with its barrier by the runtime + # (tag_id, tag_slot), not a returned handle. + ir.Operation.create( + "emitc.call_opaque", results=[], operands=operands, + attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.DMA]), + "args": ir.ArrayAttr.get(args)}, + loc=ir.Location.unknown(ctx), ip=ipo) + victims.append(op) + elif name == ts.MEMORY_BAR: + # explicit async-DMA sync (the original dma_wait) -> + # togsim_memory_barrier(ctx, tag_id, tag_slot, write_bufs). The tag + # index operand (if any) is the runtime tag slot. 
+ ins = list(op.operation.operands) + operands = [ctx_val] + tag_arg = i64(0) + if ins: + operands.append(ins[0]) + tag_arg = _idx(len(operands) - 1) + _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS) + ir.Operation.create( + "emitc.call_opaque", results=[], operands=operands, + attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.MEMORY_BAR]), + "args": ir.ArrayAttr.get( + [_idx(0), i32(attr_int(op, ts.ATTR_TAG_ID)), tag_arg, + _arr(ctx, _wb), i32(len(_wb))])}, + loc=ir.Location.unknown(ctx), ip=ipo) + victims.append(op) + elif name == ts.COMPUTE: + # skeleton compute carries no dims (cost is keyed by tile_id) -> 0/null. + _rb = attr_i64_array(op, ts.ATTR_READ_BUFS) + _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS) + _call(ctx, ctx_val, op, ts.EMITC_CALLEE[ts.COMPUTE], + [i64(attr_int(op, ts.ATTR_TILE_ID)), + i32(attr_int(op, ts.ATTR_COMPUTE_TYPE)), + i32(0), _opaque(ctx, "nullptr"), + _arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))]) + victims.append(op) + for op in victims: + op.operation.erase() + + +# --------------------------------------------------------------------------- +# step 3: post-conversion fixups +# --------------------------------------------------------------------------- +def _retype_for_to_size_t(module): + """Make every `emitc.for` use `!emitc.size_t` bounds + induction variable, + then drop the `index`<->`!emitc.size_t` `unrealized_conversion_cast` ops that + `convert-scf-to-emitc` / `convert-arith-to-emitc` leave behind (mlir-to-cpp + cannot print them; --reconcile cannot fold them). + + `emitc.for` accepts `size_t` bounds with the explicit type, and a `size_t` IV + makes the lowered address arithmetic (`convert-arith-to-emitc`, which works + in `size_t`) cast-free. 
So: set each IV to size_t, then for every + index<->size_t cast replace its result with its source (every consumer here + -- `emitc.for` bounds, `emitc.call_opaque` operands, `emitc` arith -- accepts + either, and after the IV retype each such cast bridges equal types).""" + idx = ir.IndexType.get() + st = ir.Type.parse("!emitc.size_t", module.context) + + for op in list(walk_ops(module.body)): + if op.operation.name == "emitc.for": + op.operation.regions[0].blocks[0].arguments[0].set_type(st) + + dead = [] + for op in list(walk_ops(module.body)): + if op.operation.name != "builtin.unrealized_conversion_cast": + continue + res = op.results[0] + src = list(op.operation.operands)[0] + # idx<->size_t bridges (incl. the size_t->size_t identities left after + # the IV retype): every consumer here accepts either, so fold to source. + if src.type in (idx, st) and res.type in (idx, st): + res.replace_all_uses_with(src) + dead.append(op) + for d in dead: + try: + d.operation.erase() + except Exception: + pass + + +def _add_extern_c(module, ctx): + for op in module.body.operations: + if (op.operation.name == "emitc.func" + and ir.StringAttr(op.operation.attributes["sym_name"]).value == ENTRY): + op.operation.attributes["specifiers"] = ir.ArrayAttr.get( + [ir.StringAttr.get('extern "C"')]) + return + raise ValueError("emitc.func @%s not found after conversion" % ENTRY) + + +# --------------------------------------------------------------------------- +# driver +# --------------------------------------------------------------------------- +def lower_to_emitc(skeleton_module): + """Lower a skeleton+API module (in place) to an EmitC module with the + `togsim_kernel` entry function. 
Returns the same module.""" + ctx = skeleton_module.context + kernel = _find_kernel(skeleton_module) + if kernel is None: + raise ValueError("no @kernel found in skeleton module") + + _strip_aux(skeleton_module) + ctx_val = _rewrite_signature(kernel, ctx) + _rewrite_togsim_ops(ctx, kernel, ctx_val) # togsim.* -> emitc.call_opaque + _outline_work_item(ctx, kernel, ctx_val) # work-item body -> togsim_kernel_tile + dispatch + + PassManager.parse(_PIPELINE, ctx).run(skeleton_module.operation) + + _retype_for_to_size_t(skeleton_module) + _add_extern_c(skeleton_module, ctx) + return skeleton_module + + +# --------------------------------------------------------------------------- +# C++ / .so backend +# --------------------------------------------------------------------------- +def _mlir_translate_bin(): + return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"), + "mlir-translate") + + +def emitc_to_cpp(emitc_module, mlir_translate=None): + """Render `emitc_module` to C++ source (prelude + mlir-to-cpp body).""" + mlir_translate = mlir_translate or _mlir_translate_bin() + proc = subprocess.run( + [mlir_translate, "--mlir-to-cpp"], + input=str(emitc_module), capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError("mlir-translate --mlir-to-cpp failed:\n" + proc.stderr) + return _PRELUDE + proc.stdout + + +def compile_so(cpp_text, so_path, include_dir, cxx=None): + """Compile producer C++ to `so_path`. `include_dir` must hold + togsim_runtime.h. 
togsim_* symbols are left undefined (resolved at dlopen).""" + cxx = cxx or os.environ.get("CXX", "g++") + cpp_path = os.path.splitext(so_path)[0] + ".cpp" + with open(cpp_path, "w") as fh: + fh.write(cpp_text) + proc = subprocess.run( + [cxx, "-shared", "-fPIC", "-std=gnu++17", "-O2", + "-I", include_dir, cpp_path, "-o", so_path], + capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError("%s failed:\n%s" % (cxx, proc.stderr)) + return so_path + + +def _default_include_dir(): + root = os.environ.get("TORCHSIM_DIR") + if not root: + root = os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.abspath(__file__))))) + return os.path.join(root, "TOGSim", "include") + + +def skeleton_to_so(skeleton_module, so_path, include_dir=None): + """skeleton module -> EmitC -> C++ -> compiled trace `.so`. Returns the + EmitC module text (for inspection / caching).""" + emitc = lower_to_emitc(skeleton_module) + cpp = emitc_to_cpp(emitc) + compile_so(cpp, so_path, include_dir or _default_include_dir()) + return str(emitc) + + +def build_trace_so(postvcix_path, so_path, include_dir=None): + """Full P2 path from a post-vcix kernel .mlir to a trace `.so`.""" + from . 
import build_skeleton as bs + + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(postvcix_path).read(), ctx) + bs.build_skeleton(module) + return skeleton_to_so(module, so_path, include_dir) + + +def main(argv): + import argparse + + parser = argparse.ArgumentParser(prog="lower_to_emitc.py") + parser.add_argument("input", help="post-vcix kernel .mlir") + parser.add_argument("--so", required=True, help="output .so path") + parser.add_argument("--include-dir", default=None, + help="dir holding togsim_runtime.h (default: TOGSim/include)") + parser.add_argument("--emit-cpp", default=None, + help="also write the generated C++ here") + parser.add_argument("--emit-mlir", default=None, + help="also write the EmitC MLIR here") + args = parser.parse_args(argv[1:]) + + from . import build_skeleton as bs + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(args.input).read(), ctx) + bs.build_skeleton(module) + emitc = lower_to_emitc(module) + if args.emit_mlir: + open(args.emit_mlir, "w").write(str(emitc)) + cpp = emitc_to_cpp(emitc) + if args.emit_cpp: + open(args.emit_cpp, "w").write(cpp) + compile_so(cpp, args.so, args.include_dir or _default_include_dir()) + import sys + sys.stderr.write("wrote %s\n" % args.so) + return 0 + + +if __name__ == "__main__": + import sys + sys.exit(main(sys.argv)) diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py index ac93ebc8..df124d00 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py +++ b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py @@ -29,6 +29,8 @@ import mlir.ir as ir # noqa: E402 +from ._mlir_util import walk_ops, i32, i64, attr_bool + MARKERS = ("linalg.matmul", "math.exp", "math.erf", "math.tanh", "math.sin", "math.cos") # math op name -> (opcode, imm) for the vcix.v.iv lowering (mirror Math*ToVCIX). 
@@ -80,20 +82,12 @@ def _legalize_vector_type(vt, vlen): return n, ir.VectorType.get([elt_count >> (n - 1)], elt_ty, scalable=[True]) -def _i64(v): - return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), v) - - -def _i32(v): - return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), v) - - def _viv(operand, result_ty, opcode, imm, rvl=None): """Create an unregistered vcix.v.iv (vcix::BinaryImmOp) op at the current IP.""" operands = [operand] if rvl is None else [operand, rvl] return ir.Operation.create( "vcix.v.iv", results=[result_ty], operands=operands, - attributes={"opcode": _i64(opcode), "imm": _i32(imm)}).results[0] + attributes={"opcode": i64(opcode), "imm": i32(imm)}).results[0] def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): @@ -104,7 +98,7 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): scalable = legal_ty.scalable rvl = None if scalable: - rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), _i64(9)).result + rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), i64(9)).result if n == 1: return _viv(vec, legal_ty, opcode, imm, rvl) elt_ty = legal_ty.element_type @@ -119,24 +113,16 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): for i in range(total // elt_count): ext = vector.ExtractStridedSliceOp( legal_ty, vec, - ir.ArrayAttr.get([_i64(i * elt_count)]), - ir.ArrayAttr.get([_i64(elt_count)]), - ir.ArrayAttr.get([_i64(1)])).result + ir.ArrayAttr.get([i64(i * elt_count)]), + ir.ArrayAttr.get([i64(elt_count)]), + ir.ArrayAttr.get([i64(1)])).result v = _viv(ext, legal_ty, opcode, imm, rvl) res = vector.InsertStridedSliceOp( - v, res, ir.ArrayAttr.get([_i64(i * elt_count)]), - ir.ArrayAttr.get([_i64(1)])).result + v, res, ir.ArrayAttr.get([i64(i * elt_count)]), + ir.ArrayAttr.get([i64(1)])).result return res -def _iter_ops(block): - for op in list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) - - # 
--------------------------------------------------------------------------- # matmul lowering helpers (mirror MatmulOpLowering) # --------------------------------------------------------------------------- @@ -146,11 +132,6 @@ def _elt_bits(elt_ty): return ir.FloatType(elt_ty).width -def _bool_attr_true(op, key): - a = op.attributes - return key in a and ir.BoolAttr(a[key]).value - - def _enclosing_loops(op): """Walk ancestor ops; return (accumulation, outer, inner) affine.for lists, outermost-first (mirror the C++ insert-at-begin).""" @@ -158,11 +139,11 @@ def _enclosing_loops(op): parent = op.operation.parent while parent is not None: if parent.name == "affine.for": - if _bool_attr_true(parent, "accumulation_loop"): + if attr_bool(parent, "accumulation_loop"): acc.insert(0, parent) - if _bool_attr_true(parent, "outer_loop"): + if attr_bool(parent, "outer_loop"): outer.insert(0, parent) - if _bool_attr_true(parent, "inner_loop"): + if attr_bool(parent, "inner_loop"): inner.insert(0, parent) parent = parent.parent return acc, outer, inner @@ -200,7 +181,7 @@ def _scan_conv_offsets(ow_loop, o_h, k_h, o_w, k_w): """Mirror the heuristic offset scan: find affine.apply(o_h,k_h)/(o_w,k_w) in the o_w loop and read the constant in its map (default 1).""" offset_h = offset_w = 1 - for o in _iter_ops(ow_loop.regions[0].blocks[0]): + for o in walk_ops(ow_loop.regions[0].blocks[0]): if o.operation.name != "affine.apply": continue ops = list(o.operation.operands) @@ -391,7 +372,7 @@ def _root(v): return owner.operands[0] return v rootA, rootB = _root(A), _root(B) - for o in _iter_ops(outer[-1].regions[0].blocks[0]): + for o in walk_ops(outer[-1].regions[0].blocks[0]): if o.operation.name == "affine.vector_store": dest = _root(o.operation.operands[1]) if dest == rootA: @@ -488,6 +469,14 @@ def _root(v): # --- B dma_wait --- nacc = len(acc) acc_ivs = [_loop_iv(l) for l in acc] + # LEGACY behavior: coefficient -1 on each accumulation (reduction) loop var + # is a SENTINEL marking 
"this tag dim is the reduction axis", not an + # arithmetic offset. The legacy TOG path (TileGraphParser.cc) honors it by + # routing those vars to a separate accum tag component and skipping stride + # -1. The C++ trace path does NOT honor it: build_skeleton._strip_accum_terms + # drops these -1 terms so the memory_barrier slot stays subtile-only and + # pairs with its async load. Kept here for byte-identity with the C++ + # -test-pytorchsim-to-vcix pass; remove (do not flag) once legacy retires. bexpr = ir.AffineDimExpr.get(0) * -1 for i in range(1, nacc): bexpr = bexpr + ir.AffineDimExpr.get(i) * -1 @@ -544,6 +533,10 @@ def _root(v): with body_ip: # --- A dma_wait --- + # LEGACY behavior (see the B dma_wait above): the -1 coefficients mark the + # reduction axis for the legacy TOG path; the trace path strips them in + # build_skeleton._strip_accum_terms. Kept for byte-identity with the C++ + # -test-pytorchsim-to-vcix pass; remove once legacy retires. aexpr = ir.AffineDimExpr.get(0) * -1 for i in range(1, nacc): aexpr = aexpr + ir.AffineDimExpr.get(i) * -1 @@ -617,7 +610,7 @@ def run(module, vectorlane=128, vlen=128, **_): mms = [] for region in module.operation.regions: for b in region.blocks: - for o in _iter_ops(b): + for o in walk_ops(b): if o.operation.name == "linalg.matmul": mms.append(o.operation) for o in mms: @@ -625,7 +618,7 @@ def run(module, vectorlane=128, vlen=128, **_): targets = [] for region in module.operation.regions: for b in region.blocks: - for op in _iter_ops(b): + for op in walk_ops(b): if op.operation.name in _MATH_VIV: targets.append(op.operation) for op in targets: diff --git a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py index 76e30cb3..3ed0a394 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py +++ b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py @@ -24,13 +24,7 @@ OP_NAME = "torchsim.vlane_idx" MARKERS = (OP_NAME,) - -def _iter_ops(block): - for op in 
list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) +from ._mlir_util import walk_ops def run(module, **_): @@ -46,7 +40,7 @@ def run(module, **_): targets = [] for region in module.operation.regions: for b in region.blocks: - for op in _iter_ops(b): + for op in walk_ops(b): if op.operation.name == OP_NAME: targets.append(op.operation) diff --git a/PyTorchSimFrontend/mlir/passes/togsim_ops.py b/PyTorchSimFrontend/mlir/passes/togsim_ops.py new file mode 100644 index 00000000..21983da0 --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/togsim_ops.py @@ -0,0 +1,102 @@ +"""Shared vocabulary for the skeleton+API MLIR form (C1). + +The trace pipeline (docs/design/togsim_cpp_trace.md) reduces a kernel's MLIR to +a *loop skeleton + API calls*: native `affine.for`/`scf.for` loops (bounds kept +as-is, symbolic preserved) plus a handful of `togsim.*` ops that stand for the +runtime API. This module is the single source of truth for those op names and +attribute keys, shared by: + + * build_skeleton (C2) -- produces the skeleton+API MLIR, and + * togsim->emitc lowering (C4) -- rewrites each op to an `emitc.call_opaque`. + +The ops are kept *unregistered* (like the existing `togsim.transfer`), so there +is no C++ dialect to register; C4 is a custom rewrite, not a registered +ConversionPass. + +Grammar (each op lowers 1:1 to a `togsim_runtime.h` free function): + + "togsim.dma"(%dram_idx, %tag_idx) { -> togsim_dma(ctx, dir, arg_id, + dir = 0 | 1, # LOAD|STORE offset, ndim, dims, strides, + dims = [..], strides = [..], elem_bits, is_async, + elem_bits = i32, is_async = bool, tag_id, tag_slot, + tag_id = i32, arg_id = i32, read_bufs, write_bufs) + read_bufs = [..], write_bufs = [..] + } : (index, index) -> () + + "togsim.compute"() { -> togsim_compute(ctx, tile_id, + tile_id = i64, compute_type = i32, compute_type, ndim, dims, + read_bufs = [..], write_bufs = [..] 
read_bufs, write_bufs) + } : () -> () + + "togsim.memory_barrier"(%tag_idx) { -> togsim_memory_barrier(ctx, + tag_id = i32, write_bufs = [..] tag_id, tag_slot, write_bufs) + } : (index) -> () + +How an async dma pairs with its sync point: NOT by a compile-time id. One static +`togsim.dma` op runs once per loop iteration, each with a different RUNTIME tag +slot `%tag[%idx]`, so the pairing must be a runtime key. `togsim.dma` carries a +`tag_id` (its tag memref identity) and the runtime `%tag[%idx]` operand; the +original `memref.dma_wait` becomes an explicit `togsim.memory_barrier` carrying +the same `tag_id` + tag index. They pair at runtime by `(tag_id, tag_slot)` via +the Core's tag table (the dma signals the tag at data-arrival; the barrier waits +it). `tag_id` (which tag memref) is distinct from `tag_slot` (the SRAM tile slot, +used for the double-buffer / capacity model). A sync (non-async) dma is blocking, +so it needs no barrier. (Supersedes the earlier static `event_id` + `togsim.wait` +design, which could not express per-iteration pairing.) + +Keep this in lockstep with TOGSim/include/togsim_runtime.h (TOGSIM_ABI_VERSION). +""" + +# ---- op names ------------------------------------------------------------- +DMA = "togsim.dma" +COMPUTE = "togsim.compute" +MEMORY_BAR = "togsim.memory_barrier" # explicit async-DMA sync (the original dma_wait); tag-keyed + +#: every op this module owns (for matchers / DCE roots in C2). +OP_NAMES = (DMA, COMPUTE, MEMORY_BAR) + +#: op name -> the togsim_runtime.h symbol C4 lowers it to. +EMITC_CALLEE = { + DMA: "togsim_dma", + COMPUTE: "togsim_compute", + MEMORY_BAR: "togsim_memory_barrier", +} + +#: producer entry-point symbol the TOGSim loader resolves (see togsim_runtime.h). +ENTRY_SYMBOL = "togsim_kernel" + +#: outlined per-work-item function the dispatcher hands to togsim_dispatch +#: (uniform signature (ctx, int64* iv, i32 n); see togsim_cpp_trace.md sec 9.3). 
+TILE_SYMBOL = "togsim_kernel_tile" + +#: runtime callees emitted directly by lower_to_emitc (not skeleton ops), kept in +#: lockstep with togsim_runtime.h. DISPATCH_CALLEE is the higher-order wrapper the +#: dispatcher loop calls per work-item (round-robins a core + TILE_BEGIN/END); +#: TILE_SYMBOL is passed to it as the function pointer. +DISPATCH_CALLEE = "togsim_dispatch" + +# ---- attribute keys ------------------------------------------------------- +ATTR_DIR = "dir" # i32: DIR_LOAD | DIR_STORE +ATTR_DIMS = "dims" # i64 array: tile extents +ATTR_STRIDES = "strides" # i64 array: tile strides +ATTR_ELEM_BITS = "elem_bits" # i32 +ATTR_IS_ASYNC = "is_async" # bool +ATTR_TILE_ID = "tile_id" # i64: key into the precomputed tile_id->cycle table +ATTR_COMPUTE_TYPE = "compute_type" # i32: 0 vector / 1 matmul / 2 preload (Core enum) +ATTR_READ_BUFS = "read_bufs" # i64 array: SRAM buffer ids this op reads (sec 10 dataflow) +ATTR_WRITE_BUFS = "write_bufs" # i64 array: SRAM buffer ids this op writes (sec 10 dataflow) +ATTR_TAG_ID = "tag_id" # i32: identity of the DMA's tag memref; pairs an async dma with + # its memory_barrier by the RUNTIME tag slot (tag_id + tag index) +ATTR_ARG_ID = "arg_id" # i32: which tensor (func arg) this DMA's base is + +# Must match togsim_dma_dir in togsim_runtime.h. +DIR_LOAD = 0 +DIR_STORE = 1 + + +def is_togsim_op(op): + """True if `op` (an Operation or a wrapping view) is one of ours.""" + name = getattr(op, "name", None) + if name is None: + name = getattr(getattr(op, "operation", None), "name", None) + return name in OP_NAMES diff --git a/README.md b/README.md index f0bdc772..c2298376 100644 --- a/README.md +++ b/README.md @@ -385,6 +385,7 @@ num_cores: 1 core_freq_mhz: 940 core_stats_print_period_cycles: 10000 num_systolic_array_per_core: 2 +sa_weight_buffer_depth: 2 # per-SA resident weight slots; must be > 0 (default 2). Raise to loosen the preload throttle. 
# Optional: one entry per core, default ws_mesh # core_type: [ws_mesh, ws_mesh] # Optional STONNE cores: stonne_config_path, num_stonne_per_core, num_stonne_port @@ -453,7 +454,7 @@ codegen_compiler_optimization: all # all | none | list of option names One-line meaning for each group (details in the YAML block above). -- **Core (`num_cores`, `core_freq_mhz`, `core_stats_print_period_cycles`, `num_systolic_array_per_core`, optional `core_type`, STONNE keys)**: how many cores, their clock, stats cadence, systolic count per core, and optional non-default mesh vs STONNE mix. +- **Core (`num_cores`, `core_freq_mhz`, `core_stats_print_period_cycles`, `num_systolic_array_per_core`, `sa_weight_buffer_depth`, optional `core_type`, STONNE keys)**: how many cores, their clock, stats cadence, systolic count per core, the per-SA resident weight-slot count (must be > 0; bounds preload run-ahead—raise it to loosen the throttle), and optional non-default mesh vs STONNE mix. - **VPU (`vpu_*`)**: vector lane count, per-lane scratchpad (KB), and vector register width—**compiler** uses these for tiling/codegen. - **DRAM (`dram_type`, `dram_channels`, …)**: `ramulator2` uses `ramulator_config_path`; `simple` uses fixed latency and optional bandwidth caps (`dram_bandwidth_gbps_*`, `dram_freq_mhz` when capped). `dram_num_partitions` splits channels for NUMA-style addressing. - **Interconnect (`icnt_*`, `booksim_config_path`)**: `simple` adds fixed hop latency (`icnt_latency_cycles`); `booksim2` points at a BookSim2 topology file. diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 2b9f05be..a4517285 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -560,7 +560,23 @@ def run_standalone( os.fsync(trace_file.fileno()) try: - cmd = f"{TOGSimulator.get_togsim_command(config_path, togsim_path)} --models_list {trace_file_path}" + # The C++ TOG (trace) path is the DEFAULT: drive the simulation from the + # emitted trace.so. 
The legacy ONNX TOG is the opt-in fallback via + # TORCHSIM_LEGACY_TOG=1. Each autotune candidate compiles to its own + # write_path (keyed by its retiled source), so its trace.so/cycle_table sit + # next to its tile_graph.onnx -- benchmark it through the trace path too. + # Fall back to legacy only if the .so was not emitted. + trace_so = os.path.join(os.path.dirname(str(model_path)), "trace.so") + cycle_tsv = os.path.join(os.path.dirname(str(model_path)), "trace_cycles.tsv") + base_cmd = TOGSimulator.get_togsim_command(config_path, togsim_path) + use_trace = (os.environ.get("TORCHSIM_LEGACY_TOG") != "1" + and os.path.exists(trace_so)) + if os.environ.get("TORCHSIM_LEGACY_TOG") == "1": + logger.warning("TORCHSIM_LEGACY_TOG=1 selects the DEPRECATED legacy ONNX TOG path") + if use_trace: + cmd = f"{base_cmd} --trace_so {trace_so} --cycle_table {cycle_tsv}" + else: # DEPRECATED: legacy ONNX TOG path + cmd = f"{base_cmd} --models_list {trace_file_path}" if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" diff --git a/TOGSim/include/Core.h b/TOGSim/include/Core.h index 286feb5f..75ad9cf4 100644 --- a/TOGSim/include/Core.h +++ b/TOGSim/include/Core.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -19,11 +20,22 @@ enum class InstFinishTraceTag { DmaRespComplete, }; +// A timed effect due at a cycle: free a weight slot, or wake a MEMORY_BAR. +struct DueAction { + enum Kind { FreeWeightSlot, WakeBar } kind; + std::shared_ptr token; + std::shared_ptr bar; +}; + class Core { public: Core(uint32_t id, SimulationConfig config); ~Core()=default; virtual bool running(); + // True if this core has work actively in flight (DMA / compute pipeline / queues) + // that will produce a future finish event -- i.e. running() minus "tiles waiting". + // Used by the frozen-state (spad-too-small) guard. 
+ bool has_inflight(); virtual bool can_issue(const std::shared_ptr& op); virtual void issue(std::shared_ptr tile); virtual std::shared_ptr pop_finished_tile(); @@ -55,6 +67,17 @@ class Core { void sa_cycle(); bool can_issue_compute(std::shared_ptr& inst); void update_stats(); + // SRAM-capacity throttle (sec 10.x): a consumer frees the buffer-versions it + // read (refcount -> 0 releases the spad bytes). Called when COMP/MOVOUT issue. + void release_sram(const std::shared_ptr& inst); + // Occupy inst's buffer-version footprint on issue; false if it would overflow + // the spad this cycle (the caller stalls it). True for untracked insts. + bool try_occupy_sram(const std::shared_ptr& inst); + // SA weight-buffer throttle (sec 10.x): pick a systolic array that has a free + // weight slot (round-robin among free); -1 if all full -> the preload stalls. + int pick_free_weight_sa(); + void process_due_events(); // drain _due_events due this cycle + void apply_due(const DueAction& a); /* Core id & config file */ const uint32_t _id; @@ -103,4 +126,18 @@ class Core { std::queue _request_queue; std::queue _response_queue; uint32_t _waiting_write_reqs; + + // SRAM-capacity throttle (sec 10.x). _sram_used = current per-core spad bytes; + // _sram_capacity = limit (0 = disabled); _sram_allocs maps a buffer-version id + // to its accumulated footprint bytes (freed when its last reader issues). + size_t _sram_used = 0; + size_t _sram_capacity = 0; + std::unordered_map _sram_allocs; + + // SA weight-buffer throttle (sec 10.x). _weight_slots_used[s] = weights resident + // on SA s (loaded by a preload, not yet freed by their last matmul); + // _weight_slot_depth = per-SA weight-slot capacity (must be > 0). 
+ std::vector _weight_slots_used; + uint32_t _weight_slot_depth = 0; + std::multimap _due_events; }; \ No newline at end of file diff --git a/TOGSim/include/Instruction.h b/TOGSim/include/Instruction.h index bb62a440..24659791 100644 --- a/TOGSim/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -6,13 +6,25 @@ #include #include +#include #include #include #include #include #include -enum class Opcode { MOVIN, MOVOUT, COMP, BAR, COUNT}; +// MEMORY_BAR: the DMA/memory barrier (waits a DMA tag in the tag table). +enum class Opcode { MOVIN, MOVOUT, COMP, MEMORY_BAR, COUNT}; + +// A dependency edge releases its consumer on one of the producer's lifecycle +// events: ISSUE (occupancy -- the consumer overlaps the producer on the SA +// pipeline) or DONE (latency -- the consumer needs the producer's result). +enum class DepEvent : uint8_t { ISSUE = 0, DONE = 1, COUNT = 2 }; + +// One weight slot on systolic array `sa` (sec 10.x). A preload sets refcount = +// the matmuls reusing the weight; each frees it at its streaming-end, the last +// one releases the slot. Shared (shared_ptr) by the preload's matmul consumers. +struct WeightToken { int sa; int refcount; }; typedef uint64_t addr_type; typedef uint64_t cycle_type; @@ -28,7 +40,28 @@ class Instruction : public std::enable_shared_from_this { std::vector accum_tag_idx_list); Instruction(Opcode opcode); void finish_instruction(); - void add_child(std::shared_ptr child); + // Subscribe `c` to this op's `on` event (ISSUE=occupancy, DONE=latency). The set + // dedups, so ready_counter is bumped only on a new edge (a producer writing + // several buffers one consumer reads links the pair once per buffer). + void add_dep(std::shared_ptr c, DepEvent on) { + if (_deps[static_cast(on)].insert(c).second) c->inc_ready_counter(); + } + // Release every subscriber of `e` (decrement its ready_counter) and clear. 
+ void fire(DepEvent e) { + for (auto& c : _deps[static_cast(e)]) c->dec_ready_counter(); + _deps[static_cast(e)].clear(); + } + const std::set>& get_deps(DepEvent e) { + return _deps[static_cast(e)]; + } + void set_assigned_sa(int s) { _assigned_sa = s; } + int get_assigned_sa() const { return _assigned_sa; } + void set_weight_token(const std::shared_ptr& t) { _weight_token = t; } + const std::shared_ptr& get_weight_token() const { return _weight_token; } + // Trace-only: which work-item (togsim_dispatch tile) this op belongs to, for + // grouping/coloring in the timeline. Set by the bridge per TILE_BEGIN. + void set_tile_group(int g) { _tile_group = g; } + int get_tile_group() const { return _tile_group; } bool check_ready() { return ready_counter == 0; } const Opcode get_opcode() { return opcode; } bool is_dma_read() { return opcode == Opcode::MOVIN; } @@ -51,6 +84,9 @@ class Instruction : public std::enable_shared_from_this { void inc_waiting_request(); void dec_waiting_request(); size_t get_waiting_request() { return _nr_waiting_request; } + // trace: log only the FIRST DRAM response of a load (when data starts arriving). + bool got_first_response() const { return _got_first_response; } + void mark_first_response() { _got_first_response = true; } std::vector& get_tile_size() { return tile_size; } std::vector& get_tile_stride() { return tile_stride; } void set_overlapping_cycle(cycle_type cycle) { overlapping_cycle = cycle; } @@ -83,15 +119,33 @@ class Instruction : public std::enable_shared_from_this { void prepare_tag_key(); bool is_sparse_inst() { return _is_sparse_inst; } void set_sparse_state(bool state) { _is_sparse_inst = state; } - std::set>& get_child_inst() { return child_inst; } uint64_t get_global_inst_id() const { return _global_inst_id; } - cycle_type start_cycle; - cycle_type finish_cycle; + // SRAM-capacity model (sec 10.x). 
A load contributes its footprint to a + // buffer-version allocation; the version is freed when its LAST consumer (the + // program-order-last reader, tagged by the bridge) issues. The bridge fills + // these; Core enforces them. + // _sram_alloc_id : which buffer-version this load fills (-1 = untracked) + // _sram_release_allocs: versions this consumer frees on issue (tagged only on + // each version's last reader) + void set_sram_alloc(int64_t id) { _sram_alloc_id = id; } + int64_t get_sram_alloc() const { return _sram_alloc_id; } + void add_sram_release(int64_t id) { _sram_release_allocs.push_back(id); } + const std::vector& get_sram_release() const { return _sram_release_allocs; } + // bytes this instruction's buffer occupies in the spad. A DMA derives it from + // the tile it moves; a compute output gets it set explicitly by the bridge (the + // buffer's size is known from the DMA records that touch the same buffer). + void set_sram_footprint(size_t b) { _sram_footprint_override = b; } + size_t sram_footprint() const { + return _sram_footprint_override ? _sram_footprint_override + : _tile_numel * (_elem_bits / 8); + } + + cycle_type finish_cycle = 0; cycle_type bubble_cycle=0; bool finished=false; - int subgraph_id; + int subgraph_id = 0; private: uint64_t _global_inst_id = 0; static uint64_t _next_global_inst_id; @@ -99,16 +153,22 @@ class Instruction : public std::enable_shared_from_this { void *_owner = nullptr; std::list>* _owner_ready_queue_ref = nullptr; Opcode opcode; - cycle_type compute_cycle; - cycle_type overlapping_cycle; - size_t ready_counter; - std::set> child_inst; + cycle_type compute_cycle = 0; + cycle_type overlapping_cycle = 0; + size_t ready_counter = 0; // parents not yet finished; the minimal Instruction(Opcode) + // ctor (barriers) relies on this default + inc_ready_counter + // Per-event subscriber sets: _deps[ISSUE] released at issue (occupancy), + // _deps[DONE] released at finish (latency). 
std::set dedups + keeps a stable + // iteration order (byte-identical release order). + std::array>, + static_cast(DepEvent::COUNT)> _deps; std::vector tile_size; std::vector tile_stride; - size_t _tile_numel; + size_t _tile_numel = 0; size_t _nr_waiting_request=0; + bool _got_first_response=false; size_t _elem_bits = 0; - addr_type dram_addr; + addr_type dram_addr = 0; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; std::vector _tag_idx_list; @@ -123,4 +183,12 @@ class Instruction : public std::enable_shared_from_this { bool _is_indirect_mode=false; bool _is_sparse_inst=false; std::string _indirect_index_path=""; + // SRAM-capacity model (see the setters above). + int64_t _sram_alloc_id = -1; + std::vector _sram_release_allocs; + size_t _sram_footprint_override = 0; + // SA weight-buffer model (see the setters above). + int _assigned_sa = -1; + std::shared_ptr _weight_token; + int _tile_group = -1; // trace-only work-item id (see set_tile_group) }; \ No newline at end of file diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h index 2ef08618..7785ff7a 100644 --- a/TOGSim/include/SimulationConfig.h +++ b/TOGSim/include/SimulationConfig.h @@ -27,6 +27,16 @@ struct SimulationConfig { uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; uint32_t num_stonne_port = 1; + // Per-core VMEM/spad capacity (KB) for the trace-path DMA throttle (sec 10.x): + // a load that would overflow the spad does not issue until a consumer frees a + // tile. Provided by the config (the TPU configs set 16384 = 16 MB VMEM). 0 = + // unset -> gate disabled (unlimited). Only affects trace-path instructions + // (legacy TileGraphParser insts have alloc id -1 -> never gated). + uint32_t core_spad_size_kb = 0; + // SA weight-buffer depth (sec 10.x): weight tiles a systolic array holds; a + // preload stalls until a slot frees (its matmuls finished). 2 = weight + // double-buffer (convention default, tunable). Must be > 0 (Core exits on 0). 
+ uint32_t sa_weight_buffer_depth = 2; /* DRAM config */ DramType dram_type; diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h index e3542d51..91baf5b5 100644 --- a/TOGSim/include/Simulator.h +++ b/TOGSim/include/Simulator.h @@ -48,6 +48,9 @@ class Simulator { void dram_cycle(); void icnt_cycle(); bool running(); + // Spad-too-small guard: if the sim stays frozen (running() but nothing in + // flight) past kWedgeThreshold cycles, error out and exit. Called each cycle. + void check_frozen(); void set_cycle_mask(); uint32_t get_dest_node(mem_fetch *access); SimulationConfig _config; diff --git a/TOGSim/include/TraceLogTags.h b/TOGSim/include/TraceLogTags.h index 6c158099..759a4fdb 100644 --- a/TOGSim/include/TraceLogTags.h +++ b/TOGSim/include/TraceLogTags.h @@ -24,6 +24,7 @@ inline constexpr const char* kInstructionFinished = "INST_FINISHED"; inline constexpr const char* kInstructionSkipped = "INST_SKIP"; inline constexpr const char* kAsyncDmaAllRequestsIssued = "ASYNC_DMA_ISSUE"; +inline constexpr const char* kFirstDramResponse = "DRAM_RESP_FIRST"; inline constexpr const char* kAllDramResponsesReceived = "DRAM_RESP_DONE"; inline constexpr const char* kL2CacheableStatusForAddress = "L2CACHE_STAT"; diff --git a/TOGSim/include/togsim_loader.h b/TOGSim/include/togsim_loader.h new file mode 100644 index 00000000..17e10b34 --- /dev/null +++ b/TOGSim/include/togsim_loader.h @@ -0,0 +1,79 @@ +#pragma once +// togsim_loader.h +// ----------------------------------------------------------------------------- +// TOGSim-side loader for the compiled trace producer (C6, P3 task 5). NOT part +// of the producer ABI (togsim_runtime.h) -- this is the TOGSim half that +// `dlopen`s a producer `.so`, runs its `togsim_kernel`, and records the emitted +// instruction stream. See docs/design/togsim_cpp_trace.md sec 5.3 / 9.7. 
+// +// This first cut is the "materializing sink": the callbacks resolve each tile's +// DRAM address (base[arg_id] + offset*elem_bytes) and per-tile compute cost +// (the cycle table), mint event handles, and append a TraceRec per modeled +// instruction. Feeding the recorded stream into the existing timing core +// (Core/Simulator) for cycle-equivalence vs the build_tog path is the remaining +// task-5 step. +// ----------------------------------------------------------------------------- + +#include +#include + +#include "togsim_runtime.h" + +namespace togsim { + +// One modeled instruction recorded by the runtime callbacks. +struct TraceRec { + enum Kind { TILE_BEGIN, TILE_END, DMA, COMPUTE, MEMORY_BAR } kind; + int32_t core; // work-item -> core binding (set by togsim_dispatch) + // DMA / MEMORY_BAR + int32_t dir; // togsim_dma_dir + int32_t arg_id; // tensor + int32_t elem_bits; + int32_t is_async; + uint64_t addr; // resolved DRAM byte address = base[arg_id] + off*bytes + int32_t tag_id; // DMA/MEMORY_BAR: tag memref identity; with tag_slot the + // runtime pairing key (an async dma <-> its memory_barrier) + uint64_t tag_slot; // SRAM tile slot (double-buffer / capacity model) + std::vector dims; // tile extents (DMA) + std::vector strides; // tile strides (DMA) + std::vector read_bufs; // SRAM buffer ids read (sec 10 dataflow DAG) + std::vector write_bufs; // SRAM buffer ids written (MEMORY_BAR: released bufs) + // COMPUTE + uint64_t tile_id; + int32_t compute_type; // 0 vector / 1 matmul / 2 preload (Core unit enum) + int64_t cycle; // looked up from the cycle table + int64_t overlapping; // looked up from the cycle table +}; + +struct RunResult { + bool ok = false; + std::vector trace; +}; + +// Load `so_path`, run its `togsim_kernel(shape_args, n_shape)` against a freshly +// built EmitCtx, and return the recorded trace. 
+// tensor_base[arg_id] : DRAM base address of each kernel tensor argument +// cyc[tile_id] / ovl[tile_id] : the cycle table (cycle, overlapping_cycle) +// partition_cores : the core ids of the partition this kernel is enqueued to; +// dispatch round-robins work-items only over THESE cores (a +// kernel stays within its partition -- other partitions are +// independent). Empty/null -> core 0. +RunResult run_producer(const char* so_path, + const int64_t* shape_args, int32_t n_shape, + const uint64_t* tensor_base, int32_t n_tensors, + const int64_t* cyc, const int64_t* ovl, int32_t n_tiles, + const int32_t* partition_cores, int32_t n_partition_cores); + +// First-order reference timing over a recorded trace, to validate that the +// stream carries enough to be scheduled (it is NOT the production Core -- no +// DRAM/NoC/L2 contention; the real cycle-equivalence path feeds Tile/TileGraph +// into Core). Models, per core: a DMA-engine timeline (DMAs serialize, overlap +// compute), a compute timeline (serial = reduction accumulate, with the +// finish = prev.finish + cycle - overlapped pipeline overlap of Core.cc), and +// data dependencies (a compute waits the dmas whose handles its preceding +// togsim_wait()s named). +struct TimingParams { uint64_t dma_latency = 100; }; +struct SimResult { uint64_t total_cycle = 0; int n_compute = 0, n_dma = 0; }; +SimResult simulate(const RunResult& run, const TimingParams& params); + +} // namespace togsim diff --git a/TOGSim/include/togsim_runtime.h b/TOGSim/include/togsim_runtime.h new file mode 100644 index 00000000..d87c61d5 --- /dev/null +++ b/TOGSim/include/togsim_runtime.h @@ -0,0 +1,172 @@ +#pragma once +// togsim_runtime.h +// ----------------------------------------------------------------------------- +// Shared C ABI between a compiled, shape-parametric trace producer (`.so`, +// generated MLIR -> EmitC -> C++) and TOGSim. See docs/design/togsim_cpp_trace.md. 
+// +// The producer keeps loops as native loops (symbolic bounds become function +// parameters) and calls the functions below; each call emits one trace record = +// one modeled instruction. TOGSim `dlopen`s the producer, constructs an +// `EmitCtx`, calls the entry point, records the emitted stream, and feeds it to +// the existing timing core. The producer carries NO timing model and NO +// functional compute -- it is a deterministic trace generator only. +// +// ABI shape rationale: `mlir-translate --mlir-to-cpp` lowers our `togsim.*` ops +// (via `emitc.call_opaque`) to *free function* calls, so the contract is a set +// of `extern "C"` free functions taking an opaque `EmitCtx*` as the first +// argument. Implementations live in TOGSim and may dispatch internally; the +// `EmitCtx` is opaque to the producer. `togsim_abi_version()` guards against a +// producer `.so` built against a stale header. +// +// STATUS: firmed up in P2, revised through v12 (changelog below). The signatures match what the C4 +// togsim->emitc lowering (PyTorchSimFrontend/mlir/passes/lower_to_emitc.py) +// emits as `emitc.call_opaque` targets and what `mlir-translate --mlir-to-cpp` +// renders. Synchronization is keyed by the RUNTIME tag slot: an async +// togsim_dma carries (tag_id, tag_slot) and its togsim_memory_barrier passes +// the same pair (ABI v11; supersedes the static event_id pairing). The runtime +// forms each tile's DRAM address as base[arg_id] + offset*elem_bytes (ABI v4). +// ----------------------------------------------------------------------------- + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Bump whenever the signatures below change incompatibly. TOGSim refuses to load +// a producer whose embedded version (a `togsim_producer_abi_version` symbol, or +// a value passed at the entry point) does not match. +// v1 -> v2 (P2): dma takes an event_id and returns void (was: returns a +// handle); togsim_kernel shape_args is non-const to match the +// emitc/mlir-to-cpp output. 
+// v2 -> v3 (P3): add togsim_dispatch (work-item boundary + core binding) and +// togsim_wait_all (join / barrier). +// v3 -> v4 (P3): togsim_dma takes (arg_id, element offset) instead of a +// precomputed base_addr; the producer lowers the address +// arithmetic and the runtime adds the tensor base. +// v4 -> v5 (P3): event handles. togsim_dma RETURNS a fresh handle (drops the +// event_id arg); the producer parks it in a heap event buffer +// (togsim_event_alloc/free) and togsim_wait takes the handle. +// v5 -> v6 (P3): replace togsim_dispatch with togsim_core_alloc (returns a +// core id; no free) -- the runtime owns the core pool, num_cores +// is never baked into the producer. +// v6 -> v7 (P3): togsim_dma takes a tag_slot (SRAM tile slot) for the runtime's +// double-buffer / SRAM-capacity model. +// v7 -> v8 (P3): togsim_compute takes a compute_type (vector/matmul/preload) so +// the Core routes it to the right compute unit. +// v8 -> v9 (P3 sec10): togsim_dma/compute take read_bufs/write_bufs (SRAM buffer +// ids); the loader builds an explicit dependency DAG by +// last-writer per buffer (replaces in-order/tag dependencies). +// v9 -> v10 (P3 sec10.7): add togsim_compute_barrier (the explicit compute fence +// before a store; loader -> COMPUTE_BAR instruction). +// v10 -> v11 (P3 sec10): replace the static event-id pairing with the RUNTIME +// tag slot. togsim_dma takes a tag_id (its tag memref identity) +// and returns void; the original dma_wait becomes an explicit +// togsim_memory_barrier(tag_id, tag_slot, write_bufs) that pairs +// with its async dma by the runtime (tag_id, tag_slot) -- one +// static dma op runs once per loop iteration with a different +// %tag[%idx], so only a runtime key can pair them. Drops +// togsim_wait/signal/wait_all/event_alloc/event_free + the +// togsim_event handle (no compile-time pairing token). 
+// v11 -> v12 (P3 sec9.3): replace the bare togsim_core_alloc marker with a +// higher-order togsim_dispatch(ctx, tile_fn, iv, n_iv) wrapper. +// The producer outlines each parallel work-item into a uniform +// togsim_kernel_tile(ctx, iv, n) and the dispatcher loop hands it +// to togsim_dispatch, which round-robins a core and brackets the +// call with TILE_BEGIN/TILE_END. The work-item scope is now the +// function call itself (no implicit "until the next core_alloc" +// range); one general dispatcher serves every kernel (uniform +// iv-array ABI). Core alloc + the begin/end boundary are +// runtime-owned. +#define TOGSIM_ABI_VERSION 12 +int32_t togsim_abi_version(void); + +// Opaque per-invocation context owned by TOGSim. Holds the record sink and the +// tile_id->cycle lookup. Never dereferenced by the producer. +typedef struct EmitCtx EmitCtx; + +// Direction for togsim_dma. +typedef enum { + TOGSIM_DMA_LOAD = 0, // DRAM -> SRAM (MOVIN) + TOGSIM_DMA_STORE = 1, // SRAM -> DRAM (MOVOUT) +} togsim_dma_dir; + +// Emit a DMA. +// dir : load/store +// arg_id : which tensor (kernel func arg) this tile lives in +// offset : ELEMENT offset of this tile within that tensor, computed by the +// producer from the loop indices (the affine address arithmetic is +// lowered into the producer -- P3 approach A). The runtime forms +// the DRAM address as base[arg_id] + offset*elem_bytes (only the +// runtime knows the tensors' allocation base addresses). +// ndim : rank of the tile +// dims : ndim tile extents +// strides : ndim tile strides (may be null => contiguous) +// elem_bits : element width in bits +// is_async : non-zero => issue-complete is the finish; the consumer must be +// gated by an explicit togsim_memory_barrier (data arrives later). +// Zero => blocking: the dma finishes at data-arrival. +// tag_id : identity of this dma's tag memref. 
With tag_slot it forms the +// RUNTIME pairing key (tag_id, tag_slot) the matching +// togsim_memory_barrier waits on -- not a compile-time id, since +// one static dma op runs once per loop iteration. +// tag_slot : the SRAM tile slot this tile occupies (the producer's lowered +// tag index, evaluated at runtime). Also the double-buffer / +// SRAM-capacity slot. Single-buffer kernels pass 0. +// read_bufs/n_read, write_bufs/n_write : SRAM buffer ids this op reads/writes +// (sec 10 dataflow). The loader builds the dependency DAG by last-writer per +// buffer. +void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id, + uint64_t offset, int32_t ndim, const int64_t* dims, + const int64_t* strides, int32_t elem_bits, + int32_t is_async, int32_t tag_id, uint64_t tag_slot, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write); + +// Emit a fixed-size tile compute. Cost is looked up from the precomputed +// tile_id->cycle table (annotation pass / sample-mode); `dims` are passed for +// logging and future remainder-tile handling, not to compute cost here. +// compute_type : 0 vector / 1 matmul / 2 preload (maps to the Core unit enum; +// routes the op to the VPU vs the systolic array). +void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type, + int32_t ndim, const int64_t* dims, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write); + +// Explicit async-DMA sync -- the original memref.dma_wait. Pairs with its async +// togsim_dma by the RUNTIME tag slot (tag_id, tag_slot) and gates consumers on +// data-arrival (resp-complete), since an async dma's own finish is only +// issue-complete. `write_bufs` is the SRAM buffer(s) that dma loaded; the loader +// makes the barrier the last writer of them so consumers depend on it. Sync DMAs +// need no barrier (they block to data-arrival themselves). 
+void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot, + const int64_t* write_bufs, int32_t n_write); + +// A parallel work-item body, outlined by the producer (sec 9.3). Uniform across +// kernels: it takes the EmitCtx, the packed parallel loop indices `iv` (iv[0.. +// n_iv) -- e.g. the (m,n) output-tile indices) and their count. The body emits +// the work-item's ops (init / reduction / store). One signature => one general +// dispatcher serves every kernel. +// (iv is non-const to match the `int64_t*` the EmitC producer emits; the runtime +// only reads it.) +typedef void (*togsim_tile_fn)(EmitCtx* ctx, int64_t* iv, int32_t n_iv); + +// Dispatch one work-item (sec 9.3). The runtime round-robins a core from the +// pool, brackets the call with TILE_BEGIN/TILE_END (the work-item boundary), and +// invokes `fn(ctx, iv, n_iv)` -- so the work-item SCOPE is exactly the function +// call, not an implicit "ops until the next alloc" range. Core alloc + boundary +// are runtime-owned; the producer is core-count transparent (never names +// num_cores or a physical core). Independent work-items land on different cores +// -> multi-core. A general (kernel-independent) wrapper: it only forwards the +// opaque iv array to fn. +void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, + int64_t* iv, int32_t n_iv); + +// Entry point the loader resolves in the producer `.so`. `shape_args` carries +// the runtime values for the kernel's symbolic dimensions (in a kernel-specific +// order recorded alongside the cached `.so`); `n_shape_args` is their count. 
+void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n_shape_args); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/TOGSim/include/togsim_trace_bridge.h b/TOGSim/include/togsim_trace_bridge.h new file mode 100644 index 00000000..f0213ef5 --- /dev/null +++ b/TOGSim/include/togsim_trace_bridge.h @@ -0,0 +1,18 @@ +#pragma once +// togsim_trace_bridge.h +// ----------------------------------------------------------------------------- +// Bridge from the recorded trace (togsim_loader.h RunResult) to a TileGraph the +// existing Simulator/Core can run, for production cycle-equivalence (P3 task 5; +// see togsim_cpp_trace.md sec 9.9). First cut: one Tile per work-item (the +// TILE_BEGIN..TILE_END span togsim_dispatch brackets), bound to that work-item's +// core; the DMA/compute records become MOVIN/MOVOUT/COMP Instructions with +// dependency edges by last writer per SRAM buffer (read_bufs/write_bufs). +// ----------------------------------------------------------------------------- +#include + +#include "TileGraph.h" +#include "togsim_loader.h" + +// Build a TileGraph from a recorded trace. `name` labels the graph. +std::unique_ptr trace_to_tilegraph(const togsim::RunResult& run, + const std::string& name); diff --git a/TOGSim/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt index 65cd4dd4..d782d4d1 100644 --- a/TOGSim/src/CMakeLists.txt +++ b/TOGSim/src/CMakeLists.txt @@ -12,3 +12,8 @@ file(GLOB_RECURSE SRC_FILES # build add_executable(${LIB_NAME} ${SRC_FILES}) + +# Export the executable's dynamic symbols (-rdynamic) so a dlopen'd trace +# producer .so resolves the togsim_* runtime callbacks back into this binary +# (P3 trace pipeline). 
+set_target_properties(${LIB_NAME} PROPERTIES ENABLE_EXPORTS ON) diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index 3f84d885..6f9a74d7 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -64,6 +64,10 @@ SimulationConfig initialize_config(const YAML::Node& config, parsed_config.core_freq_mhz = get_config_value(config, "core_freq_mhz"); if (config["num_systolic_array_per_core"]) parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"].as(); + if (config["core_spad_size_kb"]) + parsed_config.core_spad_size_kb = config["core_spad_size_kb"].as(); + if (config["sa_weight_buffer_depth"]) + parsed_config.sa_weight_buffer_depth = config["sa_weight_buffer_depth"].as(); if (config["num_stonne_per_core"]) parsed_config.num_stonne_per_core = config["num_stonne_per_core"].as(); if (config["num_stonne_port"]) diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc index 9dad8597..915988ce 100644 --- a/TOGSim/src/Core.cc +++ b/TOGSim/src/Core.cc @@ -17,11 +17,75 @@ Core::Core(uint32_t id, SimulationConfig config) _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core); _stat_inst_count.resize(static_cast(Opcode::COUNT), 0); _stat_tot_skipped_inst.resize(static_cast(Opcode::COUNT), 0); + _sram_capacity = (size_t)config.core_spad_size_kb * 1024; // 0 = throttle disabled + _weight_slot_depth = config.sa_weight_buffer_depth; // per-SA weight slots (>0) + if (_weight_slot_depth == 0) { + spdlog::error("sa_weight_buffer_depth must be > 0 (raise it to loosen the preload throttle)"); + exit(EXIT_FAILURE); + } + _weight_slots_used.resize(_num_systolic_array_per_core, 0); +} + +// Round-robin a systolic array that still has a free weight slot; -1 if all full +// (the preload must stall). Advances _systolic_array_rr past the chosen SA. 
+int Core::pick_free_weight_sa() { + for (uint32_t i = 0; i < _num_systolic_array_per_core; i++) { + uint32_t s = (_systolic_array_rr + i) % _num_systolic_array_per_core; + if (_weight_slots_used[s] < (int)_weight_slot_depth) { + _systolic_array_rr = (s + 1) % _num_systolic_array_per_core; + return (int)s; + } + } + return -1; +} + +void Core::apply_due(const DueAction& a) { + switch (a.kind) { + case DueAction::FreeWeightSlot: + if (--a.token->refcount <= 0) _weight_slots_used[a.token->sa]--; // last reader frees the slot + break; + case DueAction::WakeBar: { + auto bar = a.bar; // async load data arrived -> fire its MEMORY_BAR + finish_instruction(bar); + break; + } + } +} + +void Core::process_due_events() { + while (!_due_events.empty() && _due_events.begin()->first <= _core_cycle) { + apply_due(_due_events.begin()->second); + _due_events.erase(_due_events.begin()); + } +} + +// The LAST reader of a buffer-version issued (bridge tags only that consumer): +// free the version's bytes back to the per-core spad. 
+void Core::release_sram(const std::shared_ptr& inst) { + if (!_sram_capacity) return; + for (int64_t id : inst->get_sram_release()) { + auto it = _sram_allocs.find(id); + if (it == _sram_allocs.end()) continue; + _sram_used -= it->second; + _sram_allocs.erase(it); + } +} + +bool Core::try_occupy_sram(const std::shared_ptr& inst) { + if (!_sram_capacity || inst->get_sram_alloc() < 0) return true; // untracked + size_t F = inst->sram_footprint(); + if (_sram_used + F > _sram_capacity) return false; // would overflow -> stall + _sram_used += F; + _sram_allocs[inst->get_sram_alloc()] += F; // accumulate version footprint + return true; } bool Core::can_issue(const std::shared_ptr& op) { - /* Check SRAM is enough to run tile */ - return _tiles.size() < 4 && !op->is_stonne_tile(); + /* Bound concurrent dispatches so their combined spad working set fits: with the + * global @buffers each in-flight dispatch piles its own load versions, and too + * many at once overflow the spad (versions never free -> wedge). 2 keeps double- + * buffering overlap while leaving headroom. 
*/ + return _tiles.size() < 2 && !op->is_stonne_tile(); } void Core::issue(std::shared_ptr op) { @@ -135,7 +199,7 @@ void Core::dma_cycle() { finish_instruction(instruction, InstFinishTraceTag::DmaRespComplete); for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { _dma.mark_tag_used(instruction->subgraph_id, key); - finish_instruction(wait_inst); + _due_events.emplace(_core_cycle, DueAction{DueAction::WakeBar, nullptr, wait_inst}); } } _dma_finished_queue.erase(_dma_finished_queue.begin()); @@ -154,7 +218,7 @@ void Core::dma_cycle() { } else if(!finished_inst->is_dma_read()) { core_trace_log::log_error_dma_instruction_invalid(_core_cycle, _id); exit(EXIT_FAILURE); - } else if (finished_inst->get_opcode() == Opcode::BAR) { + } else if (finished_inst->get_opcode() == Opcode::MEMORY_BAR) { core_trace_log::trace_instruction_line(_core_cycle, _id, TraceLogTag::pad15(TraceLogTag::kInstructionFinished), @@ -200,6 +264,8 @@ void Core::cycle() { /* Increase core cycle counter */ _core_cycle++; + process_due_events(); // weight-slot frees + DMA-arrival wakeups due this cycle + /* Iterate tile while an instruction is issued */ bool issued = false; @@ -207,9 +273,6 @@ void Core::cycle() { auto& instructions = _tiles[i]->get_ready_instructions(); for (auto it=instructions.begin(); it!=instructions.end();) { auto& inst = *it; - /* Skip instruction is not ready */ - //if (!inst->is_ready()) - // continue; switch (inst->get_opcode()) { case Opcode::MOVIN: @@ -240,6 +303,8 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { + // load occupies its spad bytes on issue; stall (retry next cycle) if full. 
+ if (!try_occupy_sram(inst)) break; core_trace_log::trace_instruction_line(_core_cycle, _id, TraceLogTag::pad15( @@ -254,6 +319,7 @@ void Core::cycle() { } } case Opcode::MOVOUT: + release_sram(inst); // store issued -> free the tiles it drained core_trace_log::trace_instruction_line(_core_cycle, _id, TraceLogTag::pad15(TraceLogTag::kInstructionIssued), @@ -265,7 +331,41 @@ void Core::cycle() { break; case Opcode::COMP: { - auto& target_pipeline = get_compute_pipeline(inst->get_compute_type()); + const int ct = inst->get_compute_type(); + // a fresh-output compute occupies its spad bytes on issue; stall if full. + if (!try_occupy_sram(inst)) break; + // SA selection (sec 10.x): a preload picks an SA with a free weight slot + // and pins its matmul consumers there; a matmul runs on its pinned SA. + int sa_idx = -1; + if (ct == MATMUL || ct == PRELOAD) { + if (ct == PRELOAD) { + int n_consumers = 0; // matmuls reusing this weight + for (auto& c : inst->get_deps(DepEvent::ISSUE)) + if (c->get_compute_type() == MATMUL) n_consumers++; + if (n_consumers == 0) { // weight-slot model needs >=1 consumer + spdlog::error("preload has no matmul consumer (weight-slot model invariant)"); + exit(EXIT_FAILURE); + } + sa_idx = pick_free_weight_sa(); + if (sa_idx < 0) break; // all weight slots full -> stall (retry) + _weight_slots_used[sa_idx]++; + auto tok = std::make_shared(WeightToken{sa_idx, n_consumers}); + for (auto& c : inst->get_deps(DepEvent::ISSUE)) + if (c->get_compute_type() == MATMUL) { + c->set_assigned_sa(sa_idx); + c->set_weight_token(tok); + } + } else { // MATMUL + sa_idx = inst->get_assigned_sa(); // pinned by its preload + if (sa_idx < 0) { // unpinned -> no preload set its SA + spdlog::error("matmul was not pinned to an SA by a preload (weight-slot model invariant)"); + exit(EXIT_FAILURE); + } + } + inst->set_assigned_sa(sa_idx); // record the SA actually used (for the trace) + } + auto& target_pipeline = (ct == VECTOR_UNIT) ? 
_vu_compute_pipeline + : _sa_compute_pipeline.at(sa_idx); if (target_pipeline.empty()) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); inst->bubble_cycle = inst->get_overlapping_cycle(); @@ -275,7 +375,19 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + // release the occupancy (ISSUE) dependents so a successor overlaps this op. + inst->fire(DepEvent::ISSUE); + + // Release this matmul's weight slot at its streaming-end (finish - + // overlapping), not at full finish (the drain tail does not read it). + if (ct == MATMUL && inst->get_weight_token()) { + cycle_type rel = inst->finish_cycle > inst->get_overlapping_cycle() + ? inst->finish_cycle - inst->get_overlapping_cycle() : _core_cycle; + _due_events.emplace(rel, DueAction{DueAction::FreeWeightSlot, + inst->get_weight_token(), nullptr}); + } + release_sram(inst); // free the tiles it read (before the skip path) if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); @@ -297,12 +409,12 @@ void Core::cycle() { } } break; - case Opcode::BAR: + case Opcode::MEMORY_BAR: { auto& key = inst->get_tag_id(); uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished == -1) { - for (auto child_inst : inst->get_child_inst()) { + for (auto child_inst : inst->get_deps(DepEvent::DONE)) { if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) { child_inst->set_compute_cycle(0); } @@ -387,6 +499,19 @@ void Core::finish_instruction(std::shared_ptr& inst, InstFinishTrac core_trace_log::format_instruction_detail_line(*inst)); } +bool Core::has_inflight() { + // running() without the "_tiles.size() > 0" term: work that will produce a + // finish event on its own (so the sim is NOT frozen). If this is false but + // tiles remain, only stalled ready instructions are left. 
+ if (!_vu_compute_pipeline.empty()) return true; + for (int i = 0; i < _num_systolic_array_per_core; i++) + if (!_sa_compute_pipeline.at(i).empty()) return true; + if (!_dma_waiting_queue.empty() || !_dma_finished_queue.empty()) return true; + if (!_dma.empty()) return true; + if (!_ld_inst_queue.empty() || !_st_inst_queue.empty()) return true; + return false; +} + bool Core::running() { bool running = false; running = running || _tiles.size() > 0; @@ -412,6 +537,13 @@ void Core::push_memory_response(mem_fetch* response) { Instruction* owner_inst = static_cast(response->get_custom_data()); assert(owner_inst->get_waiting_request()); + if (!owner_inst->got_first_response()) { // first data of this load arrived + owner_inst->mark_first_response(); + core_trace_log::trace_instruction_line(_core_cycle, _id, + TraceLogTag::pad15(TraceLogTag::kFirstDramResponse), + owner_inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line(*owner_inst)); + } owner_inst->dec_waiting_request(); if (!owner_inst->get_waiting_request()) { auto it = _dma_waiting_queue.find(owner_inst); diff --git a/TOGSim/src/CoreTraceLog.cc b/TOGSim/src/CoreTraceLog.cc index ebc31de0..7086893e 100644 --- a/TOGSim/src/CoreTraceLog.cc +++ b/TOGSim/src/CoreTraceLog.cc @@ -31,7 +31,7 @@ std::string format_dma_inst_issued_detail(Instruction& inst) { } return fmt::format( "addr_name={} dram=0x{:016x} rank={} elem_bits={} async={} indirect={} tag=0x{:016x} stride=[{}] size=[{}] " - "tag_idx=[{}]", + "tag_idx=[{}] tile={}", inst.get_addr_name(), static_cast(inst.get_base_dram_address()), rank, @@ -41,7 +41,8 @@ std::string format_dma_inst_issued_detail(Instruction& inst) { tag_hex, fmt::join(inst.get_tile_stride(), ","), fmt::join(ts, ","), - fmt::join(tidx, ",")); + fmt::join(tidx, ","), + inst.get_tile_group()); } std::string format_dma_inst_issued_trace_line(Instruction& inst) { @@ -52,31 +53,35 @@ std::string format_instruction_detail_line(Instruction& inst) { const Opcode op = 
inst.get_opcode(); const std::string opname = opcode_to_string(op); if (op == Opcode::COMP) { - return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={})", + return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={} sa={} tile={})", opname, inst.get_compute_type(), inst.get_compute_cycle(), - inst.get_overlapping_cycle()); + inst.get_overlapping_cycle(), + inst.get_assigned_sa(), + inst.get_tile_group()); } if ((op == Opcode::MOVIN || op == Opcode::MOVOUT) && inst.is_async_dma()) { - return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])", + return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})", opname, inst.subgraph_id, inst.get_addr_name(), format_tag_key_list_hex(inst.get_tag_id()), fmt::join(inst.get_tag_idx_list(), ","), - fmt::join(inst.get_tag_stride_list(), ",")); + fmt::join(inst.get_tag_stride_list(), ","), + inst.get_tile_group()); } if (op == Opcode::MOVIN || op == Opcode::MOVOUT) { - return fmt::format("{} (addr_name={})", opname, inst.get_addr_name()); + return fmt::format("{} (addr_name={} tile={})", opname, inst.get_addr_name(), inst.get_tile_group()); } - if (op == Opcode::BAR) { - return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])", + if (op == Opcode::MEMORY_BAR) { + return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})", opname, inst.get_addr_name(), format_tag_key_list_hex(inst.get_tag_id()), fmt::join(inst.get_tag_idx_list(), ","), - fmt::join(inst.get_tag_stride_list(), ",")); + fmt::join(inst.get_tag_stride_list(), ","), + inst.get_tile_group()); } return opname; } diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc index f236d160..ee184a1a 100644 --- a/TOGSim/src/Instruction.cc +++ b/TOGSim/src/Instruction.cc @@ -23,7 +23,7 @@ std::string opcode_to_string(Opcode opcode) { case Opcode::MOVIN: return "MOVIN"; case Opcode::MOVOUT: 
return "MOVOUT"; case Opcode::COMP: return "COMP"; - case Opcode::BAR: return "BAR"; + case Opcode::MEMORY_BAR: return "MEMORY_BAR"; default: return "Unknown"; } } @@ -50,16 +50,10 @@ Instruction::Instruction(Opcode opcode) } void Instruction::finish_instruction() { - for (auto& counter : child_inst) - counter->dec_ready_counter(); + fire(DepEvent::DONE); // latency consumers finished = true; } -void Instruction::add_child(std::shared_ptr child) { - child->inc_ready_counter(); - child_inst.insert(child); -} - void Instruction::inc_waiting_request() { _nr_waiting_request++; } diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index d987d787..366528ec 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -184,6 +184,38 @@ void Simulator::icnt_cycle() { _icnt->cycle(); } +// Consecutive frozen cycles tolerated before declaring the sim wedged (spad too +// small). Generous so transient idle never false-fires; a true freeze is constant. +static constexpr uint64_t kWedgeThreshold = 100000; + +// Frozen-state guard: work remains (running()) but nothing is in flight to +// advance it -- the SRAM throttle can never satisfy a load because the kernel's +// working set exceeds the whole per-core spad (core_spad_size_kb too small). The +// state repeats every cycle, so after a margin error out instead of looping +// forever. `stuck` is function-local-static (one running sim at a time; it resets +// on any progress). +void Simulator::check_frozen() { + static uint64_t stuck = 0; + // In flight = anything that will produce a future state change: icnt/dram busy, + // a core with DMA/compute pending, or a tile still schedulable. 
+ bool inflight = _icnt->running() || _dram->running(); + for (int id = 0; id < _n_cores && !inflight; id++) { + if (_cores[id]->has_inflight()) inflight = true; + else if (!get_partition_scheduler(id)->empty(id)) inflight = true; + } + if (running() && !inflight) { + if (++stuck > kWedgeThreshold) { + spdlog::error("[Simulator] simulation wedged at cycle {}: work remains but " + "nothing is in flight -- the per-core spad (core_spad_size_kb) " + "is too small to hold a kernel's working set. Increase it.", + _core_cycles); + exit(EXIT_FAILURE); + } + } else { + stuck = 0; + } +} + void Simulator::cycle() { while (running() || _core_cycles < 1) { set_cycle_mask(); @@ -198,6 +230,8 @@ void Simulator::cycle() { // Interconnect cycle if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); + + check_frozen(); // spad-too-small guard (errors out if wedged) } for (auto &core: _cores) { core->check_tag(); diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index 5060d336..c252258e 100644 --- a/TOGSim/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -543,7 +543,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa fmt::join(new_tag_stride_list, ", ")); std::shared_ptr inst = std::make_shared( - Opcode::BAR, 0, + Opcode::MEMORY_BAR, 0, 0, base_addr, std::vector(), std::vector(), 0, tag_list, new_tag_stride_list, accum_tag_list @@ -584,7 +584,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa for (const auto& child_node: node->get_child()) { if (link_map.find(child_node) != link_map.end()) { std::shared_ptr child_inst = link_map[child_node]; - inst->add_child(child_inst); + inst->add_dep(child_inst, DepEvent::DONE); } } } @@ -606,7 +606,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa for (auto& inner_inst : inner_tile->get_instructions()) { tile_vec.back()->append_instuction(inner_inst); if (nr_inst) { - last_instruction->add_child(inner_inst); + last_instruction->add_dep(inner_inst, DepEvent::DONE); } } } @@ 
-662,7 +662,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa for (const auto& child_node: node->get_child()) { if (link_map.find(child_node) != link_map.end()) { std::shared_ptr child_inst = link_map[child_node]; - inst->add_child(child_inst); + inst->add_dep(child_inst, DepEvent::DONE); } } } diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 010826ef..274d63da 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,18 +9,72 @@ #include "Simulator.h" #include "TileGraphParser.h" #include "helper/CommandLineParser.h" +#include "togsim_loader.h" // P3 trace pipeline: run a compiled producer .so +#include "togsim_trace_bridge.h" // ... and bridge its trace to a TileGraph namespace fs = std::filesystem; namespace po = boost::program_options; +// Run a kernel's compiled trace producer (.so) and bridge it to a TileGraph, +// targeting `partition_id` (its work-items round-robin only over that partition's +// cores -- partitions are independent schedulers). The cycle-table TSV gives the +// per-tile compute latency; a flat stub is used if it is missing. Returns nullptr +// if the producer run fails. Shared by the standalone --trace_so path and the +// multi-tenant launchKernel below. +std::unique_ptr build_trace_tilegraph(Simulator* simulator, + const std::string& trace_so_path, + const std::string& cycle_table_path, + int partition_id) { + const auto& cfg = simulator->get_hardware_config_yaml(); + int num_cores = cfg["num_cores"] ? cfg["num_cores"].as() : 1; + std::vector partition_cores; + for (int c = 0; c < num_cores; c++) + if (simulator->get_partition_id(c) == partition_id) partition_cores.push_back(c); + if (partition_cores.empty()) partition_cores.push_back(0); + // First cut: stub tensor bases (real per-tensor addresses come later). 
+ std::vector bases(16); + for (size_t i = 0; i < bases.size(); ++i) bases[i] = 0x100000ull * (i + 1); + // Cycle table: load the per-tile_id TSV sidecar if present, else a flat stub. + std::vector cyc, ovl; + std::ifstream ct(cycle_table_path); + if (ct.is_open()) { + int64_t c, o; + while (ct >> c >> o) { cyc.push_back(c); ovl.push_back(o); } + } + if (cyc.empty()) { cyc.assign(256, 128); ovl.assign(256, 0); } + auto run = togsim::run_producer(trace_so_path.c_str(), nullptr, 0, + bases.data(), (int)bases.size(), + cyc.data(), ovl.data(), (int)cyc.size(), + partition_cores.data(), (int32_t)partition_cores.size()); + if (!run.ok) return nullptr; + return trace_to_tilegraph(run, "trace_kernel"); +} + void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partition_id=0, int device_id=0) { - auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml); - std::unique_ptr& tile_graph = graph_praser.get_tile_graph(); + std::unique_ptr tile_graph; + std::string tog_path = onnx_path; // for the log line + // The C++ trace path is the supported one: the kernel's trace.so / trace_cycles.tsv + // sit next to its tile_graph.onnx (same write_path). The legacy ONNX parser below is + // DEPRECATED -- only used via TORCHSIM_LEGACY_TOG=1 or when the .so is absent / fails. 
+ const char* legacy = std::getenv("TORCHSIM_LEGACY_TOG"); + std::string dir = fs::path(onnx_path).parent_path().string(); + std::string trace_so = dir + "/trace.so"; + std::string cycle_tsv = dir + "/trace_cycles.tsv"; + if ((!legacy || std::string(legacy) != "1") && fs::exists(trace_so)) { + tile_graph = build_trace_tilegraph(simulator, trace_so, cycle_tsv, partition_id); + if (tile_graph) tog_path = trace_so; + else spdlog::warn("[TOGSim] trace.so run failed for {}; falling back to ONNX", trace_so); + } + if (!tile_graph) { + spdlog::warn("[TOGSim] using the DEPRECATED legacy ONNX TOG path for {}", onnx_path); + auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml); + tile_graph = std::move(graph_praser.get_tile_graph()); + } tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle()); tile_graph->set_kernel_id(kernel_id); spdlog::info("[Scheduler {}] Enqueued kernel_id: {}, tog_path: {}, operation: {}, request_time_cycles: {}", - partition_id, kernel_id, onnx_path, tile_graph->get_name(), request_time); + partition_id, kernel_id, tog_path, tile_graph->get_name(), request_time); simulator->enqueue_graph(partition_id, std::move(tile_graph)); } @@ -104,6 +159,11 @@ int main(int argc, char** argv) { "models_list", "Path for the trace file (.trace)"); cmd_parser.add_command_line_option( "log_level", "Set for log level [trace, debug, info], default = info"); + cmd_parser.add_command_line_option( + "trace_so", "Path to a compiled trace producer .so (P3 trace pipeline)"); + cmd_parser.add_command_line_option( + "cycle_table", "Path to a 'cycleoverlapping' per-tile_id sidecar (TSV) " + "for --trace_so; falls back to a flat stub if omitted"); try { cmd_parser.parse(argc, argv); } catch (const CommandLineParser::ParsingError& e) { @@ -147,6 +207,27 @@ int main(int argc, char** argv) { exit(1); } + // P3 trace pipeline: if a compiled producer .so is given, run it, bridge the + // recorded trace to a TileGraph, and run the 
existing Simulator on it. + std::string trace_so_path; + cmd_parser.set_if_defined("trace_so", &trace_so_path); + if (!trace_so_path.empty()) { + // Standalone single-kernel trace run: enqueue to partition 0 (its work-items + // round-robin over partition 0's cores only; see build_trace_tilegraph). + std::string cycle_table_path; + cmd_parser.set_if_defined("cycle_table", &cycle_table_path); + auto tg = build_trace_tilegraph(simulator, trace_so_path, cycle_table_path, 0); + if (!tg) { spdlog::error("[TOGSim] trace producer run failed"); exit(1); } + tg->set_arrival_time(simulator->get_core_cycle()); + tg->set_kernel_id(0); + simulator->enqueue_graph(0, std::move(tg)); + simulator->run_simulator(); + spdlog::info("[TOGSim-trace] Total cycles: {}", simulator->get_core_cycle()); + spdlog::info("Simulation finished"); + simulator->print_core_stat(); + return 0; + } + // Get trace file path cmd_parser.set_if_defined("models_list", &trace_file_path); diff --git a/TOGSim/src/togsim_runtime.cc b/TOGSim/src/togsim_runtime.cc new file mode 100644 index 00000000..a83b8541 --- /dev/null +++ b/TOGSim/src/togsim_runtime.cc @@ -0,0 +1,196 @@ +// togsim_runtime.cc +// ----------------------------------------------------------------------------- +// C6 runtime + loader for the compiled trace producer (P3 task 5). Implements +// the producer ABI (togsim_runtime.h) and the TOGSim-side loader +// (togsim_loader.h). See docs/design/togsim_cpp_trace.md sec 5.3 / 9.6.1 / 9.7. +// +// The producer `.so` calls the extern "C" togsim_* functions below; each one +// records a TraceRec on the EmitCtx. EmitCtx is the opaque type the producer +// only ever passes back to us. This is the "materializing sink": it resolves +// addresses and per-tile cycles into a recorded instruction stream. Wiring the +// stream into the existing timing core (Core/Simulator) is the remaining step. 
+// ----------------------------------------------------------------------------- + +#include "togsim_loader.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// Full definition of the opaque handle from togsim_runtime.h. The producer holds +// only EmitCtx* and never dereferences it. +struct EmitCtx { + // inputs supplied by the loader + const uint64_t* tensor_base = nullptr; + int32_t n_tensors = 0; + const int64_t* cyc = nullptr; // tile_id -> cycle + const int64_t* ovl = nullptr; // tile_id -> overlapping_cycle + int32_t n_tiles = 0; + std::vector cores{0}; // the partition's core ids; dispatch round-robins over these + // mutable run state + int32_t rr = 0; // round-robin cursor into `cores` + int32_t cur_core = -1; // current work-item's core + std::vector trace; +}; + +namespace { +inline togsim::TraceRec blank(togsim::TraceRec::Kind k, int32_t core) { + togsim::TraceRec r{}; + r.kind = k; + r.core = core; + return r; +} +} // namespace + +extern "C" { + +int32_t togsim_abi_version(void) { return TOGSIM_ABI_VERSION; } + +void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n_iv) { + // Higher-order work-item wrapper (sec 9.3): round-robin over THIS partition's + // cores (a kernel is enqueued to one partition; partitions are independent, so + // a work-item must never land on another partition's core -- that subgraph would + // sit in this partition's scheduler forever). Bracket the work-item with + // TILE_BEGIN/TILE_END; the ops fn emits records under ctx->cur_core. + ctx->cur_core = ctx->cores.empty() ? 
0 + : ctx->cores[ctx->rr++ % (int32_t)ctx->cores.size()]; + ctx->trace.push_back(blank(togsim::TraceRec::TILE_BEGIN, ctx->cur_core)); + fn(ctx, iv, n_iv); + ctx->trace.push_back(blank(togsim::TraceRec::TILE_END, ctx->cur_core)); +} + +void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id, + uint64_t offset, int32_t ndim, const int64_t* dims, + const int64_t* strides, int32_t elem_bits, + int32_t is_async, int32_t tag_id, uint64_t tag_slot, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write) { + uint64_t base = (arg_id >= 0 && arg_id < ctx->n_tensors) + ? ctx->tensor_base[arg_id] : 0; + uint64_t addr = base + offset * (uint64_t)(elem_bits / 8); + togsim::TraceRec r = blank(togsim::TraceRec::DMA, ctx->cur_core); + r.dir = dir; r.arg_id = arg_id; r.elem_bits = elem_bits; + r.is_async = is_async; r.addr = addr; r.tag_id = tag_id; r.tag_slot = tag_slot; + for (int32_t i = 0; i < ndim; ++i) { + if (dims) r.dims.push_back(dims[i]); + if (strides) r.strides.push_back(strides[i]); + } + for (int32_t i = 0; i < n_read; ++i) r.read_bufs.push_back(read_bufs[i]); + for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]); + ctx->trace.push_back(r); +} + +void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type, + int32_t ndim, const int64_t* dims, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write) { + (void)ndim; (void)dims; + togsim::TraceRec r = blank(togsim::TraceRec::COMPUTE, ctx->cur_core); + r.tile_id = tile_id; + r.compute_type = compute_type; + for (int32_t i = 0; i < n_read; ++i) r.read_bufs.push_back(read_bufs[i]); + for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]); + if (ctx->cyc && (int32_t)tile_id < ctx->n_tiles) r.cycle = ctx->cyc[tile_id]; + if (ctx->ovl && (int32_t)tile_id < ctx->n_tiles) r.overlapping = ctx->ovl[tile_id]; + ctx->trace.push_back(r); +} + +void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t 
tag_slot, + const int64_t* write_bufs, int32_t n_write) { + togsim::TraceRec r = blank(togsim::TraceRec::MEMORY_BAR, ctx->cur_core); + r.tag_id = tag_id; r.tag_slot = tag_slot; + for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]); + ctx->trace.push_back(r); +} + +} // extern "C" + +namespace togsim { + +RunResult run_producer(const char* so_path, + const int64_t* shape_args, int32_t n_shape, + const uint64_t* tensor_base, int32_t n_tensors, + const int64_t* cyc, const int64_t* ovl, int32_t n_tiles, + const int32_t* partition_cores, int32_t n_partition_cores) { + RunResult res; + void* lib = dlopen(so_path, RTLD_NOW | RTLD_GLOBAL); + if (!lib) { fprintf(stderr, "togsim: dlopen failed: %s\n", dlerror()); return res; } + auto emit = (void (*)(EmitCtx*, int64_t*, int32_t))dlsym(lib, "togsim_kernel"); + if (!emit) { fprintf(stderr, "togsim: dlsym togsim_kernel failed: %s\n", dlerror()); return res; } + + EmitCtx ctx; + ctx.tensor_base = tensor_base; ctx.n_tensors = n_tensors; + ctx.cyc = cyc; ctx.ovl = ovl; ctx.n_tiles = n_tiles; + ctx.cores.assign(partition_cores, partition_cores + (n_partition_cores > 0 ? n_partition_cores : 0)); + if (ctx.cores.empty()) ctx.cores.push_back(0); + emit(&ctx, (int64_t*)shape_args, n_shape); + + res.ok = true; + res.trace = std::move(ctx.trace); + return res; +} + +SimResult simulate(const RunResult& run, const TimingParams& params) { + SimResult out; + std::unordered_map dma_free; // DMA-engine free time, per core + std::unordered_map comp_free; // compute free time, per core + std::unordered_map prev_comp; // prev compute finish (overlap), per core + std::map, uint64_t> tag_finish; // (tag_id,tag_slot) -> finish + std::vector pending; // barrier-resolved deps since last compute + + for (const auto& t : run.trace) { + const int c = t.core; + switch (t.kind) { + case TraceRec::DMA: { + // DMAs serialize on the core's DMA engine (overlap compute -> separate + // timeline). 
finish = issue + latency, recorded under the runtime tag. + uint64_t start = dma_free[c]; + uint64_t fin = start + params.dma_latency; + dma_free[c] = fin; + tag_finish[{t.tag_id, t.tag_slot}] = fin; + out.n_dma++; + break; + } + case TraceRec::MEMORY_BAR: { + // the explicit async-DMA sync: gate the next compute on the paired dma's + // data-arrival, found by the runtime tag (tag_id, tag_slot). + auto it = tag_finish.find({t.tag_id, t.tag_slot}); + if (it != tag_finish.end()) pending.push_back(it->second); + break; + } + case TraceRec::COMPUTE: { + uint64_t deps = 0; + for (uint64_t f : pending) deps = std::max(deps, f); + pending.clear(); + uint64_t start = std::max(comp_free[c], deps); + uint64_t fin; + auto pit = prev_comp.find(c); + if (pit != prev_comp.end()) { + uint64_t prev = pit->second; + uint64_t tail = prev > start ? prev - start : 0; // prev still running + uint64_t overlapped = std::min(tail, (uint64_t)t.overlapping); + fin = std::max(start, prev) + (uint64_t)t.cycle - overlapped; + } else { + fin = start + (uint64_t)t.cycle; + } + comp_free[c] = fin; + prev_comp[c] = fin; + out.n_compute++; + break; + } + case TraceRec::TILE_BEGIN: + case TraceRec::TILE_END: + break; // work-item boundary: no cost in this reference timer + } + } + for (auto& kv : dma_free) out.total_cycle = std::max(out.total_cycle, kv.second); + for (auto& kv : comp_free) out.total_cycle = std::max(out.total_cycle, kv.second); + return out; +} + +} // namespace togsim diff --git a/TOGSim/src/togsim_trace_bridge.cc b/TOGSim/src/togsim_trace_bridge.cc new file mode 100644 index 00000000..351e313e --- /dev/null +++ b/TOGSim/src/togsim_trace_bridge.cc @@ -0,0 +1,342 @@ +// togsim_trace_bridge.cc -- see togsim_trace_bridge.h +#include "togsim_trace_bridge.h" + +#include +#include +#include +#include + +#include "Tile.h" +#include "Instruction.h" + +namespace { + +// `uniq` is a per-DMA-record unique tag-key id minted by the caller. 
The Core +// tag table keys completion on [addr_id, ..., sum(tag_idx*stride)]; using `uniq` +// as addr_id makes every reduction iteration of one static dma get a DISTINCT +// key -- so multi-tile-K (and conv, whose reduction is the kh*kw*C nest) do not +// collide, with no coordinate enumeration. The matching memory_barrier reuses +// the same `uniq` (current-load map per (tag_id, tag_slot), see +// trace_to_tilegraph), so the table still pairs them. This works because the +// recorded stream is already per-iteration (the producer ran the loops) -- +// unlike a compile-time event_id. `tag_idx` (the subtile slot) is retained for +// the SRAM double-buffer model. +// +// FIXME(semantics): the per-iteration tag is still reconstructed HERE from the +// record order. The producer IR now DOES carry a per-iteration tag -- +// dma_fine_grained emits a fresh tag memref.alloc just before each coarse load (rewiring +// its dma_wait), so successive reduction iterations allocate distinct tags -- but +// build_skeleton collapses that to one static tag_id (it DCEs the alloc and keys +// togsim.dma by the alloc's static identity), so this bridge still needs `uniq` +// to tell iterations apart at runtime. The faithful finish is to thread the +// per-iteration alloc identity through build_skeleton as an SSA tag handle on the +// togsim.dma / togsim.memory_barrier (then `uniq` here is unnecessary). +std::shared_ptr make_dma(const togsim::TraceRec& t, int64_t uniq) { + Opcode op = (t.dir == 1) ?
Opcode::MOVOUT : Opcode::MOVIN; + std::vector tile_size(t.dims.begin(), t.dims.end()); + std::vector tile_stride(t.strides.begin(), t.strides.end()); + std::vector tag_idx{(int64_t)t.tag_slot}; + std::vector tag_stride{1}; + auto inst = std::make_shared( + op, /*compute_cycle=*/0, /*num_parents=*/0, /*dram_addr=*/t.addr, + tile_size, tile_stride, (size_t)t.elem_bits, tag_idx, tag_stride, + /*accum_tag_idx_list=*/std::vector{}); + inst->set_is_async(t.is_async != 0); + inst->set_addr_name("tag" + std::to_string(uniq), uniq); + inst->prepare_tag_key(); + return inst; +} + +// A MEMORY_BAR carrying the SAME `uniq` tag key as the async dma it gates -- the +// Core's tag table signals it at the dma's DATA-ready (resp-complete), unlike a +// raw DONE edge that the async dma releases at issue-complete. +std::shared_ptr make_mem_bar(const togsim::TraceRec& t, int64_t uniq) { + auto bar = std::make_shared( + Opcode::MEMORY_BAR, 0, 0, 0, + std::vector{}, std::vector{}, 0, + std::vector{(int64_t)t.tag_slot}, std::vector{1}, + std::vector{}); + bar->set_addr_name("tag" + std::to_string(uniq), uniq); + bar->prepare_tag_key(); + return bar; +} + +std::shared_ptr make_compute(const togsim::TraceRec& t) { // COMP instruction for one compute record; cycle, overlapping cycle and compute_type come from the trace + auto inst = std::make_shared( + Opcode::COMP, /*compute_cycle=*/(cycle_type)t.cycle, /*num_parents=*/0, + /*dram_addr=*/0, std::vector{}, std::vector{}, /*elem_bits=*/0, + std::vector{}, std::vector{}, std::vector{}); + inst->set_overlapping_cycle((cycle_type)t.overlapping); + inst->set_compute_type(t.compute_type); // route to VPU vs systolic array + return inst; +} + +} // namespace + +std::unique_ptr trace_to_tilegraph(const togsim::RunResult& run, + const std::string& name) { // builds the tile graph from run.trace: one subgraph + one tile per TILE_BEGIN..TILE_END work-item + using togsim::TraceRec; + auto tg = std::make_unique(name, name); + // Empty cache plan (no L2/CMEM persistence) -- append_subgraph propagates it + // to each subgraph, and DMA::is_cacheable dereferences it, so it must be a + // valid (if empty) IntervalTree rather than null.
+ tg->init_cache_plan({}); + + std::shared_ptr sg; + std::shared_ptr tile; + // Explicit dependency DAG (sec 10), one clean dataflow rule (see `link`). + // Per SRAM buffer we keep writers(b) -- a SET of the current producers' + // DONE-handles; no per-buffer reader set is kept. Scoped per work-item (reset at each dispatch) + // -- buffers are work-item-local, so distinct work-items are independent + // (-> parallel). + std::map>> writers; // buffer id -> current producers (DONE-handles) + // An async dma is paired with its explicit memory_barrier(s) by the runtime tag + // (tag_id, tag_slot). It is 1 load : N barriers (the load happens once per + // reduction iteration; each consumer in that iteration is preceded by a wait on + // the same tag), so we track the CURRENT (most recent) load per (tag_id, + // tag_slot) -- not a FIFO. Each load gets a fresh `uniq` Core key, so successive + // reduction iterations (multi-tile-K, conv) never collide in the tag table; the + // iteration's barriers reuse that load's uniq. Correct because the load nest and + // its consumer nest run in order within the reduction body (no cross-iteration + // prefetch). Scoped per work-item. + std::map, + std::pair>> current_dma; + // Dedup identical dma_waits: the barrier already built for the CURRENT load of a + // (tag_id, tag_slot). A later memory_barrier on the SAME load instance reuses it + // (its consumers gate on the existing bar) instead of re-emitting -- a conv reads one + // loaded subtile from many matmuls, so the fine-grained per-consumer waits collapse to + // one per load. A new load (next reduction iter) bumps uniq, so a genuine new wait + // still gets its own bar; the first wait stays at its consumer, so overlap is kept.
+ std::map, + std::pair>> bar_for_load; + int64_t next_tag = 0; // mints a unique Core tag key per dma record + int cur_tile_group = -1; // work-item index, bumped per TILE_BEGIN (trace grouping) + + auto flush = [&]() { // commit the open work-item (if any) to tg, then reset all per-work-item state + if (sg && tile) { + sg->add_tile(tile); + tile->set_owner(sg); + tg->append_subgraph(sg); + } + sg.reset(); + tile.reset(); + writers.clear(); + current_dma.clear(); + bar_for_load.clear(); + next_tag = 0; // tag keys restart at 0 for the next work-item + }; + + // Single dataflow rule (sec 10). Per buffer b, writers(b) is a SET of the + // current producers' DONE-handles. + // - READ b: depend on ALL writers(b) -- occupancy (ISSUE) when a MATMUL reads a + // MATMUL/PRELOAD output (overlap on the SA pipeline), else latency (DONE). + // - WRITE b: REPLACE -- reset writers(b)={inst}. + // - Exception is_mm_accum (a MATMUL reading AND writing b = a commutative + // accumulator, Y += X@W): skip the read edge and UNION the write -- wait only the + // non-matmul seed (init/bias) and join writers(b) without resetting or ordering + // against co-matmuls, so the K matmuls do not chain through the accumulator and a + // later reader joins all of them. TOGSim is timing-only (values come from trace). + // Buffer-reuse (WAR) ordering is modeled by the resource models, not edges: the SRAM + // version/capacity machinery for spad buffers, the weight-slot machinery for weights.
+ const int MATMUL_CT = 1, PRELOAD_CT = 2; + auto is_mm_accum = [&](const std::shared_ptr& inst, int64_t b, + const std::vector& writes) { + if (inst->get_compute_type() != MATMUL_CT) return false; + for (int64_t w : writes) if (w == b) return true; + return false; + }; + auto link = [&](std::shared_ptr inst, + const std::vector& reads, + const std::vector& writes) { + for (int64_t b : reads) { + if (is_mm_accum(inst, b, writes)) continue; // accumulator read -> handled in WRITE (UNION) + auto it = writers.find(b); + if (it != writers.end()) + for (auto& w : it->second) { + int pct = w->get_compute_type(); + // MATMUL reading a MATMUL/PRELOAD output -> occupancy (overlap on the SA pipeline); else latency. + DepEvent on = (inst->get_compute_type() == MATMUL_CT && + (pct == MATMUL_CT || pct == PRELOAD_CT)) + ? DepEvent::ISSUE : DepEvent::DONE; + w->add_dep(inst, on); + } + } + for (int64_t b : writes) { + if (is_mm_accum(inst, b, writes)) { // UNION (commutative accumulate). NOTE(review): b is drawn from `writes` here, so this test reduces to "inst is a MATMUL" -- every MATMUL write takes this path even if it does not read b; confirm intended + auto it = writers.find(b); + if (it != writers.end()) + for (auto& s : it->second) + if (s->get_compute_type() != MATMUL_CT) + s->add_dep(inst, DepEvent::DONE); // wait the init/bias seed only + writers[b].push_back(inst); // join; no reset, no co-matmul edge + } else { // REPLACE (normal output; resets the producer set) + writers[b] = { inst }; + } + } + tile->append_instuction(inst); + }; + + // --- SRAM-capacity tracking (buffer-version allocations, sec 10.x) --- + // A coarse tile = one version of its buffer; the fine DMAs that fill it share + // one allocation, freed once all the version's consumers have issued (refcount + // -> 0). NOT reset in flush(): the spad is one physical per-core resource, so a + // buffer reused by the next reduction iter / work-item is a NEW version that + // must wait for the old one to free (WAR / double-buffer). Both DMA-loaded + // buffers AND compute outputs (the accumulator, vector epilogue results) are + // tracked; the virtual SA-weights are not (weight slots model them).
(v1: + // single-core; multi-core would key cur_alloc/vers by (core, buf).) + int64_t next_alloc = 0; + std::map cur_alloc; // buf -> current version id + std::map open_ver; // buf -> version still accepting writes + struct Ver { std::vector> loads, readers; }; + std::map vers; + // Spad bytes per buffer id, taken from the DMA records that touch it (load fills + // its dst, store drains its src) -- the authoritative tile size. A compute output + // (never DMA-loaded but stored) gets its footprint from its store record. Built + // in a pre-pass so it is known before the producing compute is processed. + auto rec_bytes = [](const TraceRec& t) { // single source of the tile footprint + size_t numel = 1; + for (auto d : t.dims) numel *= (size_t)d; + return numel * (t.elem_bits / 8); // NOTE(review): if elem_bits is an integer type this division truncates, so a width below 8 bits would give 0 bytes -- confirm sub-byte types never reach here + }; + std::map buf_bytes; + for (const auto& t : run.trace) { + if (t.kind != TraceRec::DMA) continue; + const auto& bs = (t.dir == 1) ? t.read_bufs : t.write_bufs; // store reads spad, load writes spad + for (int64_t b : bs) buf_bytes[b] = rec_bytes(t); // the last DMA record touching b wins + } + auto sram_on_load = [&](int64_t b, const std::shared_ptr& ld) { + if (!cur_alloc.count(b) || !open_ver[b]) { // a read closed it -> new version + cur_alloc[b] = next_alloc++; + open_ver[b] = true; + vers[cur_alloc[b]] = {}; + } + ld->set_sram_alloc(cur_alloc[b]); + vers[cur_alloc[b]].loads.push_back(ld); + }; + // A compute that freshly produces buffer b (b not read-and-written in place) opens + // a version like a load; the opener carries b's footprint (from buf_bytes). A + // version continues across the producing writes until a consuming read closes it, + // and its last reader frees it (sram_finalize) -- identical lifecycle to a load.
+ auto sram_on_write = [&](int64_t b, const std::shared_ptr& w) { + auto bb = buf_bytes.find(b); + if (bb == buf_bytes.end()) return; // size unknown (never DMA'd) -> untracked + if (!cur_alloc.count(b) || !open_ver[b]) { // a consuming read closed it -> new version + cur_alloc[b] = next_alloc++; + open_ver[b] = true; + vers[cur_alloc[b]] = {}; + w->set_sram_alloc(cur_alloc[b]); + w->set_sram_footprint(bb->second); + vers[cur_alloc[b]].loads.push_back(w); + } + // already-open version (further producing writes): same physical bytes, no re-add. + }; + auto sram_on_read = [&](int64_t b, const std::shared_ptr& rd) { + auto it = cur_alloc.find(b); + if (it == cur_alloc.end()) return; // no version recorded for b (never loaded or produced) -> untracked + vers[it->second].readers.push_back(rd); + open_ver[b] = false; // next write starts a new version + }; + auto sram_finalize = [&]() { // tag only each version's LAST reader + for (auto& kv : vers) { + auto& v = kv.second; + if (v.readers.empty()) { // no consumer -> never freed: untrack + for (auto& ld : v.loads) ld->set_sram_alloc(-1); + continue; + } + v.readers.back()->add_sram_release(kv.first); // it frees the whole version on issue + } + }; + + for (const auto& t : run.trace) { + if (t.kind == TraceRec::TILE_BEGIN) { + // togsim_dispatch opened a work-item -> new subgraph (bound to its core) + + // tile. The scope runs until the matching TILE_END (the dispatch wrapper + // brackets the tile fn call), not until the next begin.
+ flush(); + sg = std::make_shared(); + sg->set_core_id(t.core); + tile = std::make_shared(Tile::Status::INITIALIZED); + cur_tile_group++; + continue; + } + if (t.kind == TraceRec::TILE_END) { + flush(); // close the work-item explicitly (scope = the tile fn call) + continue; + } + if (!tile) continue; // defensive: records outside any TILE_BEGIN..TILE_END scope are dropped + + if (t.kind == TraceRec::DMA) { + int64_t uniq = next_tag++; // fresh Core tag key per dma record + auto inst = make_dma(t, uniq); + inst->set_tile_group(cur_tile_group); + tile->inc_required_sram_size(rec_bytes(t)); // SRAM footprint (ready-tile ordering) + if (t.dir == 1) { // STORE + // store reads the result buffer(s) -> link() JOINs all their writers. + link(inst, t.read_bufs, t.write_bufs); + for (int64_t b : t.read_bufs) sram_on_read(b, inst); // store frees what it drains + } else { // LOAD + tile->append_instuction(inst); + // async load: record it as the CURRENT load for this (tag_id, tag_slot) + // with its fresh uniq; the barriers in this reduction iteration reuse that + // uniq (1 load : N barriers). A new iteration's load overwrites it with a + // new uniq -> distinct tag key, no collision. writers = the dma for now; + // the barrier overwrites it so consumers gate on data arrival. A sync load + // has no barrier and blocks to arrival itself. + if (t.is_async) current_dma[{t.tag_id, t.tag_slot}] = {uniq, inst}; + for (int64_t b : t.write_bufs) { + // No hard WAR edge here: load-buffer reuse (double-buffering, X_spad/ + // W_spad reloaded each reduction iter) is modeled by the SRAM + // version/capacity machinery (sram_on_load), which sizes how many + // versions physically coexist. A latency WAR edge would force + // single-buffering and kill the overlap the spad permits. (The + // accumulator Y is NOT a load buffer -> its cross-tile WAR is handled by + // the REPLACE branch of link() when the next tile's init overwrites it.) NOTE(review): as written, REPLACE only resets writers(b) and adds no edge, and writers is cleared at every flush -- confirm this parenthetical still holds.
+ writers[b] = { inst }; + sram_on_load(b, inst); // occupy spad + } + } + } else if (t.kind == TraceRec::MEMORY_BAR) { + // the explicit async-DMA sync (the original dma_wait). Pair with the CURRENT + // load for this (tag_id, tag_slot), reusing its uniq Core key so the dma and + // bar pair in the tag table; the dma releases the bar at issue-complete + // (a DONE edge), then the bar parks on the tag until data-ready (resp-complete, + // set_tag_finish). Consumers of the loaded buffer then gate on the bar, so + // the bar (not the load) is the load's DONE-handle in writers(b). + auto it = current_dma.find({t.tag_id, t.tag_slot}); + int64_t uniq = next_tag++; // fallback key if unpaired (next_tag advances even when paired; that key is then simply skipped) + std::shared_ptr dma_inst; + if (it != current_dma.end()) { uniq = it->second.first; dma_inst = it->second.second; } + // Identical wait (same slot, same load instance) already has a barrier -> reuse it + // so the buffer's consumers gate on it, instead of emitting a redundant barrier. + auto bf = bar_for_load.find({t.tag_id, t.tag_slot}); + if (bf != bar_for_load.end() && bf->second.first == uniq) { + for (int64_t b : t.write_bufs) writers[b] = { bf->second.second }; + continue; + } + auto bar = make_mem_bar(t, uniq); + bar->set_tile_group(cur_tile_group); + if (dma_inst) dma_inst->add_dep(bar, DepEvent::DONE); + tile->append_instuction(bar); + // the bar is the load's DONE-handle: REPLACE writers(b) with it (no WAR edge -- as for + // the load itself, buffer reuse is left to the SRAM version/capacity machinery). + for (int64_t b : t.write_bufs) writers[b] = { bar }; + bar_for_load[{t.tag_id, t.tag_slot}] = {uniq, bar}; + } else if (t.kind == TraceRec::COMPUTE) { + auto inst = make_compute(t); + inst->set_tile_group(cur_tile_group); + link(inst, t.read_bufs, t.write_bufs); + // in-place buffers (read AND written) are version-transparent (accumulator, + // in-place vector): skip the self-read and the self-write so footprint is not + // double-counted. read_bufs/write_bufs are tiny, so a linear scan beats a set.
+ auto in = [](const std::vector& v, int64_t b) { + return std::find(v.begin(), v.end(), b) != v.end(); + }; + for (int64_t b : t.read_bufs) if (!in(t.write_bufs, b)) sram_on_read(b, inst); // consuming reads + for (int64_t b : t.write_bufs) if (!in(t.read_bufs, b)) sram_on_write(b, inst); // fresh outputs + } + } + flush(); // commit a work-item still open at end of trace (commits nothing if none is open) + sram_finalize(); // readers per version are now final -> tag each version's last reader as its releaser + return tg; +} diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml index 6d2537d9..7fea374b 100644 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml @@ -22,3 +22,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml index ff976784..397f0fb7 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml index 2ed1bb12..f080fc69 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml index 1bcc9bb3..f89661b8 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 8 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml index 39d195b0..ca69d930 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml @@ -28,3 +28,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml index bf01913b..b7b03e7a 100644 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml index 8c71c528..903ffcbc 100644 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml @@ -34,3 +34,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml index d058f188..6a234017 100644 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml @@ -28,3 +28,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml index 019a0f0f..f0546e56 100644 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml @@ -27,3 +27,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml index 348babae..08ec26ac 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml @@ -25,3 +25,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml index a0985aec..a6e073e9 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml index 166e2e25..5436b3e8 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml @@ -29,3 +29,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml index 6119e83d..d928f9d3 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml @@ -30,3 +30,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml index 9100c22a..dd9dfac7 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml @@ -28,3 +28,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml index f46d380e..1593e148 100644 --- a/configs/systolic_ws_8x8_c1_booksim.yml +++ b/configs/systolic_ws_8x8_c1_booksim.yml @@ -23,3 +23,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB. +core_spad_size_kb: 1024 diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml index 1be24b85..b2d16c6a 100644 --- a/configs/systolic_ws_8x8_c1_simple_noc.yml +++ b/configs/systolic_ws_8x8_c1_simple_noc.yml @@ -24,3 +24,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB. +core_spad_size_kb: 1024 diff --git a/docs/design/togsim_cpp_trace.md b/docs/design/togsim_cpp_trace.md new file mode 100644 index 00000000..9565bdfb --- /dev/null +++ b/docs/design/togsim_cpp_trace.md @@ -0,0 +1,1006 @@ +# TOGSim C++ Trace Generation — Design Proposal + +**Status:** Implemented end-to-end through the real timing Core (256^3 GEMM); see +§11 for remaining work. +**Branch:** `feature/togsim-cpp-trace` +**Scope:** Replace the timing-path TOG producer (MLIR → Python-dict → ONNX → C++ +parser) with a compiled, shape-parametric trace producer (MLIR → C++ → `.so`). 
+TOGSim's timing core is preserved. + +**Note on the sync mechanism (read before §3, §5, §9).** An earlier version of +this design synchronized an asynchronous DMA with the consumer that waits on its +data using a compile-time integer `event_id` — one id per static `togsim.dma`/ +`togsim.wait` op, paired through a heap "event buffer" of opaque handles. That +mechanism was *removed*: a single static `togsim.dma` op executes once per loop +iteration, each iteration writing a different runtime tag slot, so one +compile-time id per op cannot represent the per-iteration pairing. The current +design (ABI v11) pairs an async DMA with its sync point by the **runtime tag +slot** instead. Sections below have been rewritten to the runtime-tag model; +where a section still mentions `event_id` / event handles / `togsim_wait` / +`togsim_signal`, it is flagged as the superseded design, not current behavior. + +--- + +## 1. Motivation + +The current Tile-Operation Graph (TOG) pipeline has accumulated structural debt +that blocks where we want to go (notably dynamic shape for LLM decode / MoE): + +1. **"ONNX in name only."** TOG is serialized as ONNX, but every op is a custom + `torchsim_*` attribute. We pay ONNX's costs (rigid schema, protobuf, + stringly-typed attribute encoding) and use none of its interop value + (onnxruntime, standard ops, netron). The schema lives in three places — + Python dict (`extension_op.py`), ONNX (`AsmParser/onnx_utility.py`), C++ + (`TOGSim/.../TileGraphParser`) — and drifts. + +2. **Synchronization is ad-hoc and DMA-specific.** Completion tracking is a + counting-semaphore in disguise, but unnamed and tangled: + - `DMA.h`: `tag_table[subgraph][tag_key] -> uint32` with overloaded magic + values (`0` pending, `1` signaled, `>1` consumed-count, `-1` sparse) plus a + parallel `waiters` wait-queue. 
The `tag_key` is a hand-rolled + content-addressed vector computed from loop indices/strides (`calc_tag`), + with implicit fallbacks (push `0` when an index is missing, dedup by + silently `continue`-ing). + - A *second*, separate dependency mechanism — `Instruction::ready_counter` + + `child_inst` graph edges — handles structural ordering. + - Net: one concept ("an async op completed; a consumer may proceed") is + expressed two different ways, and the event-like one only works for DMA. + +3. **Static shape is baked in.** `build_tog._affine_for_bounds` resolves loop + bounds to constants (`_const_index_value`). The graph is fully materialized + per static shape, so dynamic shape forces recompile-per-shape — pathological + for decode (a new `seq_len` every step) and MoE (variable expert load). + +4. **Loop-flattening hackery.** Much of the roughness (`loop_end` tricks, + `calc_tag`, dedup-by-skip, magic offsets) exists only to flatten loop nests + into a static graph. + +See [Appendix A](#appendix-a-current-state-references) for file:line references. + +## 2. Key idea: trace-driven → execution-driven + +Instead of materializing a flattened graph, **TOG becomes a stream emitted by +*running* a shape-parametric producer.** The producer is C++ compiled from the +kernel's MLIR; it keeps loops as loops (with symbolic bounds) and calls a small +**event-based API**. Each API call emits one trace record = one modeled +instruction. TOGSim `dlopen`s the producer `.so`, injects a callback context +that records and times the stream. + +This directly resolves the four problems: + +| Problem | Resolution | +|---|---| +| ONNX-in-name-only / 3-place schema | The API signature is the single contract. No ONNX. | +| DMA-only, ad-hoc sync | An async DMA and the consumer that waits on its data are paired at runtime by the tile's tag slot, through the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). 
The DMA signals the tag when its data arrives; an explicit `togsim.memory_barrier` waits on it and becomes the last-writer of the loaded buffer, so consumers gate on data arrival. No content-hashed `calc_tag`, no magic values. | +| Static shape | Loop bounds flow from MLIR as-is; symbolic bounds become native loop bounds in C++, so trip count is dynamic. | +| Loop-flatten hacks | Loops stay loops; the trace is generated by executing them. `calc_tag`/dedup disappear. | + +It is *not* a dynamic hardware scheduler: control flow is still statically +emitted by the compiler. The `.so` is a deterministic **trace generator**, not a +timing model — it keeps the trace-as-data boundary, so TOGSim's timing core is +untouched. + +## 3. Core algebra + +Small, orthogonal primitives. Everything else is composition (Layer-1 helpers +like `double_buffered_loop`, not IR primitives). + +- `dma(dir, arg_id, offset, shape, is_async, tag_id, tag_slot, …)` — + `dir ∈ {LOAD, STORE}`. Returns void. A **synchronous** (non-async) DMA is + blocking: it finishes when its data arrives, and consumers depend on it + directly. An **async** DMA returns control immediately and signals its tag at + data arrival (DMA response-complete); a later `memory_barrier` is the explicit + point that waits on it. +- `compute(tile_id, dims…)` — references a fixed-size tile kernel; cost is looked + up (§6), not computed here. +- `memory_barrier(tag_id, tag_slot, write_bufs)` — the explicit async-DMA sync. + It waits until the async DMA carrying the same `(tag_id, tag_slot)` has + delivered its data, then becomes the last-writer of the loaded buffer so + consumers gate on data arrival. It is the original `memref.dma_wait` mapped + through from the source IR, not a synthesized barrier. +- `compute_barrier()` — a compute fence inserted before a store, so the store + sees the drained accumulator. 
This is the **one** remaining auto-inserted + barrier; it is marked FIXME in the code as something that should also become + explicit in the source IR later (§10.7.3). +- **Control flow lives in the producer** — ordinary `for`/`if`/`while` with + runtime bounds. Loop types (normal/parallel/accumulation/inner) and dynamic + shape are just producer loops; the emitted trace is already specialized. + +Two distinct things share the word "tag", and the design uses **both together** +as the dma↔barrier pairing key: + +- **`tag_id`** — the identity of a DMA's *tag memref*. It plus the runtime + `tag_slot` index identifies which async DMA a `memory_barrier` is waiting on. +- **`tag_slot`** — the SRAM tile slot the loaded tile occupies (the + double-buffer / SRAM-capacity index). It is *also* part of the pairing key + because each load's tile maps to its own slot. The slot is **subtile-only**: + `lower_to_vcix` writes the dma_wait tag index with a `-acc_iv` term for each + accumulation (reduction) loop var — a sentinel marking the reduction axis, not + an arithmetic offset — and `build_skeleton` strips those terms so a + `memory_barrier` waits on the same slot its async load wrote. (Mirrors legacy + `TileGraphParser`, which skips stride -1; reduction iterations are told apart + by the per-iteration tag alloc + a fresh per-record Core key in the bridge, not + by the slot.) Without the strip, the producer evaluates `-acc_iv` to a negative + slot at reduction iteration > 0 and the pairing fails on subtile + multi-tile-K. + +Pairing is done at runtime by the existing TOGSim Core tag table: the async DMA +calls `prepare_tag_key` and `set_tag_finish` (signal at data arrival), the +`memory_barrier` calls `register_tag_waiter` (wait on `(tag_id, tag_slot)`). +A synchronous DMA needs no barrier — it blocks until data arrival itself. 
+ +> **Superseded.** An earlier version used a neutral `event` completion token +> (freely allocated, not tied to memory) with `signal`/`wait`/`wait_all` +> primitives. That has been removed in favor of the runtime-tag mechanism above. + +## 4. Decisions (locked) + +| Axis | Decision | +|---|---| +| Input MLIR | Use the **given MLIR as-is**. Do not touch inductor / MLIR templates / shape plumbing. Whatever bounds the MLIR carries (const or symbolic) pass through verbatim. | +| MLIR → C++ | **EmitC dialect + `mlir-translate --mlir-to-cpp`** (upstream). | +| `.so` ↔ TOGSim | **`dlopen` + `EmitCtx` callback** (execution-driven). The ABI boundary is the main design surface. | +| `.so` role | **Timing trace only.** Functional correctness stays on the existing Spike/LLVM path. Strip every op without a timing dependency; keep loop skeleton + API ops + ops feeding bounds/addresses. | +| Compute cycle | A **separate annotation pass** reuses the existing **sample-mode** to produce a **precomputed `tile_id → cycle` table**, looked up at runtime. | +| Dynamic shape | Falls out of symbolic loop bounds in the MLIR. Per-tile cost is static (tiles are fixed-size); only trip count is dynamic. | + +## 5. Architecture + +### 5.1 Artifacts (per kernel) + +- **Trace `.so`** — compiled from the skeleton+API MLIR. Shape-parametric: + symbolic bounds become C++ function parameters. Calls the runtime API + (`togsim_dma`, `togsim_compute`, `togsim_memory_barrier`, …). +- **Cycle table** — `tile_id → cycle`, produced by the annotation pass. + +### 5.2 Pipeline (input = given MLIR) + +``` +given MLIR (affine/scf.for + memref.dma_start/dma_wait + vcix/vector compute) +│ +├── Branch A (trace): +│ C2 build_skeleton pass (reuse build_tog traversal) +│ • affine/scf.for kept, bounds as-is (symbolic preserved) +│ • dma_start → togsim.dma(... 
tag_id, %tag[%idx], is_async) +│ • dma_wait → togsim.memory_barrier(tag_id, %tag[%idx], write_bufs) +│ • compute block → togsim.compute(tile_id, dims) +│ • DCE: drop ops with no dependency to loop/address/API operands +│ → C4 togsim→emitc lowering (togsim.* → emitc.call_opaque; +│ convert-scf/arith-to-emitc; func args incl. symbolic shapes) +│ → mlir-translate --mlir-to-cpp +│ → C5 compile → trace .so (cached by kernel key) +│ +└── Branch B (cost): + C3 annotation pass over the same MLIR + • extract per-tile compute bodies, assign tile_id + • run through existing sample-mode → tile_id → cycle table + +TOGSim (C6): + dlopen(trace.so) → resolve togsim_kernel + inject EmitCtx { tag table; record sink; cost = cycle_table[tile_id] } + togsim_kernel(ctx, runtime_shape_args...) // producer runs, emits stream + → existing timing core consumes the recorded Instruction stream +``` + +### 5.3 Components + +- **C1 — `togsim` API op vocabulary.** `togsim.dma(...)` (void result, carrying + `tag_id`, the runtime tag-index operand, `is_async`), + `togsim.memory_barrier(tag_id, tag_slot, write_bufs)`, + `togsim.compute(tile_id, dims)`, `togsim.compute_barrier`. Kept *unregistered* + (like the existing `togsim.transfer`), so no C++ dialect registration; the + togsim→emitc step is a custom Python rewrite, not a registered ConversionPass. +- **C2 — `build_skeleton` pass.** Sibling to `build_tog.py`, reusing its + traversal (matmul FSM, `_dma_start_fields`, loop typing). Emits the + skeleton+API MLIR instead of TOG nodes; preserves `is_async`. The original + `memref.dma_wait` is mapped through to an explicit `togsim.memory_barrier` + carrying the DMA's `tag_id` and the runtime tag-index operand. +- **C3 — annotation pass + cycle table.** Reuses sample-mode to sample the + deterministic per-tile cycle; emits the `tile_id → cycle` table artifact. 
+- **C4 — togsim→emitc lowering.** Maps each `togsim.*` op to an + `emitc.call_opaque "togsim_*"`; lowers control flow via `convert-scf-to-emitc` + / `convert-arith-to-emitc`; func arguments (including symbolic shapes) become + C++ parameters. Then `mlir-translate --mlir-to-cpp`. +- **C5 — `.so` build.** Compile emitted `.cpp` + `togsim_runtime.h` to `.so` + via the existing toolchain; cache by kernel key. +- **C6 — TOGSim runtime + loader.** `togsim_runtime.h/.cc`: `EmitCtx` and the + `togsim_dma/compute/memory_barrier/compute_barrier/core_alloc` + implementations (compute looks up the cycle table). Loader `dlopen`s the + `.so`, calls `togsim_kernel` with runtime shape args, records the stream, feeds + the existing timing core. An async DMA and its `memory_barrier` are paired at + runtime by `(tag_id, tag_slot)` through the existing Core tag table. + +### 5.4 ABI sketch (current: v11) + +```c +// togsim_runtime.h — shared contract between emitted .cpp and TOGSim +typedef struct EmitCtx EmitCtx; + +void togsim_dma(EmitCtx*, int32_t dir, int32_t arg_id, uint64_t offset, + int32_t ndim, const int64_t* dims, const int64_t* strides, + int32_t elem_bits, int32_t is_async, + int32_t tag_id, uint64_t tag_slot, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write); + +void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t tag_slot, + const int64_t* write_bufs, int32_t n_write); + +void togsim_compute(EmitCtx*, uint64_t tile_id, int32_t compute_type, /* dims */ ...); +void togsim_compute_barrier(EmitCtx*); +int32_t togsim_core_alloc(EmitCtx*); + +// entry point the loader resolves: +void togsim_kernel(EmitCtx*, int64_t* shape_args, int32_t n_shape_args); +``` + +`togsim_dma` returns void (no handle). An async DMA carries `(tag_id, tag_slot)`; +the matching `togsim_memory_barrier` waits on the same pair through the Core tag +table. 
The symbols are resolved as free `extern "C"` functions: the loaded `.so` +links back into the Simulator binary (built with `ENABLE_EXPORTS`). + +> **Superseded.** v2–v10 evolved through a `togsim_event` handle type with +> `togsim_dma` returning a handle and `togsim_wait`/`togsim_signal`/ +> `togsim_wait_all` plus `togsim_event_alloc`/`togsim_event_free`. v11 removed +> all of those; see the note at the top of this doc and §9.6.1. + +## 6. Compute cost model + +The annotation pass (C3) reuses **sample-mode** to measure each tile's +deterministic cycle once and stores a **precomputed `tile_id → cycle` table**. +`togsim_compute` looks it up at runtime. + +This is consistent with dynamic shape because **tiles are fixed-size** +(`TILE_M/N/K`): the per-tile cycle is invariant; only the *number* of tiles +(loop trip count) varies, and that is handled by the symbolic loop in the `.so`. + +**Open edge case — remainder tiles.** When a dimension is not divisible by the +tile size, edge tiles are partial and have a different cycle than the table +entry. Options: pad to full-tile cost (simple, small error) vs. sample a +separate `tile_id` for the remainder. Decided at P4. + +## 7. Milestones + +- **P0** — DONE. New branch; runtime API header (C6 surface) + `togsim` op + vocabulary (C1). +- **P1** — DONE. `build_skeleton` pass (C2) on a matmul kernel; verified against + the legacy `build_tog` TOG. The async DMA's `memref.dma_wait` is mapped through + to an explicit `togsim.memory_barrier` carrying the DMA's `tag_id` and the + runtime tag-index operand; the IR verifies across sibling prefetch/compute loop + nests because the pairing is by runtime tag slot, not a cross-region SSA edge. +- **P2** — DONE. togsim→emitc (C4) + `mlir-translate` + compile (C5) → `.so` for + that kernel (static shape). 
C4 rewrites the unregistered `togsim.*`/signature + then drives the upstream `lower-affine`/`convert-*-to-emitc` passes, with a + small fold for residual `emitc.for` bound casts (see §8). Base addresses + stubbed to 0 (wired in P3). +- **P3** — DONE. TOGSim loader + runtime (C6) + cycle table (C3); runs end-to-end + through the real Simulator/Core (256^3 GEMM via `--trace_so`). Parallelism / + reduction / core dispatch design is locked in **§9** (core-transparent work + function + `togsim_core_alloc` hook). Async DMA↔consumer sync is the runtime + tag-slot mechanism (`togsim.memory_barrier`), not an event-id. +- **P4** — Symbolic bounds end-to-end on a decode-style kernel; verify trace + length scales with runtime shape; decide remainder-tile handling. +- **P5** — Migrate remaining op families (conv, SDPA, vector). + +## 8. Risks / open questions + +- **Remainder tiles vs. precomputed table** (§6) — P4. +- **ABI versioning** — RESOLVED. Free `extern "C"` symbols (the `.so` links back + into the Simulator binary via `ENABLE_EXPORTS`); `TOGSIM_ABI_VERSION` is v11. +- **togsim→emitc for unregistered ops** — must be a custom rewrite to + `emitc.call_opaque`, since unregistered ops have no registered conversion + patterns. +- **EmitC coverage** — RESOLVED (P2). C4 uses the upstream conversion passes + (`lower-affine`, `convert-scf-to-emitc`, `convert-arith-to-emitc`, + `convert-func-to-emitc`). One gap in this LLVM 20 build: + `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so + `convert-arith-to-emitc` leaves `builtin.unrealized_conversion_cast` on the + bounds (`emitc.size_t`↔`index`) that `--reconcile-unrealized-casts` cannot + fold and `mlir-to-cpp` cannot print. C4 adds a small post-pass + (`_retype_for_to_size_t`) that retypes each `emitc.for` to `!emitc.size_t` + bounds + IV (`emitc.for` accepts size_t with the explicit type) and folds the + residual index<->size_t casts. 
A size_t IV also makes the lowered *address* + arithmetic cast-free, which is what lets P3 wire real addresses (approach A): + `togsim_dma` passes `(arg_id, element offset)` where the offset is computed + from the loop IVs and lowered by `convert-arith-to-emitc`. +- **async/fire-and-forget** — `is_async` preserved on `togsim.dma`. An async DMA + signals its tag at data arrival; a sync DMA is blocking. A DMA with no matching + `memory_barrier` is fire-and-forget (nothing waits its tag). + +## 9. P3 design: parallelism, reduction, and core dispatch (locked) + +How the trace producer expresses *which core runs what*, *what is parallel*, and +*what is a reduction* (cross-iteration dependency). This is the design for P3. + +### 9.1 Where the semantics come from + +Nothing new has to be inferred — the post-vcix `affine.for` already carries the +mapping decision the frontend made, and `build_skeleton` preserves it: + +| attribute | meaning | role | +|---|---|---| +| `outer_loop` | PARALLEL axis (e.g. GEMM m, n) | independent output tiles -> distributable across cores | +| `accumulation_loop` | REDUCTION axis (e.g. GEMM k) | partial sums into one output tile -> ordered dependency | +| `inner_loop` | tile micro-loop | within one tile | + +This matches what legacy TOGSim already does with `torchsim_loop_type` +(`TileGraphParser`: PARALLEL -> `outer_loop_idx` selects a core; ACCUMULATION -> +`accum_tag` groups dependent partials). The current gap is only that +`lower_to_emitc` (P2) *drops* these attributes when it lowers `affine.for` to +`emitc.for`, producing a flat single-stream producer. + +### 9.2 Principle: bake intrinsic, parameterize extrinsic + +Two different kinds of hardware dependence must be treated differently: + +- **Intrinsic** (vlane / vector width, `TILE_M/N/K`, systolic size) — defines the + *content and cost of each instruction*. Already baked into the IR; correct. 
+- **Extrinsic** (`num_cores`) — defines only the *distribution* of an otherwise + fixed set of work-items. The tile set, the per-tile cost table + (`tile_id -> cycle`), and the DMA tile shapes are all `num_cores`-invariant. + +Therefore `num_cores` is **not** baked into the producer. The producer is +**core-count transparent**: it knows nothing about how many cores exist. + +### 9.3 Model: core-transparent work function + dispatch hook + +The producer is two functions, split at the PARALLEL/ACCUMULATION boundary: + +```c +// WORK: trace for ONE independent output tile. Core-transparent: takes the +// PARALLEL indices directly, names no core. Reduction (k) is program order -> +// the dependency is implicit (the accumulator is core-local). An async load is +// synced to its consumer by an explicit memory_barrier on the same tag slot. +void togsim_kernel_tile(EmitCtx* ctx, int64_t mi, int64_t ni, int64_t* shape) { + togsim_core_alloc(ctx); // first line: new work-item + pick core + togsim_compute(ctx, /*tile_id=*/0, ...); // acc init + for (size_t ki = 0; ki < KT; ++ki) { // REDUCTION = program order + togsim_dma(ctx, LOAD, A, offA(mi,ki), ..., /*is_async=*/1, /*tag_id=*/0, ki%D, ...); + togsim_dma(ctx, LOAD, B, offB(ki,ni), ..., /*is_async=*/1, /*tag_id=*/1, ki%D, ...); + togsim_memory_barrier(ctx, /*tag_id=*/1, ki%D, ...); togsim_compute(ctx, 1, ...); + togsim_memory_barrier(ctx, /*tag_id=*/0, ki%D, ...); togsim_compute(ctx, 2, ...); + } + togsim_dma(ctx, STORE, C, offC(mi,ni), ...); +} + +// DISPATCH: enumerate the PARALLEL domain, one call per work-item. +extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape, int32_t n) { + size_t MT = shape[0]/256, NT = shape[1]/256; + for (size_t mi = 0; mi < MT; ++mi) + for (size_t ni = 0; ni < NT; ++ni) + togsim_kernel_tile(ctx, mi, ni, shape); +} +``` + +Reduced to three orthogonal concepts: + +- **Parallel** = each `togsim_kernel_tile` call is an independent work-item (no + tags shared across calls).
TOGSim is free to place it on any core. +- **Reduction** = ordering *inside* one work-item: program order on its core + (no explicit barrier). The `memory_barrier`/tag-slot mechanism is only the + async-DMA → consumer data sync. +- **Core assignment** = `togsim_core_alloc(ctx)` (a runtime callback, body in + TOGSim) marks the work-item boundary and binds the following ops to a chosen + core. The producer never sees `core_id`/`num_cores`; those live only in + TOGSim's dispatch policy (round-robin / blocked / cost-aware via the cycle + table). + +The boundary callback lives at the start of each work-item; it cannot be folded +away because TOGSim cannot intercept the producer-internal work-function call -- +only `togsim_*` callbacks are visible across the `dlopen` boundary. + +> FINAL API (supersedes the `togsim_dispatch` naming used below): the boundary + +> core binding is **`int32_t togsim_core_alloc(EmitCtx*)`** (header v6). The +> producer calls it at each work-item start; the **runtime owns the core pool** +> and round-robins -- `num_cores` is NEVER baked into the producer (it is purely +> a runtime quantity). There is **no free**: a core is an assignment, not a held +> resource; the next `togsim_core_alloc` starts the next work-item. The returned +> id is discarded by the producer. This keeps the producer core-count transparent +> while making the core mapping an explicit runtime allocation. Wherever the text +> below says `togsim_dispatch`, read `togsim_core_alloc`. + +### 9.4 Codegen (lower_to_emitc) and ABI deltas + +- `lower_to_emitc` splits the loop nest at the PARALLEL/ACCUMULATION boundary + into two `emitc.func`: the PARALLEL loops become `togsim_kernel` (dispatcher, + passing the loop indices as args); the ACCUMULATION+INNER body becomes + `togsim_kernel_tile`, with `togsim_core_alloc(ctx)` inserted at its entry. 
+- ABI additions in `togsim_runtime.h`: `int32_t togsim_core_alloc(EmitCtx*)` + (runtime owns the core pool; no `num_cores` in the producer; no free). + `togsim_kernel_tile` may stay internal (`static`) for now; export it only if a + future loader wants to own the parallel enumeration (which would also need a + `num_tiles`-style count — not required now). +- `tile_id -> cycle` table unchanged (num_cores-invariant). + +> Implementation status (P3, ABI v12): `lower_to_emitc` OUTLINES the innermost +> PARALLEL-loop body into a uniform `togsim_kernel_tile(ctx, iv, n)` func and the +> dispatcher loop hands it to `togsim_dispatch(ctx, fn, iv, n)` -- a higher-order +> runtime wrapper that round-robins a core and brackets the call with +> TILE_BEGIN/TILE_END. The work-item SCOPE is now the function call itself (not an +> implicit "ops until the next core_alloc" range), and one general dispatcher +> serves every kernel (uniform iv-array ABI). Earlier this was a single +> `togsim_kernel` with a bare `togsim_core_alloc` marker; the emitted *trace* is +> identical (one work-item bracket, then the work ops), so cycles are unchanged -- +> the outline was done to make the boundary explicit, not for timing. Address +> arithmetic is wired (approach A): each `togsim_dma` passes `(arg_id, element +> offset)` with the offset computed from the loop IVs (lowered by +> `convert-arith-to-emitc`, cast-free thanks to the size_t IV retype); the runtime +> adds the tensor base. The parallel IVs reach the tile fn through the iv array. + +### 9.5 Stance and the split-K exception + +This refines the design's "not a dynamic scheduler / static control flow": +**per-work-item trace is static and deterministic; only the work-item -> core +binding is dynamic** (decided by `togsim_core_alloc`). That is independent-task +distribution, not data-dependent control flow, and it matches a real tile +scheduler more closely. 
+ +The transparent model holds while work-items are independent (data-parallel over +output tiles). **Split-K** (a reduction split *across* cores) breaks +independence: the producer must emit `c` partials + a combine, so the +instruction stream then depends on `num_cores`, and the cross-core dependency +must be a real dataflow edge (not program order). Treat split-K as a deliberate, +scoped exception — start P3 with data-parallel only. + +### 9.6 Work-items form a DAG (barriers, cross-parallel reduction) + +Work-items are not always a flat independent set. When there is a computation +*between* parallel loops (e.g. an op at the m-level after the inner n parallel +loop), it can only run once the inner parallel region completes — a join / +barrier: + +``` +parallel for m: + parallel for n: A(m,n) # leaf work-items + B(m) # join: needs all n of this m +``` + +This needs **no new primitive**: it is the same dataflow-edge mechanism the trace +already uses (§10), just at work-item granularity. The join op declares the +leaves' output buffers as its inputs, so the bridge makes it depend on every leaf +through the last-writer-per-buffer analysis: + +``` +parallel for m: + parallel for n: A(m,n) // each writes a tile of m's intermediate buffer + B(m) // reads that buffer -> depends on all n of this m +``` + +So the general picture: **work-items form a DAG; edges are buffer producer → +consumer dependencies.** The independent data-parallel case is the degenerate +edge-less DAG; barriers, reduction-across-a-parallel-axis, and split-K are the +same DAG with real dataflow edges. (Async-DMA data arrival is the one edge that +needs an explicit `memory_barrier` on the tag slot, because the buffer write +completes only at DMA response-complete, later than the producing op's +issue — see §10.7.4.) + +> **Superseded.** An earlier version expressed these joins with a per-leaf +> completion `event` plus `togsim_wait_all`. 
Those primitives were removed; joins +> are now ordinary buffer dependencies in the dataflow DAG (§10). + +### 9.6.1 How a barrier finds its DMA: runtime tag-slot pairing (locked) + +How the explicit `togsim.memory_barrier` (lowered from `memref.dma_wait`) finds +*which* `togsim.dma` instance's data it must wait for. The hard case is a +reduction loop: one static `togsim.dma` op executes once per iteration, each +iteration loading a different tile into a different runtime tag slot. The pairing +must therefore key on a *runtime* value, not a compile-time one. + +The locked model: pair by the **runtime tag slot**, using the existing TOGSim +Core tag table. + +- **A DMA carries `(tag_id, tag_slot)`.** `tag_id` is the compile-time identity + of the DMA's tag memref (which logical channel — e.g. A-load vs B-load). + `tag_slot` is the *runtime* tag index `%tag[%idx]`, i.e. the SRAM tile slot + the loaded tile occupies this iteration. Together they uniquely name this + iteration's load. +- **An async DMA signals; the barrier waits.** At DMA response-complete (the + moment data has actually arrived in SRAM), the runtime calls + `set_tag_finish(tag_id, tag_slot)`. The matching `togsim.memory_barrier` + carries the same `(tag_id, tag_slot)`; it calls `register_tag_waiter` and is + woken at that signal. The barrier then becomes the **last-writer** of the + loaded SRAM buffer (`write_bufs`), so every consumer that reads the buffer + gates on data arrival through the ordinary dataflow-edge analysis (§10). +- **A synchronous DMA needs no barrier.** It is blocking — it finishes at data + arrival itself, and consumers depend on it directly. +- **Reduction iterations do not collide.** Because `tag_slot` is the runtime + index, iteration `i`'s DMA and iteration `i`'s barrier share a slot that is + distinct from (or correctly reused after) other iterations — exactly the + per-iteration pairing a compile-time id could not express. 
The + double-buffer/pipeline depth is the slot's lifetime, owned by the Core's tag + table. + +**What this drops vs legacy `tag_table`:** no `calc_tag` content-hash, no magic +values (`0`/`1`/`-1`/`>1`), no FIFO, no in-order assumption. The pairing key is +`(tag_id, tag_slot)`, both carried explicitly on the trace ops. + +> Status: IMPLEMENTED (ABI v11). `build_skeleton` maps `memref.dma_wait` to +> `togsim.memory_barrier` and tags `togsim.dma` with `tag_id` + the runtime +> tag-index operand; `lower_to_emitc` lowers both; the runtime pairs them via +> `prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`. Verified bad=0 on the +> 256^3 GEMM. (All current fixtures have tag memref size 1, i.e. single-buffer; +> deeper double-buffer pipelines exercise more slots but use the same key.) +> +> **Superseded.** ABI v5–v10 used a dynamically minted `togsim_event` handle +> parked in a heap "event buffer" (`togsim_event_alloc`/`togsim_event_free`), +> with `togsim_dma` returning the handle and `togsim_wait(handle)` consuming it. +> That mechanism — and the earlier static `event_id` it replaced — could not +> represent per-iteration reduction pairing and was removed in v11 in favor of +> the runtime tag slot above. + +### 9.7 Execution / simulation model: trace generation (not co-execution) + +The producer is a **pure trace (DAG) generator**: running its loops *emits* the +ordered op stream + dependency edges. It never computes cycles, models hardware, +or schedules. Two consequences pin the model: + +- **What is an edge vs. what blocks.** Data dependencies (buffer producer → + consumer edges, plus the async-DMA `memory_barrier` on its tag slot) are + recorded *edges* — the producer does not block on them. The only thing that + ever blocks the producer is *resource backpressure* (finite cores, + double-buffer / SRAM slots, DMA-queue depth), and that is pure flow control, + not timing semantics. 
+- **Cores, double-buffering, DRAM/NoC are the timing core's job — reused, not + reimplemented.** TOGSim's timing core already models all of this when it + consumes the legacy TOG (Appendix A: `tag_table` double-buffer sync, + `num_cores`). The producer stays oblivious; depths/counts are consumer-side + config. + +Consumption is staged via a swappable **sink** behind the callbacks, so the +choice does not touch the producer or the ABI: + +| | sink | threads | when | +|---|---|---|---| +| **P3** | *materializing* — callbacks append to the timing core's input; reuse its existing scheduler/timing | none | static shape; like-for-like cycle-equivalence vs `build_tog` | +| **P4+** | *streaming* — callbacks push to a bounded queue; the producer runs as a fiber/coroutine and blocks on backpressure; the DES loop advances time, frees resources, resumes it | producer fiber | only when dynamic-shape trace size makes full materialization impractical | + +This is **not** timing co-execution: even the streaming sink only blocks the +producer on resource flow-control, never on timing-resolved data events. It is +the lazy/streamed realization of the same trace model. Decision: **do P3 with +the materializing sink (no threads); defer streaming to P4 as a sink swap.** The +single forward-compat requirement is that the callback sink is an interface. + +### 9.8 P3 task list + +1. DONE. `togsim_runtime.h` + `togsim_runtime.cc`/`togsim_loader.h`: C6 runtime + (`EmitCtx`) + `dlopen` loader (`run_producer`), materializing sink. Callees: + `togsim_core_alloc` (runtime core pool), `togsim_dma` (records a tile load/ + store, signals its tag at data arrival), `togsim_compute` (cycle-table lookup), + `togsim_memory_barrier` (waits the matching `(tag_id, tag_slot)`), + `togsim_compute_barrier`. +2. DONE (single-buffer). 
`lower_to_emitc`: OUTLINES the work-item body into + `togsim_kernel_tile(ctx, iv, n)` + a `togsim_dispatch` call at the work-item + boundary (ABI v12; was a bare `togsim_core_alloc` marker), lowers + `togsim.memory_barrier`, and reads `loop_type`. (Two-function outline DONE; + trace identical.) +3. DONE. Real tile addresses wired (approach A): build_skeleton keeps the DRAM + index operand on `togsim.dma`; lower_to_emitc passes `(arg_id, offset)` and + `convert-arith-to-emitc` lowers the offset (size_t IV retype makes it + cast-free). Verified on 1024^3 GEMM: per-tile offsets are correct + (A[m,k]=m*1024+k, B[k,n]=k*1024+n). +4. PARTIAL. C3 cycle table: `cycle_table.py` builds `tile_id -> (cycle, + overlapping_cycle)` from a per-tile `cycle_list`, with `overlapping_cycle = + max(cycle - offset[type], 0)` (the legacy formula) and a JSON sidecar dump. + Remaining (folds into task 5): feed it the gem5 sample-mode `cycle_list` + already computed in `extension_codecache` (reused -> both paths stay + cycle-consistent), and have `togsim_compute` set BOTH cycle and + overlapping_cycle on the Instruction. +5. PARTIAL. C6 runtime + loader: `TOGSim/src/togsim_runtime.cc` + + `togsim_loader.h` implement the producer ABI and `run_producer` -- dlopen the + `.so`, run `togsim_kernel` against an `EmitCtx`, and record a `TraceRec` stream + (the materializing sink): each dma resolves `base[arg_id] + offset*elem_bytes` + and signals its tag at data arrival, each compute looks up the cycle table, + core_alloc round-robins the core. Verified standalone on the 256^3 GEMM: + addresses/cycles resolved correctly. DONE (sec 10, 10.7.4): the recorded + stream is fed into the existing timing core (Core/Simulator) -- TraceRec maps + to `Instruction` (compute_cycle + overlapping_cycle, dataflow-buffer deps + + runtime-tag barriers). 
+ +Legacy path: the ONNX-TOG producer (`run_tog` -> `tog_generator` -> ONNX -> +C++ `TileGraphParser`) is marked DEPRECATED in place (comments in +`extension_codecache.py` and `tog_generator.py`) but kept live -- it must not +break during the transition. It is retired only once this trace pipeline is +stable. The cycle measurement (`cycle_list`, `x_offset`/`w_offset`) is shared, +so the two paths stay cycle-consistent meanwhile. + +### 9.9 Task-5 completion roadmap: TraceRec -> Core (DONE; see §10) + +> **Status: implemented.** This roadmap is retained for context. The dependency +> model it sketches (a per-`togsim_wait`-handle RAW edge) was *superseded* during +> implementation by the explicit dataflow-DAG model in §10: edges come from SRAM +> last-writer-per-buffer plus the vcix preload/matmul FSM, and async-DMA data +> arrival is gated by an explicit `togsim.memory_barrier` paired on the runtime +> `(tag_id, tag_slot)` (§10.7.4) — not by a returned event handle. Read the +> bullets below as the original target shape, with that one substitution. + +Grounded by reading `Instruction.h`, `Core.cc`, `TileGraphParser.h/.cc`, +`Simulator.cc`. + +**Target architecture (legacy, reused):** `ONNX -> TileGraphParser -> TileGraph +(TileLoopNode / TileMemoryNode / TileMemoryWaitNode / TileComputeNode) -> +Simulator distributes Tiles to Cores -> Core runs Instructions`. We replace only +the front: build the same `TileGraph` / `Instruction`s from the recorded +`TraceRec` stream, then hand it to the existing `Simulator`. + +**Mapping (TraceRec -> Instruction):** `Instruction(opcode, compute_cycle, +num_parents, dram_addr, tile_size, tile_stride, elem_bits, tag_idx_list, +tag_stride_list, accum_tag_idx_list)`; `ready_counter = num_parents`. +- DMA load/store -> `MOVIN`/`MOVOUT`: `dram_addr = TraceRec.addr`, `tile_size`/ + `tile_stride`/`elem_bits` from the dma, `tag_idx_list = {tag_slot}` (the + SRAM-slot key), `is_async` set. compute_cycle 0. 
+- COMPUTE -> `COMP`: `compute_cycle = TraceRec.cycle`, + `set_overlapping_cycle(TraceRec.overlapping)`, `set_compute_type(...)`. +- Dependency (RAW): a compute depends on its loads through the SRAM + last-writer-per-buffer analysis (§10); for an async load the last-writer is the + `togsim.memory_barrier` paired on the load's runtime `(tag_id, tag_slot)`, so + the compute's `ready_counter` only clears once the data has arrived (§10.7.4). +- SRAM double-buffer / capacity (WAR): the existing Core enforces it through the + tag mechanism (`register_tag`/`set_tag_finish`/`mark_tag_used`, DMA.h) keyed by + `tag_idx_list`; our `(arg_id, tag_slot)` is that key. Reduction grouping -> + `accum_tag_idx_list` (the accumulation-loop index). + +**Build/wiring:** compile the bridge into TOGSim (it needs the conan deps; +include flags are in `TOGSim/build/compile_commands.json`, notably +`-D_GLIBCXX_USE_CXX11_ABI=0` and the `/root/.conan/data/{robin-hood,spdlog,fmt, +yaml-cpp,boost}` include dirs). Add `togsim_runtime.cc` + the bridge to +`TOGSim/CMakeLists.txt`. Either (a) build `TileGraph`/`Tile` nodes from TraceRec +(maximal reuse of `Simulator`'s tile distribution + Core), or (b) build the +`Instruction` DAG directly and drive a single Core. (a) is closer to legacy and +gives multi-core for free. + +**Cycle-table feed:** reuse the gem5 `cycle_list` already computed in +`extension_codecache` (so both paths stay cycle-consistent); pass it + +`x_offset`/`w_offset` to `cycle_table.build_cycle_table`, dump the sidecar, and +have the loader populate `EmitCtx.cyc/ovl`. + +**Validation:** same post-vcix fixture through both paths; compare the +`Simulator`'s total cycles / DRAM traffic. Start with the 256^3 GEMM (static +shape, single-buffer), then multi-tile / double-buffer kernels. 
+ +This is a focused C++ integration (TOGSim build + TileGraph construction), not a +small increment -- best executed as its own push; all the producer-side inputs +(addresses, cycles, handles, core, tag_slot) are already in the trace. + +## Appendix A: current-state references + +- `TOGSim/include/DMA.h:27-115` — `tag_table` (overloaded `0/1/-1/>1`) + + `waiters`; `register_tag` / `set_tag_finish` / `register_tag_waiter` / + `mark_tag_used` (= init / signal / wait / consume). +- `TOGSim/src/Core.cc:118-140, 214-324` — async-DMA signal path and the `BAR` + wait/consume path over the tag table. +- `TOGSim/include/Instruction.h:40-48, 104-117` — `ready_counter` / `child_inst` + (the second, separate dependency mechanism) and the tag fields. +- `PyTorchSimFrontend/mlir/passes/build_tog.py` — `TogBuilder.print_operation` + dispatch (`affine.for` / `memref.dma_start` / `memref.dma_wait` / `vcix.*`); + `_affine_for_bounds` (constant-bound resolution → static shape). +- `PyTorchSimFrontend/mlir/passes/__init__.py`, + `PyTorchSimFrontend/mlir/passes/lower_to_llvm.py` — in-process Python MLIR pass + orchestration via the bindings; the functional Spike/LLVM path (unchanged). +- `PyTorchSimFrontend/mlir/mlir_gemm_template.py` — kernel template emitting the + `affine.for` nest + `linalg.matmul` + `togsim.transfer` DMA ops. + +## 10. Explicit dependency-edge trace (revised dependency model) + +Supersedes the in-order / runtime-tag approach for expressing dependencies. The +trace is an explicit dataflow DAG: every op declares the producers of the data it +consumes; the consumer (Core) does all resource scheduling. Reached after finding +that (a) flat in-order over-serializes parallel tiles, (b) the current TOG pass +does NO dependency analysis (it emits a lexical loop tree + tags resolved at +runtime by the C++ tag_table), and (c) compute I/O is collapsed away by +build_skeleton, so dependencies must be recovered before the collapse. 
+ +### 10.1 Representation + +The dependency edge is "consumer reads the buffer that producer wrote". As +landed (ABI v9 onward; see STATUS "sec 10 explicit-edge bridge"), each op +declares the **SRAM buffer ids** it reads and writes (`read_bufs` / `write_bufs`); +the bridge builds the Instruction DAG by **last-writer per buffer**, scoped per +work-item. There is no SSA event token threaded by the producer and no event +handle returned by an op. + +- The edge source is data, not order: an op that reads buffer `b` gets an edge + from whatever op most recently wrote `b`. +- No in-order chain, no runtime tag content-hash, no op-pattern heuristics. +- Resource scheduling -- SA round-robin, double-buffer (<=N in flight), SRAM -- + stays entirely in the Core. The trace never reasons about SRAM occupancy or + timing; it only states producer->consumer order. +- One exception: an **async** DMA's write completes only at data arrival (DMA + response-complete), later than its issue, so its last-writer edge is routed + through an explicit `togsim.memory_barrier` that waits the load's runtime + `(tag_id, tag_slot)` (§10.7.4). A synchronous DMA is blocking and needs no + barrier. + +> The sketch below uses an `out_ev = op(ctx, in_events[])` SSA notation to +> *illustrate* the edges; it predates the landed `read_bufs`/`write_bufs` form +> and is no longer the literal ABI. Read `in={…}` as "reads these buffers". 
+ +Producer C++ form (events threaded like SSA; loop-carried = a reassigned var): + + for mi, ni: // PARALLEL: independent tiles + ev acc = compute(ctx, INIT, in={}); + for ki: // REDUCTION: loop-carried acc + ev a = dma_load(ctx, A[mi,ki], in={}); + ev b = dma_load(ctx, B[ki,ni], in={}); + ev w = compute(ctx, PRELOAD, in={b}); + acc = compute(ctx, MATMUL, in={a,w,acc}); // new acc event each iter + dma_store(ctx, C[mi,ni], in={acc}); + +The INIT dependency reaches every accumulate transitively through the acc chain +(INIT -> mm_k0 -> mm_k1 -> store); each node only needs edges to its immediate +producers. Different (mi,ni) -> separate acc chains -> independent -> parallel. + +### 10.2 Two dependency sources (both available pre-collapse in the TOG pass) + +A single "SRAM access" analysis is necessary but NOT sufficient -- verified on the +GEMM post-vcix: + +| dependency | source | visible in SRAM? | +|---|---|---| +| load -> compute (DMA writes X_spad/W_spad, preload/matmul read) | SRAM last-writer per (buffer, slot) | yes | +| accumulator chain (INIT writes Y_spad; the drain/epilogue read-modify-writes Y_spad; store reads it) | SRAM last-writer on Y_spad | yes | +| **preload -> matmul** (preload loads weights into the systolic-array registers; matmul consumes them) | **vcix opcode FSM** (op1=preload pairs with the following op0=matmul; build_tog already tracks this via `current_preload_node`) | **no -- SA-internal, not a memref access** | + +So the analysis derives edges from (1) SRAM (buffer, slot) last-writer for loads +and the accumulator, and (2) the vcix preload/matmul pairing for the SA-weight +dependency. The slot is a concrete value at run time (the producer runs the +loops), so matching is by value -- no static affine-overlap math. + +Key facts (256^3 GEMM, post-vcix): SRAM buffers are %0=X_spad(A), %1=W_spad(B), +%2=Y_spad(acc/out). 
matmul (vcix op0) reads %0 only; preload (vcix op1) reads %1; +the matmul does NOT read %1 (weights come from the SA), which is exactly why a +memref-only analysis lets it run before the weight load -- the preload->matmul +edge must come from the FSM. The accumulation is the epilogue's `transfer_read +%2 + addf + transfer_write %2`, which IS SRAM-visible. + +### 10.3 Components changed (as landed) + +- TOG pass (`build_skeleton` + `dep_analysis`, on post-vcix before collapse): per + op, the read/write SRAM buffer ids + the preload->matmul pairing (folded as a + virtual `SA_WEIGHTS` buffer) -> the read/write buffer sets. +- ABI (`togsim_runtime.h`): `togsim_dma`/`togsim_compute` carry + `read_bufs`/`write_bufs`; an async DMA also carries `(tag_id, tag_slot)` for the + `togsim.memory_barrier` pairing. No `in_events[]`, no returned event, no + `event_id`/handle-buffer mechanism. +- `lower_to_emitc`: emits the buffer-id arrays on each op (and lowers + `togsim.memory_barrier`). +- bridge: builds the Instruction DAG by last-writer per buffer (`add_child`); + no in-order chain, no runtime tag content-hash. +- Core: unchanged (ready_counter DAG + SA pipeline + double-buffer already exist). + +### 10.4 Open decisions + +- Reduction timing: model the acc chain as completion-serial (conservative, + simple) first; SA-pipelined (matches legacy's overlap) — RESOLVED via the + occupancy/latency split (§10.7). +- Buffer-id lifetime: the last-writer map is scoped per work-item (reset at each + `togsim_core_alloc`). + +### 10.5 Known issue: preload concurrency not bounded by #systolic-arrays + +Observed in the --trace_so run (256^3 GEMM): 4 PRELOADs execute concurrently +(issue ~1028, finish ~1119-1122), but with num_systolic_array_per_core = 2 at +most 2 should overlap, and two preloads on the same SA should serialize (one +weight register file per array). 
Cause: a preload's overlapping_cycle equals its +compute_cycle (91 == 91), so its occupancy (compute - overlapping) is ~0 and the +Core's SA compute pipeline accepts unbounded back-to-back preloads. + +This is a PRE-EXISTING Core SA-model property, NOT introduced by the trace +pipeline: the legacy build_tog path shows the same behavior -- its 4 preloads issue at +1215-1218 and finish 1306-1309 (4 concurrent). So it is not a trace-vs-legacy +regression, but it is a real hardware-fidelity gap: the model should cap +concurrent preloads at the systolic-array count and serialize same-SA preloads on +the single weight buffer. Track separately from the trace work (affects both +paths equally). + +### 10.6 Known issue: accumulator dependency over-serializes the reduction + +Observed in the --trace_so run: consecutive matmuls run 396 cycles apart (fully +serial: issue 1120, 1516, 1912, ...), but physically matmuls that accumulate into +the same output should PIPELINE on the systolic array (the partial sums stream +through; consecutive matmuls overlap by overlapping_cycle, so the effective issue interval is ~128 cycles). They +should NOT wait for the previous matmul to complete. + +Cause: the explicit-edge bridge builds a hard completion edge (add_child) for the +Y_spad accumulator read-modify-write, so matmul_k1 waits for matmul_k0's +finish_instruction -> when it issues, k0 is already done -> the overlapping_cycle +window is empty -> no pipeline. This is the mechanism behind the 4888 vs legacy +2095 gap (legacy has NO inter-matmul edges, so its matmuls pipeline on 2 SAs: +finishes 1704,1707 | 1832,1835 = +128 within an SA, +3 across SAs). + +So the accumulator (Y_spad) dependency is a PIPELINED/ordering dependency, not a +completion barrier. add_child cannot express that. Fix direction: do not create a +matmul->matmul completion edge through the accumulator -- the accumulation order +is preserved implicitly by same-SA issue order + the SA pipeline +(overlapping_cycle), exactly as legacy does. 
Keep the real barriers: load->compute, and +last-matmul->store (the store needs the final accumulator). The asymmetry (a +matmul consuming Y pipelines; the store consuming Y waits) is the crux to model -- +likely "do not barrier when the consumer is a same-unit pipelined compute". + +This shares a root cause with 10.5 (the SA/compute-pipeline occupancy model): both +are about modeling the systolic array's streaming/pipelined execution rather than +treating each compute as an atomic completion. + +### 10.7 Occupancy/latency split for pipelined computes (design + prototype) + +Idea (keeps add_child uniform): give each compute two completion points instead of +one. A systolic-array op occupies its unit for occupancy = compute_cycle - +overlapping_cycle (the initiation interval, ~128 for the matmul) and its result is +ready at latency = compute_cycle (~395). Then add_child releases: + - a same-unit pipelined successor (next matmul, accumulator RMW) at OCCUPANCY + -> it starts ~128 later -> pipeline; + - a result consumer (the store reads the drained accumulator) at LATENCY + -> it waits for the full drain (tail). +So a single add_child mechanism stays, but the release point depends on whether +the edge is an occupancy-dependency (same-unit pipeline) or a latency-dependency +(reads the result). This was also expected to fix 10.5: a preload then occupies its SA for its +occupancy, so concurrent preloads are naturally capped at the SA count (but see 10.7.2: a preload's occupancy is currently 0, so the split alone does not cap them). + +Prototype (bridge stopgap, committed): skip the matmul->matmul accumulator edge +(treat it as pipelined, not a barrier); keep every other edge. Result on 256^3 +GEMM: matmuls now issue back-to-back (1120-1127) and finish pipelined on 2 SAs +(1515,1516 | 1643,1644 | 1771,1772 | 1899,1900 = +128 within an SA, +1 across), +exactly like legacy. Total 4888 -> 2501 (vs legacy 2095 / 2608-incl-store; our +matmuls finish at 1900 vs legacy 2091 -- our load chain is shorter). This +confirms the accumulator dependency is pipelined. 
The clean replacement is the +occupancy/latency split above in the Core so add_child stays uniform and the +bridge needs no matmul-specific skip. + +#### 10.7.1 preload->matmul is also an occupancy dependency (preload fully overlaps) + +The preload->matmul edge is the SAME kind as matmul->matmul: a same-SA pipeline +(occupancy) dependency, not a latency barrier. A preload's overlapping_cycle +equals its compute_cycle (91 == 91), so its occupancy = compute - overlapping = 0 +-- it fully overlaps. With the occupancy/latency split, the matmul (successor) +released at the preload's OCCUPANCY (= preload issue + 0) starts immediately, so +the preload's 91-cycle latency is entirely hidden under the matmul. + +In the current prototype the preload->matmul edge is still an add_child barrier +(only matmul->matmul was skipped), so the matmul issues at 1120 -- right after the +preload finishes at ~1119 -- paying the full 91. The bridge cannot cleanly skip +preload->matmul (skipping it outright loses the ordering: the matmul could be +ready before the preload and reach the SA without weights). So preload-overlap is +another reason the proper fix is the Core occupancy/latency split (10.7), which +releases the matmul at the preload's occupancy (0) while keeping the issue order. + +Net: the Core occupancy/latency split resolves three notes at once -- 10.5 +(concurrent preloads capped at SA count via preload occupancy), 10.6 (matmuls +pipeline), 10.7.1 (preload fully overlaps) -- all instances of "model the SA as a +pipeline (occupancy + latency) instead of atomic completion". + +#### 10.7.2 Occupancy/latency split: implemented + POC result + +Implemented uniformly: Instruction gains add_pipeline_child / release_pipeline_ +children; the Core releases an op's pipeline children when it ISSUES (enters the +SA pipeline), and its normal children at finish. 
The bridge classifies edges: a +preload/matmul -> matmul edge is occupancy (add_pipeline_child), everything else +is latency (add_child). No matmul-specific skip heuristic. + +256^3 GEMM result: preloads issue 1028-1031, matmuls issue 1032-1039 (right after +the preloads ISSUE, not after they finish at ~1119 -> preload fully overlaps), and +matmuls finish pipelined on 2 SAs (1427,1428 | 1555,1556 | 1683,1684 | 1811,1812 += +128 within an SA, +1 across). Total 4888 -> 2501 (matmul-skip) -> 2413 +(occupancy/latency). Legacy is 2095 (matmul completion; our matmuls finish at 1812 +vs legacy 2091 -- shorter load chain -- and our 2413 includes the store). + +Note on 10.5 (preload concurrency): NOT fixed by this alone. A preload's +overlapping_cycle == compute_cycle, so its occupancy is 0 -> it does not hold the +SA -> 4 preloads still issue concurrently (1028-1031). Capping concurrent preloads +at the SA count needs the preload to have a non-zero occupancy reflecting the +weight-load time (a cycle-model input), separate from this edge-release change. + +#### 10.7.3 Explicit compute fence: implemented (COMPUTE_BAR), BAR -> MEMORY_BAR + +The compute fence is now a first-class trace entity, not a bridge-internal edge: + - togsim_ops: `togsim.compute_barrier`; ABI v10 adds `togsim_compute_barrier(ctx)`. + - build_skeleton emits a `togsim.compute_barrier` before each store DMA; lower_to_emitc + lowers it; the runtime records a COMPUTE_BAR TraceRec. + - The two barrier kinds are now named distinctly: Opcode::BAR -> Opcode::MEMORY_BAR + (the DMA/tag memory barrier, unchanged) and a new Opcode::COMPUTE_BAR. + - Core: COMPUTE_BAR finishes only once ALL compute pipelines drain (every systolic + array + the VPU empty); until then it stays in the ready queue (re-checked each + cycle). Its ready_counter is gated (pipeline-child of the outstanding async + computes) so it is only evaluated after they have ISSUED into the pipeline. 
+ - bridge: a COMPUTE_BAR record -> a COMPUTE_BAR Instruction (pipeline-child of the + outstanding async matmuls); the following store add_child's the fence. + +256^3 GEMM: trace shows `... matmul x N -> COMPUTE_BAR -> STORE`; the COMPUTE_BAR +instruction finishes at 1813 (after the SAs drain, last matmul ~1812), the store +issues at 1814. Total 2414 (matches the implicit-flush 2413 + the 1-cycle fence). +Multiple SAs handled (drains all _sa_compute_pipeline[*]). 7 python tests pass. + +#### 10.7.4 load->compute uses MEMORY_BAR (async DMA data wait); fixes a real bug + +Bug found: a consumer reading an async-loaded buffer ran BEFORE the data arrived +(preload issued @1028 but its weight load W finished @1131). Cause: a raw +add_child on an async DMA fires at the load's ISSUE-complete (program flow), not +its DATA-ready (resp-complete) -- the async DMA signals data only via the tag +table (set_tag_finish at resp-complete). So the buffer-edge model alone cannot +gate compute on async-loaded data. + +Fix (symmetric with COMPUTE_BAR): route async load -> compute through a MEMORY_BAR +that carries the load's tag. The load registers the tag at issue; the MEMORY_BAR +(made ready after the load issues, via add_child) parks on the tag and is woken at +resp-complete; consumers depend on the MEMORY_BAR (last_writer[buf] = bar). So the +memory-arrival notification (set_tag_finish) connects to compute via the existing +tag mechanism -- now explicit in the trace as a MEMORY_BAR instruction. + +256^3 GEMM: preload now issues @1132 (after W resp-done @1131), correct. Total +2414 (buggy/optimistic) -> 2518 (correct: compute waits the slow weight load). +Both barriers are explicit and symmetric: MEMORY_BAR (DMA tag, resp-complete) for +load->compute, COMPUTE_BAR (SA pipeline drain) for compute->store. + +## 11. Remaining work + next-session handoff + +### 11.1 Status + +PR #267 (feature/togsim-cpp-trace -> develop). 
The trace pipeline runs end-to-end +through the REAL Simulator/Core on a 256^3 GEMM via `--trace_so`, with an explicit +dataflow dependency model (SRAM last-writer + vcix FSM) and two explicit barriers: +MEMORY_BAR for async load->compute data (paired to its DMA by the runtime +`(tag_id, tag_slot)` tag slot) and COMPUTE_BAR for the SA drain before a store. +The async-DMA sync is the runtime tag slot, NOT a compile-time event-id (ABI +bumped to v11; the event-id / event-handle / wait/signal design was removed). +Legacy ONNX-TOG path kept + DEPRECATED. All togsim python tests pass; TOGSim +builds. + +**Validation (256^3 GEMM, real gem5 cycle table):** through the real Core the +trace path totals **2518 cycles** vs the legacy path's **2698** on the same +table. The earlier 10.x notes (with a stub table) report different absolute +numbers; 2518-vs-2698 is the current real-table figure. + +### 11.2 Remaining work (priority order) + +1. **Cycle-equivalence closure.** Characterize/close the trace-vs-legacy gap on the + 256^3 GEMM with the SAME gem5 cycle_list. Sub-items 2-3 are the main drivers. +2. **Preload concurrency cap (sec 10.5).** 4 preloads run concurrently though there + are 2 SAs, because a preload's occupancy is 0 (overlapping_cycle == compute). + Give the preload a non-zero occupancy (the weight-load time) so concurrent + preloads are capped at the SA count. Pre-existing in BOTH paths. +3. **Robust gem5 cycle_list wiring.** The extension_codecache `TORCHSIM_DUMP_TRACE_SO=1` + hook dumps trace.so + trace_cycles.tsv from the real cycle_list, but is flaky + under concurrent compiles (saw cycle_list==[] once). Make it robust (or force a + single-thread compile), so `--trace_so --cycle_table` uses real per-tile cycles. +4. **Parallel output tiles / multi-core.** One dispatch per work-item today; for + distributing independent output tiles across cores, emit a dispatch per parallel + (m_sub, n_sub) tile. 
The inner sub-tile loops are currently unlabeled (only the + macro loops carry subtile/accumulation), so the axis role must be recovered. +5. **Cleanup.** The obsolete WAIT/SIGNAL trace records and the event-handle + buffer are dropped (v11). COMPUTE_BAR logs finish twice (cosmetic). The + preload node mis-attributes an X_spad read (build_tog `_steal_leading_transfer_read`) + -> a harmless extra edge. +6. **P5 op coverage.** Only GEMM is exercised. Extend to conv / SDPA / vector / pool. +7. **P4.** Symbolic/dynamic shape; streaming sink (coroutine, alloc-blocks). +8. **Two-function outline** (togsim_kernel_tile) -- DONE (ABI v12). The work-item + body is outlined into a uniform `togsim_kernel_tile(ctx, iv, n)` and run via the + higher-order `togsim_dispatch` wrapper (round-robin core + TILE_BEGIN/TILE_END); + the work-item scope is now the function call. Trace/cycles identical to the old + single-function `togsim_core_alloc` form. One general dispatcher serves every + kernel. +9. **Retire the legacy ONNX-TOG path** once the trace path is stable. + +### 11.3 Next-session context + +- Worktree `/workspace/PyTorchSim-cpptrace`, branch `feature/togsim-cpp-trace`, + PR #267 -> develop. The branch is rebased ONTO develop (the retire-floormod base + was dropped -- develop already has it). `source .envrc` in the worktree. +- Build TOGSim: submodules are init'd; `cd TOGSim/build && cmake .. -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`. + The Simulator target has ENABLE_EXPORTS (so a dlopen'd .so resolves the togsim_* + callbacks); togsim_runtime.cc + togsim_trace_bridge.cc are picked up by the src glob. +- Run the trace path: + `python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc <postvcix.mlir> --so trace.so [--emit-cpp x.cpp]` + then `bin/Simulator --config <config.yml> --trace_so trace.so [--cycle_table cyc.tsv] [--log_level trace]`. 
+- Get a post-vcix fixture: a real torch.compile GEMM with `TORCHSIM_DUMP_MLIR_IR=1 + pytorchsim_functional_mode=False` writes `outputs/<hash>/..._sample_postvcix.mlir`. + Real cycle data + legacy reference: add `TORCHSIM_DUMP_TRACE_SO=1` to also dump + trace.so + trace_cycles.tsv in `outputs/<hash>/` (see 11.2 #3). (Prior /tmp + fixtures are ephemeral -- regenerate.) +- Env (.envrc): gem5 `/gem5/release/gem5.opt`, spike `/release/bin/spike`, + LLVM `/riscv-llvm/bin`. +- Tests: `TOGSIM_SKELETON_FIXTURE=<postvcix.mlir> pytest tests/test_togsim_{skeleton,emitc,runtime}.py`. + These are NOT in the CI allowlist (`.github/workflows/pytorchsim_test.yml`) -- register them to gate CI. +- Key files: passes `build_skeleton.py`, `lower_to_emitc.py`, `dep_analysis.py`, + `cycle_table.py`, `togsim_ops.py`; `TOGSim/include/{togsim_runtime.h, togsim_loader.h, togsim_trace_bridge.h}`, + `TOGSim/src/{togsim_runtime.cc, togsim_trace_bridge.cc}`; `Core.cc`/`Instruction.{h,cc}` + (COMPUTE_BAR + MEMORY_BAR rename); `main.cc` (--trace_so); `extension_codecache.py` + (TORCHSIM_DUMP_TRACE_SO hook). +- Local-only backups of the pre-squash/pre-rebase 28-commit history: tag + `pr-backup-ccfea43e`, branch `backup-presquash-3cfd4a3f` (NOT pushed). diff --git a/docs/design/togsim_cpp_trace_HANDOFF.md b/docs/design/togsim_cpp_trace_HANDOFF.md new file mode 100644 index 00000000..23f642bb --- /dev/null +++ b/docs/design/togsim_cpp_trace_HANDOFF.md @@ -0,0 +1,191 @@ +# Handoff — TOGSim C++ Trace Generation + +Continuation notes for picking this work up in a fresh session. Read alongside +the full design: [`togsim_cpp_trace.md`](./togsim_cpp_trace.md) and the snapshot +[`togsim_cpp_trace_STATUS.md`](./togsim_cpp_trace_STATUS.md). + +## Goal (one line) + +Replace the timing-path TOG producer (MLIR -> Python-dict -> ONNX -> C++ parser) +with a compiled, shape-parametric trace producer (MLIR -> EmitC -> C++ -> `.so`); +TOGSim's timing core is preserved. 
+ +## Current state (one paragraph) + +The trace pipeline is implemented end-to-end and runs through the REAL +Simulator/Core on a 256^3 GEMM (`--trace_so`). Dependencies are an explicit +dataflow DAG (SRAM last-writer per buffer + the vcix preload/matmul FSM). An +asynchronous DMA is synced to the consumer of its data by the **runtime tag +slot** `(tag_id, tag_slot)` through an explicit `togsim.memory_barrier` (lowered +from the source `memref.dma_wait`); a sync DMA is blocking. ABI is **v11**. An +earlier design used a compile-time `event_id` / heap event handle with +`wait`/`signal`; it was removed because one static DMA op runs once per loop +iteration into a different tag slot, which a compile-time id cannot pair per +iteration. **Validation:** on the 256^3 GEMM with the real gem5 cycle table, the +trace path totals **2518 cycles** vs the legacy path's **2698** through the real +Core; all togsim python tests pass; TOGSim builds. + +## Branch + +- Work branch: `feature/togsim-cpp-trace` (PR #267 -> develop) + +## Status + +| Milestone | State | +|---|---| +| P0 — ABI header + op vocabulary | DONE (ABI evolved to v11) | +| P1 — `build_skeleton` pass | DONE, verified — runs on a real GEMM fixture, module verifies, compute grouping + dma/barrier counts match the legacy `build_tog` TOG. | +| P2 — togsim -> emitc -> cpp -> .so | DONE — `lower_to_emitc.py` builds EmitC, `mlir-translate` -> C++, `g++ -shared` -> `.so`; validated by build/symbol checks and a dlopen run harness. | +| P3 — TOGSim loader + runtime + cycle table; real-Core run | DONE — runs end-to-end through the real Simulator/Core (256^3 GEMM, `--trace_so`). Runtime tag-slot pairing (ABI v11, `togsim.memory_barrier`), explicit dataflow DAG (read/write_bufs last-writer + vcix FSM), real tile addresses, cycle_table. `togsim_runtime.cc`/`togsim_loader.h`/`togsim_trace_bridge.cc` feed TraceRec into the real Core. Cycle comparison vs legacy on the real gem5 table: trace 2518 vs legacy 2698. 
Legacy ONNX-TOG path DEPRECATED in place, kept live. | +| P4 — symbolic-bound dynamic shape, streaming sink | not started | +| P5 — op-family migration (conv/SDPA/vector) | not started | + +### Async-DMA sync: runtime tag slot (current), event-id (removed) + +The original P1 threaded the dma->wait dependency as an SSA `!togsim.event` +value, which fails `module.verify()` on a software-pipelined kernel (the +`togsim.dma` sits in the prefetch loop nest, its consumer in a sibling compute +nest, so the value does not dominate its use). An intermediate fix used a +compile-time `event_id` attribute (later a heap-allocated event handle). Both +were **removed**: one static `togsim.dma` op executes once per loop iteration +into a *different* runtime tag slot, so a compile-time id (one per static op) +cannot pair iteration i's DMA with iteration i's wait. + +Current mechanism (ABI v11): `togsim.dma` carries `tag_id` (its tag-memref +identity) plus the runtime tag-index operand `%tag[%idx]` and returns void. The +source `memref.dma_wait` is mapped through to an explicit +`togsim.memory_barrier {tag_id, write_bufs}` carrying the runtime tag index. At +runtime an async DMA and its barrier are paired by `(tag_id, tag_slot)` through +the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/ +`register_tag_waiter`): the DMA signals at data arrival, the barrier waits, and +the barrier becomes the loaded buffer's last-writer so consumers gate on +arrival. (The one remaining auto-inserted barrier is `togsim.compute_barrier`, +the compute fence before a store — marked FIXME to become explicit later.) + +### P2 decisions + +* **ABI v11 (runtime tag slot).** `togsim_dma` returns void and carries + `(is_async, tag_id, tag_slot, read_bufs, write_bufs)`. The + `togsim_memory_barrier(tag_id, tag_slot, write_bufs)` is the explicit + async-DMA sync. No `event_id`, no event handle, no `wait`/`signal`. 
+* **C4 drives the upstream EmitC conversion passes** (it does not hand-build + EmitC). It only does the parts upstream cannot: rewrite the *unregistered* + `togsim.*` ops to `emitc.call_opaque` and rewrite the kernel signature to the + ABI form. Then it runs, in-process (`mlir.passmanager`), + `func.func(lower-affine), convert-scf-to-emitc, convert-arith-to-emitc, + convert-func-to-emitc`. One local fixup: in this LLVM 20 build + `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so + `convert-arith-to-emitc` (constants -> `!emitc.size_t`) leaves + `unrealized_conversion_cast` on the bounds that nothing folds and + `mlir-to-cpp` can't print (design sec 8 risk). `_fold_for_bound_casts` + rewrites those bound constants to `index`-typed `emitc.constant`, clearing + the casts. (`emitc.for` *does* accept `size_t` bounds with an explicit + `: !emitc.size_t`, but keeping the bounds `index` avoids retyping the IV.) +* **Addresses (wired in P3, approach A):** `togsim_dma` passes `(arg_id, element + offset)` with the offset computed from the loop IVs; the runtime adds the + tensor base. `togsim.compute` is keyed by `tile_id` for cost. + +## Files (key) + +- `TOGSim/include/togsim_runtime.h` — extern "C" ABI v11 (`togsim_dma`, + `togsim_memory_barrier`, `togsim_compute`, `togsim_compute_barrier`, + `togsim_core_alloc`, `togsim_kernel` entry, `TOGSIM_ABI_VERSION`, opaque + `EmitCtx`). +- `PyTorchSimFrontend/mlir/passes/togsim_ops.py` — single source of truth for the + skeleton+API MLIR vocabulary (op names, attr keys, op->callee map). +- `PyTorchSimFrontend/mlir/passes/build_skeleton.py` + `dep_analysis.py` — the P1 + pass + dependency analysis (reuse build_tog's `TogBuilder`/`_build`; map + dma_start->togsim.dma, dma_wait->togsim.memory_barrier, attach read/write_bufs; + use-based DCE). 
+- `TOGSim/src/togsim_runtime.cc`, `TOGSim/include/togsim_loader.h`, + `TOGSim/src/togsim_trace_bridge.cc` — C6 runtime, dlopen loader, and the bridge + that feeds the recorded TraceRec stream into the real Core. +- `tests/test_togsim_skeleton.py` — `test_togsim_ops_contract` (runs anywhere) + + `test_build_skeleton_on_fixture` (gated on bindings + a fixture). +- `PyTorchSimFrontend/mlir/passes/lower_to_emitc.py` — the P2/C4 pass: skeleton + module -> EmitC `togsim_kernel` -> C++ (`mlir-translate`) -> `.so` (`g++`). + Entry points: `lower_to_emitc(module)`, `build_trace_so(postvcix_path, so)`, + and a `__main__` CLI (`--so`, `--emit-cpp`, `--include-dir`). +- `tests/test_togsim_emitc.py` — `test_build_trace_so` (EmitC + symbol checks) + + `test_trace_so_runs` (dlopen the `.so` against a stub runtime, run it). Gated + on bindings + `mlir-translate` + a C++ compiler + the fixture. + +## Reproduce P1 + P2 (one GEMM kernel) + +```bash +# 1. post-vcix fixture: compile a GEMM (needs the built PyTorchSimDevice .so). +export pytorchsim_functional_mode=False +python tests/ops/gemm/test_matmul.py +FIX=$(find "${TORCHSIM_DUMP_PATH:-.}" -name '*_postvcix.mlir' | head -1) +# build_skeleton/lower_to_emitc only need the .mlir + bindings, not torch, so a +# fixture compiled in any worktree is fine. + +# 2. P1: skeleton+API MLIR. +python -m PyTorchSimFrontend.mlir.passes.build_skeleton "$FIX" --out /tmp/skel.mlir +# stderr: "skeleton: compute=.. dma=.. memory_barrier=.." + +# 3. P2: skeleton -> EmitC -> C++ -> .so (reads skel from $FIX via build_skeleton). +python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc "$FIX" \ + --so /tmp/trace.so --emit-cpp /tmp/trace.cpp +nm -D /tmp/trace.so | grep togsim # togsim_kernel = T; togsim_dma/memory_barrier/compute = U + +# 4. 
tests +TOGSIM_SKELETON_FIXTURE="$FIX" python -m pytest \ + tests/test_togsim_skeleton.py tests/test_togsim_emitc.py -q +``` + +Note: `mlir-opt`/`mlir-translate` live in `$TORCHSIM_LLVM_PATH` but are not on +`$PATH`; `lower_to_emitc` resolves `mlir-translate` from `TORCHSIM_LLVM_PATH`. + +## Next steps (P3 is done; remaining work) + +The producer is wired into TOGSim and runs through the real Core (trace 2518 vs +legacy 2698 on the 256^3 GEMM). The parallelism / reduction / core-dispatch +design is in `togsim_cpp_trace.md` §9. Summary: the producer is core-transparent +(knows nothing about `num_cores`); it enumerates parallel output-tile work-items +and calls `togsim_core_alloc` at each work-item boundary. Parallel = independent +work-items; reduction = program order inside one work-item; core binding = the +`togsim_core_alloc` runtime callback (policy lives in TOGSim). Async-DMA data +sync = the runtime `(tag_id, tag_slot)` via `togsim.memory_barrier`. `num_cores` +is extrinsic so it is never baked; vlane/tile sizes are intrinsic and stay baked. +Split-K is a deferred exception. + +Remaining (priority order; full list in STATUS §7 and design §11.2): + +- **SRAM tile lifecycle (double-buffer throttle).** `togsim.dma` carries + `tag_slot` (the SRAM slot key); the consumer must use it to throttle in-flight + loads to the buffer depth on multi-tile / double-buffered kernels. +- **Preload concurrency cap (design §10.5).** Give a preload a non-zero occupancy + (its weight-load time) so concurrent preloads are capped at the SA count. + Pre-existing in BOTH paths. +- **Per-output-tile dispatch / multi-core.** One `togsim_core_alloc` per + work-item today; distribute independent output tiles across cores. +- **Robust gem5 cycle_list wiring.** The extension_codecache + `TORCHSIM_DUMP_TRACE_SO=1` hook is flaky under concurrent compiles. +- **P5 op coverage** (conv/SDPA/vector) and **P4** (symbolic shape, streaming + sink), then **retire the legacy ONNX-TOG path**. 
+ +Full design: `togsim_cpp_trace.md` §5-11. + +## Environment requirements (for the new session) + +- MLIR Python bindings importable (`import mlir.ir`). They ship with the LLVM + build at `${TORCHSIM_LLVM_PATH%/bin}/python_packages/mlir_core`; the CI docker + image `ghcr.io/psal-postech/torchsim-ci` has them. `passes/__init__` also + derives the path from `TORCHSIM_LLVM_PATH`. +- `pytest` to run the test files directly (`pip install pytest` if absent). +- `mlir-translate` (in `$TORCHSIM_LLVM_PATH`) and a host C++ compiler (`g++`/ + `$CXX`) for the P2 `.so` path. +- TOGSim build (for `--trace_so`): `cd TOGSim/build && cmake .. + -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`. The Simulator target has + ENABLE_EXPORTS so a dlopen'd `.so` resolves the `togsim_*` callbacks. +- When iterating on passes, clear the codegen caches (`$TORCHSIM_DUMP_PATH`, + default `outputs/`) between runs — see CLAUDE.md "Codegen changes are sticky". + +## Verification that already passes anywhere (sanity) + +```bash +python -m py_compile PyTorchSimFrontend/mlir/passes/build_skeleton.py \ + PyTorchSimFrontend/mlir/passes/togsim_ops.py tests/test_togsim_skeleton.py +# contract test (no bindings needed): see test_togsim_ops_contract +``` diff --git a/docs/design/togsim_cpp_trace_STATUS.md b/docs/design/togsim_cpp_trace_STATUS.md new file mode 100644 index 00000000..ebf05701 --- /dev/null +++ b/docs/design/togsim_cpp_trace_STATUS.md @@ -0,0 +1,226 @@ +# TOGSim C++ Trace Generation — Status Report + +Branch: `feature/togsim-cpp-trace`. Design of record: `togsim_cpp_trace.md` (esp. +§9); continuation notes: `togsim_cpp_trace_HANDOFF.md`. This file is a snapshot of +progress. + +## 1. Goal + +Replace the timing-path TOG producer (`MLIR -> Python dict -> ONNX -> C++ +TileGraphParser`) with a compiled, shape-parametric trace producer +(`MLIR -> skeleton -> EmitC -> C++ -> .so`). TOGSim's timing core is preserved; +only the producer of its input changes. 
The key idea: do not flatten the TOG; +instead **run** a compiled C++ producer that emits the trace as a stream of API +calls. + +Each API call emits one trace record = one modeled instruction, fed to the +existing timing Core. Dependencies are an explicit dataflow DAG (SRAM +last-writer per buffer + the vcix preload/matmul FSM). An asynchronous DMA is +synced to the consumer of its data by the **runtime tag slot** `(tag_id, +tag_slot)` through an explicit `togsim.memory_barrier` (ABI v11). An earlier +design used a compile-time `event_id` / event handle with `wait`/`signal`; that +was removed because one static DMA op runs once per loop iteration into a +different tag slot, which a single compile-time id cannot pair per iteration. + +## 2. Pipeline + +``` +post-vcix .mlir (torch.compile output) + | build_skeleton.py + dep_analysis.py (P1) keep loops; + | memref.dma_start -> togsim.dma(tag_id, %tag[%idx], is_async, read/write_bufs); + | memref.dma_wait -> togsim.memory_barrier(tag_id, tag_slot, write_bufs); + | compute block -> togsim.compute; DCE the rest + v +skeleton+API MLIR + | lower_to_emitc.py (P2/C4) togsim.* -> emitc.call_opaque; ABI signature; drive upstream + | lower-affine/convert-*-to-emitc; _retype_for_to_size_t fixups + v +EmitC --mlir-translate--> C++ --g++ -shared--> trace.so + | TOGSim loader (C6): dlopen + EmitCtx callbacks + v + TraceRec stream (materializing sink) + | togsim_trace_bridge.cc -> existing Core timing + v + cycles / DRAM traffic (real Core) +``` + +Side artifact: cycle table `tile_id -> (cycle, overlapping_cycle)` (cycle_table.py). + +## 3. 
Milestones + +| | State | +|---|---| +| P0 ABI header + togsim vocabulary | DONE (ABI evolved to v11) | +| P1 build_skeleton | DONE, verified (compute/dma/barrier match legacy TOG) | +| P2 lower_to_emitc -> .so | DONE (real GEMM .so built and run) | +| P3 loader/runtime + cycle table + real-Core run | DONE (runs end-to-end through the real Simulator/Core; below) | +| P4 symbolic/dynamic shape, streaming sink | TODO | +| P5 op-family migration (conv/SDPA/vector) | TODO | + +P3 detail: + +| | State | +|---|---| +| ABI (core_alloc, runtime tag pairing, dma address) | DONE (v11) | +| work-item boundary (togsim_core_alloc) | DONE | +| real tile DRAM addresses (approach A) | DONE, verified on 1024^3 | +| cycle_table builder (cycle + overlapping) | DONE | +| async DMA <-> consumer sync (runtime tag slot, memory_barrier) | DONE | +| explicit dataflow DAG (read/write_bufs last-writer) | DONE | +| C6 runtime + dlopen loader (materializing) | DONE | +| TraceRec -> existing Core timing feed | DONE (runs end-to-end through real Core) | +| cycle comparison vs build_tog (real gem5 table) | DONE: trace 2518 vs legacy 2698 | +| SRAM tile lifecycle / preload-occupancy refinements | partial (see §7) | + +### TraceRec -> Core: now running end-to-end + +`TOGSim/src/togsim_trace_bridge.cc` (`trace_to_tilegraph`) + a `--trace_so` mode +in `main.cc` feed the recorded trace into the REAL Simulator/Core. The producer +`.so` is `dlopen`'d (the Simulator is built with ENABLE_EXPORTS so the `.so` +resolves the `togsim_*` callbacks back into the binary), its trace recorded, then +bridged to a `TileGraph`: one `TileSubGraph` per work-item (core_alloc marker) +bound to its core, one `Tile` of MOVIN/MOVOUT/COMP/MEMORY_BAR/COMPUTE_BAR +`Instruction`s. 
Dependency edges are built by **last-writer per SRAM buffer** +(`read_bufs`/`write_bufs`); an async load's last-writer is the MEMORY_BAR paired +to it by the runtime `(tag_id, tag_slot)` (so a consumer waits for actual data +arrival), and a COMPUTE_BAR drains the systolic-array pipeline before a store. +Build it (`cd TOGSim/build && cmake .. && make`) and run: +`bin/Simulator --config <config.yml> --trace_so gemm_trace.so`. + +### Cycle comparison vs legacy build_tog (256^3 GEMM, real gem5 table) + +Ran the same kernel through the legacy path (torch.compile -> gem5 -> build_tog +-> Simulator) and the trace path (the same post-vcix IR -> trace .so + the SAME +gem5 cycle_list -> --trace_so), both through the REAL Core. extension_codecache +has an opt-in TORCHSIM_DUMP_TRACE_SO=1 hook that dumps trace.so + trace_cycles.tsv +from the same cycle_list/offsets (best-effort, never breaks the legacy path); +compute-unit routing uses compute_type and the tag key uses a per-tensor addr_id +(set_addr_name(arg_id)+prepare_tag_key) so A and B don't collide on tag_slot 0. + +**Result: the trace path totals 2518 cycles vs the legacy path's 2698 on the +same gem5 cycle table.** All togsim python tests pass; TOGSim builds. Compute +work and DRAM traffic match; the remaining difference is scheduling (the +explicit dataflow DAG plus the occupancy/latency SA-pipeline model overlap +differently than legacy's per-iteration BARs). + +**Subtile + multi-tile-K now runs** (256x512x256 forced to 128x128 subtiles, 2 +K-tiles: 5774 cycles, no crash). This needed `build_skeleton` to strip the +`-acc_iv` accumulation marker from the dma_wait tag index so the memory_barrier +slot stays subtile-only and pairs with its load (see §3, `tag_slot`); before the +strip the producer evaluated `-acc_iv` to a negative slot at the 2nd K-tile and +TOGSim aborted with "Key does not exist in ... tag table". + +## 4. 
Components + +- `build_skeleton.py` + `dep_analysis.py` — in-place reduction of post-vcix to + "loop skeleton + togsim.* API"; `memref.dma_wait` mapped through to an explicit + `togsim.memory_barrier`; read/write SRAM buffer ids attached; reuses legacy + `TogBuilder` traversal. +- `lower_to_emitc.py` — skeleton -> EmitC by driving the upstream conversion + passes plus `_retype_for_to_size_t` (clears residual index<->size_t casts). + `togsim_dma` carries `(tag_id, runtime tag-index, is_async, read/write_bufs)` + and returns void; `togsim_memory_barrier` carries `(tag_id, tag_slot, + write_bufs)`; `togsim_core_alloc` inserted at the work-item boundary. +- `cycle_table.py` — `tile_id -> (cycle, overlapping)`, overlapping + `= max(cycle - offset[type], 0)` (legacy formula); JSON sidecar. +- `TOGSim/src/togsim_runtime.cc` + `TOGSim/include/togsim_loader.h` — C6 runtime + and `run_producer` (dlopen -> togsim_kernel -> records TraceRec). dma resolves + `base[arg] + offset*elem_bytes` and signals its tag at data arrival; the + matching memory_barrier waits the `(tag_id, tag_slot)`; compute looks up the + cycle table; core_alloc round-robins a runtime core pool. +- `TOGSim/src/togsim_trace_bridge.cc` — bridges the recorded TraceRec stream into + the existing `TileGraph`/`Instruction` form for the real Core. +- `TOGSim/include/togsim_runtime.h` — producer ABI v11. + +## 5. Locked design decisions + +1. **Trace is a DAG, not a time order.** The consumer (existing Core) schedules + per-core timelines from: op kind -> hardware unit, SRAM-buffer last-writer -> + data dependency, same-core -> serial (reduction accumulate), SRAM slot -> + capacity. Emission order != execution order. +2. **Async-DMA sync = runtime tag slot.** A `togsim.dma` carries `(tag_id, + tag_slot)`; the matching `togsim.memory_barrier` (lowered from the source + `memref.dma_wait`) waits on the same pair through the existing Core tag table + (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). 
The DMA signals at + data arrival; the barrier becomes the loaded buffer's last-writer so consumers + gate on arrival. A sync DMA is blocking (no barrier). This replaced an earlier + `event_id` / heap event-handle design, which could not pair a DMA op with its + wait per loop iteration (one static op, a different tag slot each iteration). + No `calc_tag` content-hash, no magic values, no FIFO. +3. **Core = runtime allocation.** `togsim_core_alloc` returns a core id (no free). + `num_cores` is never baked into the producer -- it is the runtime pool size. + A work-item's reduction stays on one core (sticky); different work-items get + different cores -> multi-core. +4. **Intrinsic baked / extrinsic parametric.** vlane / tile sizes / systolic + define instructions (baked); num_cores only distributes (runtime). +5. **Execution model:** P3 materializing (run producer to completion -> record -> + feed existing Core); P4 streaming (coroutine, alloc-blocks on resources). +6. **Double-buffer = resource constraint.** Producer emits everything (no skew); + capacity is the consumer's throttle. Requires SRAM tile lifecycle + (alloc/free) in the trace -- the currently missing piece. + +## 6. Verification (reproducible) + +- togsim python tests pass: skeleton (contract + fixture), emitc (build + dlopen + run), cycle_table, runtime. TOGSim builds. +- 256^3 GEMM: core_alloc -> dma(tag_id, tag_slot) -> memory_barrier(tag_id, + tag_slot) -> compute; addresses A/B/C resolved (offset 0, single tile). +- 1024^3 GEMM: per-tile addresses correct (A[m,k]=m*1024+k -> 0,256,512; + B[k,n]=k*1024+n -> 0,262144,524288). +- End-to-end through the real Core (256^3 GEMM, real gem5 table): trace 2518 + cycles vs legacy 2698. +- Legacy ONNX-TOG path untouched (comment-only diff), marked DEPRECATED, kept as + the comparison reference. + +## 6b. 
Reference timer (early sanity check; superseded by the real Core feed) + +`togsim::simulate(RunResult, TimingParams)` (togsim_runtime.cc) was an early +standalone scheduler that timed the recorded TraceRec to prove the stream is +sufficient to be timed: per core a DMA-engine timeline (DMAs serialize, overlap +compute), a compute timeline (serial = reduction accumulate, with the `finish = +prev.finish + cycle - overlapped` pipeline overlap of Core.cc), and data deps. +It is NOT the production Core (no DRAM/NoC/L2 contention). It has since been +superseded: the recorded stream is now bridged into the real Tile/TileGraph -> +Core (see §3, and the 2518-vs-2698 result above). Retained here as context. + +## 7. Remaining work (priority order) + +1. DONE. Map TraceRec -> existing TOGSim Core Instructions (Tile/TileGraph, + compute_cycle+overlapping, dataflow-buffer deps + runtime-tag barriers) and + run through the real Core. Result: trace 2518 vs legacy 2698 on the same gem5 + table. +2. SRAM tile lifecycle in the trace (double-buffer throttle). togsim_dma carries + `tag_slot` (the lowered SRAM tag index = the slot key the existing Core's + Instruction.tag_idx needs); 0 for single-buffer kernels. Remaining: the + consumer must use it to throttle in-flight loads to the buffer depth. The + SRAM-buffer key is effectively (arg_id, tag_slot) since each load's DRAM + tensor maps to its spad. +3. Preload concurrency cap / preload occupancy (design doc §10.5): give a preload + a non-zero occupancy so concurrent preloads are capped at the SA count. + Pre-existing in BOTH paths. +4. (later) deeper double-buffer pipelines (more tag slots), two-function outline, + P4 streaming, symbolic shape, P5 op coverage (conv/SDPA/vector). + +## 8. Risks / open + +- SRAM lifecycle (double-buffer throttle) not yet implemented -- central to + double-buffer/capacity accuracy on multi-tile kernels. 
+- LLVM 20 emitc constraints absorbed: emitc.for index bounds; old + subscript-returns-element model; arith.divui/remui not lowerable -> core id is + a runtime allocation (which became a design improvement). + +### Explicit dataflow-edge dependency model: implemented + +The dependency model is an explicit dataflow DAG, not in-order or runtime-tag +content-hashing. `togsim_dma`/`togsim_compute` carry read_bufs/write_bufs (SRAM +buffer ids; a virtual SA_WEIGHTS buffer folds the preload->matmul edge). +dep_analysis + build_skeleton attach them; lower_to_emitc emits them; the runtime +records them; the bridge builds the Instruction DAG by last-writer per buffer, +scoped per work-item. The one runtime-paired edge is the async-DMA data wait, +routed through an explicit `togsim.memory_barrier` keyed on `(tag_id, tag_slot)` +(see design doc §10.7.4). The systolic-array pipeline uses the occupancy/latency +split (§10.7), so accumulating matmuls pipeline rather than serialize. + +Net (256^3 GEMM, real gem5 table, real Core): trace 2518 vs legacy 2698. +Per-output-tile dispatch for multi-core distribution is the next refinement +(today one dispatch per work-item). diff --git a/scripts/trace_timeline.py b/scripts/trace_timeline.py new file mode 100644 index 00000000..5cf9608b --- /dev/null +++ b/scripts/trace_timeline.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Convert a TOGSim `--log_level trace` log into a Chrome Trace Event JSON that +opens in Perfetto (https://ui.perfetto.dev) or chrome://tracing as an interactive +timeline (Gantt). + +Each instruction becomes one duration slice, grouped per core (pid). Lanes: + dram-rd -- loads crossing the DRAM bus (read bandwidth) + dram-wr -- stores crossing the DRAM bus (write bandwidth) + sa / sa0.. -- COMP compute_type 1 (matmul) / 2 (preload) + vector -- COMP compute_type 0 (vector) +Time unit = core cycles. Barriers (MEMORY_BAR/COMPUTE_BAR) are not drawn. 
A DMA bar +runs from the op's first DRAM response (DRAM_RESP_FIRST, logged by the Core -- so it +captures data moving even while still injecting) to its completion (load: data-ready; +store: finished), serialized per direction so each is one visible bar (packed row = +saturated bus). A compute slice's width is its occupancy (compute_cycle - overlapping). + +Usage: + bin/Simulator --config <config.yml> --trace_so <trace.so> --cycle_table <trace_cycles.tsv> --log_level trace \ + 2>&1 | python scripts/trace_timeline.py -o timeline.json + # or + python scripts/trace_timeline.py trace.log -o timeline.json +Then drag timeline.json into https://ui.perfetto.dev . +""" +import argparse +import json +import re +import sys + +# [cycle][Core C][TAG ][INST_ID=N] OPCODE (detail...) +_LINE = re.compile( + r"\[(\d+)\]\[Core (\d+)\]\[([A-Z_]+)\s*\](?:\[INST_ID=(-?\d+)\])?\s*(\w+)?(.*)") + +# Opcode -> coarse lane for _lane(); to_chrome() itself splits DMA into dram-rd/dram-wr. Barriers are dropped (see _HIDE). +_LANE = {"MOVIN": "dma", "MOVOUT": "dma"} +_HIDE = {"MEMORY_BAR", "COMPUTE_BAR", "TILE_BEGIN", "TILE_END"} +_CT_NAME = {0: "vector", 1: "matmul", 2: "preload"} + +# Perfetto/catapult reserved color names; slices are tinted by tile (= the +# togsim_dispatch work-item / output tile) so one tile's ops share a color across +# lanes/cores. 16 names so a core's tiles (which stride by num_cores) stay +# distinct -- an 8-name palette collapsed to 4 colors per core under 2-core +# even/odd assignment.
+_TILE_PALETTE = ["good", "bad", "terrible", "yellow", "olive", "rail_response", + "rail_load", "rail_animation", "rail_idle", "thread_state_running", + "thread_state_runnable", "thread_state_iowait", + "thread_state_uninterruptible", "generic_work", "startup", + "vsync_highlight_color"] + + +def _tile_color(detail): + m = re.search(r"\btile=(\d+)", detail or "") + return _TILE_PALETTE[int(m.group(1)) % len(_TILE_PALETTE)] if m else None + + +_DMA_SHORT = {"MOVIN": "MVIN", "MOVOUT": "MVOUT"} + + +def _tile_of(detail): + m = re.search(r"\btile=(-?\d+)", detail or "") + return m.group(1) if m else "?" + + +def _label(opcode, detail): + if opcode == "COMP": + m = re.search(r"compute_type=(\d+)", detail) + ct = int(m.group(1)) if m else -1 + return f"T{_tile_of(detail)} {_CT_NAME.get(ct, 'comp')}" + # DMA: keep each load's OWN identity (addr_name) so the input/weight/K-panel + # loads stay distinct; tile is conveyed by color (and args), not the name. + m = re.search(r"addr_name=(\w+)", detail or "") + who = m.group(1) if m else "?" 
+ return f"{who} (T{_tile_of(detail)} {_DMA_SHORT.get(opcode, opcode)})" + + +def _lane(opcode, detail): + if opcode == "COMP": + m = re.search(r"compute_type=(\d+)", detail) + ct = int(m.group(1)) if m else -1 + return "vector" if ct == 0 else "sa" + return _LANE.get(opcode, "dma") + + +def parse(lines): + # key = (core, inst_id) -> record + insts = {} + for ln in lines: + m = _LINE.search(ln) + if not m: + continue + cyc, core, tag, iid, opcode, detail = m.groups() + if iid is None or opcode is None: + continue + cyc, core, iid = int(cyc), int(core), int(iid) + key = (core, iid) + r = insts.setdefault(key, { + "core": core, "iid": iid, "opcode": opcode, "detail": detail, + "issued": None, "finished": None, "resp": None, "dma_issue": None, + "first_resp": None}) + if not r["opcode"] or r["opcode"] == opcode: + r["opcode"] = opcode + if detail.strip(): + r["detail"] = detail + if tag == "INST_ISSUED" and r["issued"] is None: + r["issued"] = cyc + elif tag == "INST_FINISHED": + r["finished"] = cyc + elif tag == "DRAM_RESP_DONE": + r["resp"] = cyc + elif tag == "DRAM_RESP_FIRST" and r["first_resp"] is None: # first data arrived + r["first_resp"] = cyc + elif tag == "ASYNC_DMA_ISSUE": # all requests injected (engine done) + r["dma_issue"] = cyc + return insts + + +def _occ(detail): + """(compute_cycle, overlapping_cycle) from a COMP detail string.""" + cc = re.search(r"compute_cycle=(\d+)", detail) + ov = re.search(r"overlapping_cycle=(\d+)", detail) + return (int(cc.group(1)) if cc else 0, int(ov.group(1)) if ov else 0) + + +def to_chrome(insts, num_sa=1): + """Model each hardware unit as a server and replay its ops in issue order, so + real idle gaps (bubbles) show and slices don't nest: + dma : MOVIN/MOVOUT -- 1 DMA engine; slice = actual transfer + (ASYNC_DMA_ISSUE -> data-ready). + vector : COMP type 0 -- 1 VPU. + sa : COMP type 1/2 -- each op on the SA the Core reports (`sa=` field; + weight-pinned), so lanes auto-split sa0..; rr fallback if absent. 
+ A compute slice's width is compute_cycle - overlapping_cycle (its occupancy = + latency minus the tail that overlaps the next op), starting when the unit + actually picks it up: start = max(issue, unit_free). num_sa>1 -> lanes sa0.. .""" + by_core = {} + for r in insts.values(): + op, detail, core = r["opcode"], r["detail"], r["core"] + if op in _HIDE: + continue + u = by_core.setdefault(core, {"dma": [], "vector": [], "sa": []}) + if op == "COMP": + m = re.search(r"compute_type=(\d+)", detail) + ct = int(m.group(1)) if m else -1 + u["vector" if ct == 0 else "sa"].append(r) + else: + u["dma"].append(r) + + events, lanes, cores = [], set(), set() + + def add(core, lane, ts, dur, name, r): + lanes.add((core, lane)) + cores.add(core) + args = {"inst_id": r["iid"], "tile": _tile_of(r["detail"]), + "issued": r["issued"], "first_data": r["first_resp"], + "finished": r["finished"], "data_ready": r["resp"]} + am = re.search(r"addr_name=(\w+)", r["detail"] or "") + if am: + args["addr"] = am.group(1) + ev = {"name": name, "cat": lane, "ph": "X", "ts": ts, + "dur": max(dur, 1), "pid": core, "tid": lane, "args": args} + cname = _tile_color(r["detail"]) + if cname: + ev["cname"] = cname + events.append(ev) + + def issue_key(r): + return r["issued"] if r["issued"] is not None else 0 + + nsa = max(num_sa, 1) + for core, u in sorted(by_core.items()): + # DMA data crossing the DRAM bus, split by direction (reads and writes are + # asymmetric). A LOAD's data comes back on the response, so its bar runs + # [first DRAM response, data-ready]. A STORE's data goes out with the + # request (fire-and-forget; its acks arrive after it has finished), so its + # bar runs [issued, finished]. Serialized per direction so each op is one + # visible bar: a packed row = the bus is saturated, gaps = it is idle. 
+ for lane, op, sk, ek in (("dram-rd", "MOVIN", "first_resp", "resp"), + ("dram-wr", "MOVOUT", "issued", "finished")): + free = 0 + rows = [r for r in u["dma"] if r["opcode"] == op + and r[sk] is not None and r[ek] is not None and r[ek] > r[sk]] + for r in sorted(rows, key=lambda r: r[ek]): + start = max(r[sk], free) + free = max(r[ek], start + 1) + add(core, lane, start, free - start, _label(r["opcode"], r["detail"]), r) + # VPU: one server; slice = occupancy (compute_cycle - overlapping_cycle). + free = 0 + for r in sorted(u["vector"], key=issue_key): + if r["issued"] is None: + continue + cc, ov = _occ(r["detail"]) + dur = max(cc - ov, 1) + start = max(r["issued"], free) + free = start + dur + add(core, "vector", start, dur, "vector", r) + # SA: each op runs on the systolic array the Core reports (the `sa=` field + # = its weight-pinned / round-robin assignment); fall back to round-robin + # by issue order for older logs without the field. Each SA is one server. + rows = sorted(u["sa"], key=issue_key) + + def _sa_of(r, i): + m = re.search(r"\bsa=(-?\d+)", r["detail"]) + return int(m.group(1)) if (m and int(m.group(1)) >= 0) else (i % nsa) + + max_sa = max([nsa] + [_sa_of(r, i) + 1 for i, r in enumerate(rows)]) + sa_free = [0] * max_sa + for i, r in enumerate(rows): + if r["issued"] is None: + continue + s = _sa_of(r, i) + cc, ov = _occ(r["detail"]) + dur = max(cc - ov, 1) + start = max(r["issued"], sa_free[s]) + sa_free[s] = start + dur + lane = "sa" if max_sa == 1 else f"sa{s}" + add(core, lane, start, dur, _label(r["opcode"], r["detail"]), r) + + for c in sorted(cores): + events.append({"name": "process_name", "ph": "M", "pid": c, "tid": 0, + "args": {"name": f"Core {c}"}}) + order = {"dram-rd": 0, "dram-wr": 1, + "sa": 2, "sa0": 2, "sa1": 3, "sa2": 4, "sa3": 5, "vector": 7} + for c, lane in sorted(lanes, key=lambda x: (x[0], order.get(x[1], 5))): + events.append({"name": "thread_name", "ph": "M", "pid": c, "tid": lane, + "args": {"name": lane}}) + 
events.append({"name": "thread_sort_index", "ph": "M", "pid": c, "tid": lane, + "args": {"sort_index": order.get(lane, 5)}}) + return {"traceEvents": events, "displayTimeUnit": "ns"} + + +def main(argv): + ap = argparse.ArgumentParser() + ap.add_argument("input", nargs="?", help="trace log file (default: stdin)") + ap.add_argument("-o", "--out", default="timeline.json") + ap.add_argument("-s", "--num-sa", type=int, default=1, + help="systolic arrays per core (num_systolic_array_per_core); " + ">1 splits into sa0..saN-1 lanes") + a = ap.parse_args(argv[1:]) + src = open(a.input) if a.input else sys.stdin + insts = parse(src) + trace = to_chrome(insts, a.num_sa) + with open(a.out, "w") as fh: + json.dump(trace, fh) + n = sum(1 for e in trace["traceEvents"] if e["ph"] == "X") + sys.stderr.write(f"wrote {a.out}: {n} slices -> open in https://ui.perfetto.dev\n") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tests/fixtures/gemm256_postvcix.mlir b/tests/fixtures/gemm256_postvcix.mlir new file mode 100644 index 00000000..740e8ab2 --- /dev/null +++ b/tests/fixtures/gemm256_postvcix.mlir @@ -0,0 +1,419 @@ +#map = affine_map<(d0, d1) -> (d0 * 256 + d1)> +#map1 = affine_map<(d0, d1) -> (d0 * 65536 + d1 * 256)> +#map2 = affine_map<(d0, d1) -> (d0 + d1)> +#map3 = affine_map<(d0, d1) -> (d0 * 256 + d1 * 512)> +#map4 = affine_map<(d0, d1, d2) -> (-d0 + d1 + d2 floordiv 2)> +#map5 = affine_map<(d0, d1, d2)[s0, s1] -> (d0 * s0 + d1 * s1 + d2)> +#map6 = affine_map<(d0)[s0] -> (d0 floordiv s0)> +#map7 = affine_map<(d0)[s0] -> (d0 mod s0)> +#map8 = affine_map<(d0, d1, d2) -> (-d0 + d1 * 2 + d2)> +module { + memref.global @X_spad : memref<256x256xf32, 1> + memref.global @W_spad : memref<256x256xf32, 1> + memref.global @Y_spad : memref<256x256xf32, 1> + func.func @kernel(%arg0: memref<65536xf32>, %arg1: memref<65536xf32>, %arg2: memref<65536xf32>) { + %0 = memref.get_global @X_spad : memref<256x256xf32, 1> + %1 = memref.get_global @W_spad : 
memref<256x256xf32, 1> + %2 = memref.get_global @Y_spad : memref<256x256xf32, 1> + %cst = arith.constant dense<0.000000e+00> : vector<512xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1xi32> + affine.for %arg3 = 0 to 256 step 256 { + affine.for %arg4 = 0 to 256 step 256 { + affine.vector_store %cst, %2[0, 0] : memref<256x256xf32, 1>, vector<512xf32> + affine.for %arg5 = 0 to 256 step 256 { + %4 = affine.apply #map(%arg3, %arg5) + %c1_1 = arith.constant 1 : index + %alloc_2 = memref.alloc() : memref<1xi32> + %5 = affine.apply #map(%arg5, %arg4) + %c1_3 = arith.constant 1 : index + %alloc_4 = memref.alloc() : memref<1xi32> + %c0_5 = arith.constant 0 : index + %c0_6 = arith.constant 0 : index + %c0_7 = arith.constant 0 : index + %6 = affine.apply #map1(%c0_5, %c0_6) + %7 = affine.apply #map2(%6, %4) + %8 = affine.apply #map3(%c0_5, %c0_6) + %9 = affine.apply #map2(%c0_5, %c0_6) + memref.dma_start %arg0[%7], %0[%c0_7, %8], %c2, %alloc_2[%9], %c1_1, %c1 : memref<65536xf32>, memref<256x256xf32, 1>, memref<1xi32> {async = true, dram_stride = [256, 1], fine_grained = true, sram_stride = [1, 256], subtile_size = [256, 256]} + %c0_8 = arith.constant 0 : index + %c0_9 = arith.constant 0 : index + %c0_10 = arith.constant 0 : index + %10 = affine.apply #map1(%c0_8, %c0_9) + %11 = affine.apply #map2(%10, %5) + %12 = affine.apply #map3(%c0_8, %c0_9) + %13 = affine.apply #map2(%c0_8, %c0_9) + memref.dma_start %arg1[%11], %1[%c0_10, %12], %c2, %alloc_4[%13], %c1_3, %c1 : memref<65536xf32>, memref<256x256xf32, 1>, memref<1xi32> {async = true, dram_stride = [256, 1], fine_grained = true, sram_stride = [1, 256], subtile_size = [256, 256]} + %c0_11 = arith.constant 0 : index + %c8_i64 = arith.constant 8 : i64 + %c256 = arith.constant 256 : index + %c256_12 = arith.constant 256 : index + %c256_13 = arith.constant 256 : index + %c128 = arith.constant 128 : index + 
%c1_14 = arith.constant 1 : index + %cst_15 = arith.constant 0.000000e+00 : f32 + affine.for %arg6 = 0 to 2 { + affine.for %arg7 = 0 to 2 { + %14 = affine.apply #map4(%arg5, %c0_11, %c0_11) + memref.dma_wait %alloc_4[%14], %c1_14 : memref<1xi32> + %c0_16 = arith.constant 0 : index + %c128_17 = arith.constant 128 : index + %15 = affine.apply #map5(%arg6, %arg7, %c0_16)[%c256, %c128_17] + %16 = affine.apply #map6(%15)[%c256_12] + %17 = affine.apply #map7(%15)[%c256_12] + %18 = vector.transfer_read %1[%16, %17], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%18, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c8 = arith.constant 8 : index + %c128_18 = arith.constant 128 : index + %19 = affine.apply #map5(%arg6, %arg7, %c8)[%c256, %c128_18] + %20 = affine.apply #map6(%19)[%c256_12] + %21 = affine.apply #map7(%19)[%c256_12] + %22 = vector.transfer_read %1[%20, %21], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%22, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c16 = arith.constant 16 : index + %c128_19 = arith.constant 128 : index + %23 = affine.apply #map5(%arg6, %arg7, %c16)[%c256, %c128_19] + %24 = affine.apply #map6(%23)[%c256_12] + %25 = affine.apply #map7(%23)[%c256_12] + %26 = vector.transfer_read %1[%24, %25], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%26, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c24 = arith.constant 24 : index + %c128_20 = arith.constant 128 : index + %27 = affine.apply #map5(%arg6, %arg7, %c24)[%c256, %c128_20] + %28 = affine.apply #map6(%27)[%c256_12] + %29 = affine.apply #map7(%27)[%c256_12] + %30 = vector.transfer_read %1[%28, %29], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%30, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c32 = arith.constant 32 : index + %c128_21 = arith.constant 128 : index + %31 = 
affine.apply #map5(%arg6, %arg7, %c32)[%c256, %c128_21] + %32 = affine.apply #map6(%31)[%c256_12] + %33 = affine.apply #map7(%31)[%c256_12] + %34 = vector.transfer_read %1[%32, %33], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%34, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c40 = arith.constant 40 : index + %c128_22 = arith.constant 128 : index + %35 = affine.apply #map5(%arg6, %arg7, %c40)[%c256, %c128_22] + %36 = affine.apply #map6(%35)[%c256_12] + %37 = affine.apply #map7(%35)[%c256_12] + %38 = vector.transfer_read %1[%36, %37], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%38, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c48 = arith.constant 48 : index + %c128_23 = arith.constant 128 : index + %39 = affine.apply #map5(%arg6, %arg7, %c48)[%c256, %c128_23] + %40 = affine.apply #map6(%39)[%c256_12] + %41 = affine.apply #map7(%39)[%c256_12] + %42 = vector.transfer_read %1[%40, %41], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%42, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c56 = arith.constant 56 : index + %c128_24 = arith.constant 128 : index + %43 = affine.apply #map5(%arg6, %arg7, %c56)[%c256, %c128_24] + %44 = affine.apply #map6(%43)[%c256_12] + %45 = affine.apply #map7(%43)[%c256_12] + %46 = vector.transfer_read %1[%44, %45], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%46, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c64 = arith.constant 64 : index + %c128_25 = arith.constant 128 : index + %47 = affine.apply #map5(%arg6, %arg7, %c64)[%c256, %c128_25] + %48 = affine.apply #map6(%47)[%c256_12] + %49 = affine.apply #map7(%47)[%c256_12] + %50 = vector.transfer_read %1[%48, %49], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%50, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> 
() + %c72 = arith.constant 72 : index + %c128_26 = arith.constant 128 : index + %51 = affine.apply #map5(%arg6, %arg7, %c72)[%c256, %c128_26] + %52 = affine.apply #map6(%51)[%c256_12] + %53 = affine.apply #map7(%51)[%c256_12] + %54 = vector.transfer_read %1[%52, %53], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%54, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c80 = arith.constant 80 : index + %c128_27 = arith.constant 128 : index + %55 = affine.apply #map5(%arg6, %arg7, %c80)[%c256, %c128_27] + %56 = affine.apply #map6(%55)[%c256_12] + %57 = affine.apply #map7(%55)[%c256_12] + %58 = vector.transfer_read %1[%56, %57], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%58, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c88 = arith.constant 88 : index + %c128_28 = arith.constant 128 : index + %59 = affine.apply #map5(%arg6, %arg7, %c88)[%c256, %c128_28] + %60 = affine.apply #map6(%59)[%c256_12] + %61 = affine.apply #map7(%59)[%c256_12] + %62 = vector.transfer_read %1[%60, %61], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%62, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c96 = arith.constant 96 : index + %c128_29 = arith.constant 128 : index + %63 = affine.apply #map5(%arg6, %arg7, %c96)[%c256, %c128_29] + %64 = affine.apply #map6(%63)[%c256_12] + %65 = affine.apply #map7(%63)[%c256_12] + %66 = vector.transfer_read %1[%64, %65], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%66, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c104 = arith.constant 104 : index + %c128_30 = arith.constant 128 : index + %67 = affine.apply #map5(%arg6, %arg7, %c104)[%c256, %c128_30] + %68 = affine.apply #map6(%67)[%c256_12] + %69 = affine.apply #map7(%67)[%c256_12] + %70 = vector.transfer_read %1[%68, %69], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + 
"vcix.iv"(%70, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c112 = arith.constant 112 : index + %c128_31 = arith.constant 128 : index + %71 = affine.apply #map5(%arg6, %arg7, %c112)[%c256, %c128_31] + %72 = affine.apply #map6(%71)[%c256_12] + %73 = affine.apply #map7(%71)[%c256_12] + %74 = vector.transfer_read %1[%72, %73], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%74, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c120 = arith.constant 120 : index + %c128_32 = arith.constant 128 : index + %75 = affine.apply #map5(%arg6, %arg7, %c120)[%c256, %c128_32] + %76 = affine.apply #map6(%75)[%c256_12] + %77 = affine.apply #map7(%75)[%c256_12] + %78 = vector.transfer_read %1[%76, %77], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%78, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + affine.for %arg8 = 0 to 2 { + %79 = affine.apply #map8(%arg5, %c0_11, %c0_11) + memref.dma_wait %alloc_2[%79], %c1_14 : memref<1xi32> + %c0_33 = arith.constant 0 : index + %80 = affine.apply #map5(%arg7, %arg8, %c0_33)[%c256_13, %c128] + %81 = affine.apply #map6(%80)[%c256] + %82 = affine.apply #map7(%80)[%c256] + %83 = vector.transfer_read %0[%81, %82], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%83, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c8_34 = arith.constant 8 : index + %84 = affine.apply #map5(%arg7, %arg8, %c8_34)[%c256_13, %c128] + %85 = affine.apply #map6(%84)[%c256] + %86 = affine.apply #map7(%84)[%c256] + %87 = vector.transfer_read %0[%85, %86], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%87, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c16_35 = arith.constant 16 : index + %88 = affine.apply #map5(%arg7, %arg8, %c16_35)[%c256_13, %c128] + %89 = affine.apply #map6(%88)[%c256] + %90 = affine.apply 
#map7(%88)[%c256] + %91 = vector.transfer_read %0[%89, %90], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%91, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c24_36 = arith.constant 24 : index + %92 = affine.apply #map5(%arg7, %arg8, %c24_36)[%c256_13, %c128] + %93 = affine.apply #map6(%92)[%c256] + %94 = affine.apply #map7(%92)[%c256] + %95 = vector.transfer_read %0[%93, %94], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%95, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c32_37 = arith.constant 32 : index + %96 = affine.apply #map5(%arg7, %arg8, %c32_37)[%c256_13, %c128] + %97 = affine.apply #map6(%96)[%c256] + %98 = affine.apply #map7(%96)[%c256] + %99 = vector.transfer_read %0[%97, %98], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%99, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c40_38 = arith.constant 40 : index + %100 = affine.apply #map5(%arg7, %arg8, %c40_38)[%c256_13, %c128] + %101 = affine.apply #map6(%100)[%c256] + %102 = affine.apply #map7(%100)[%c256] + %103 = vector.transfer_read %0[%101, %102], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%103, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c48_39 = arith.constant 48 : index + %104 = affine.apply #map5(%arg7, %arg8, %c48_39)[%c256_13, %c128] + %105 = affine.apply #map6(%104)[%c256] + %106 = affine.apply #map7(%104)[%c256] + %107 = vector.transfer_read %0[%105, %106], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%107, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c56_40 = arith.constant 56 : index + %108 = affine.apply #map5(%arg7, %arg8, %c56_40)[%c256_13, %c128] + %109 = affine.apply #map6(%108)[%c256] + %110 = affine.apply #map7(%108)[%c256] + %111 = vector.transfer_read %0[%109, %110], %cst_15 : 
memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%111, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c64_41 = arith.constant 64 : index + %112 = affine.apply #map5(%arg7, %arg8, %c64_41)[%c256_13, %c128] + %113 = affine.apply #map6(%112)[%c256] + %114 = affine.apply #map7(%112)[%c256] + %115 = vector.transfer_read %0[%113, %114], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%115, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c72_42 = arith.constant 72 : index + %116 = affine.apply #map5(%arg7, %arg8, %c72_42)[%c256_13, %c128] + %117 = affine.apply #map6(%116)[%c256] + %118 = affine.apply #map7(%116)[%c256] + %119 = vector.transfer_read %0[%117, %118], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%119, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c80_43 = arith.constant 80 : index + %120 = affine.apply #map5(%arg7, %arg8, %c80_43)[%c256_13, %c128] + %121 = affine.apply #map6(%120)[%c256] + %122 = affine.apply #map7(%120)[%c256] + %123 = vector.transfer_read %0[%121, %122], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%123, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c88_44 = arith.constant 88 : index + %124 = affine.apply #map5(%arg7, %arg8, %c88_44)[%c256_13, %c128] + %125 = affine.apply #map6(%124)[%c256] + %126 = affine.apply #map7(%124)[%c256] + %127 = vector.transfer_read %0[%125, %126], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%127, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c96_45 = arith.constant 96 : index + %128 = affine.apply #map5(%arg7, %arg8, %c96_45)[%c256_13, %c128] + %129 = affine.apply #map6(%128)[%c256] + %130 = affine.apply #map7(%128)[%c256] + %131 = vector.transfer_read %0[%129, %130], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%131, %c8_i64) {imm 
= 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c104_46 = arith.constant 104 : index + %132 = affine.apply #map5(%arg7, %arg8, %c104_46)[%c256_13, %c128] + %133 = affine.apply #map6(%132)[%c256] + %134 = affine.apply #map7(%132)[%c256] + %135 = vector.transfer_read %0[%133, %134], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%135, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c112_47 = arith.constant 112 : index + %136 = affine.apply #map5(%arg7, %arg8, %c112_47)[%c256_13, %c128] + %137 = affine.apply #map6(%136)[%c256] + %138 = affine.apply #map7(%136)[%c256] + %139 = vector.transfer_read %0[%137, %138], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%139, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + %c120_48 = arith.constant 120 : index + %140 = affine.apply #map5(%arg7, %arg8, %c120_48)[%c256_13, %c128] + %141 = affine.apply #map6(%140)[%c256] + %142 = affine.apply #map7(%140)[%c256] + %143 = vector.transfer_read %0[%141, %142], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + "vcix.iv"(%143, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> () + "vcix.i"(%c8_i64) {imm = 4 : i64, lmul = 0 : i64, opcode = 1 : i64, rd = 0 : i64, rs2 = 0 : i64, sew = 32 : i64} : (i64) -> () + %c0_49 = arith.constant 0 : index + %144 = affine.apply #map5(%arg6, %arg8, %c0_49)[%c256_13, %c128] + %145 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %146 = affine.apply #map6(%144)[%c256_12] + %147 = affine.apply #map7(%144)[%c256_12] + %148 = vector.transfer_read %2[%146, %147], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %149 = arith.addf %148, %145 : vector<8xf32> + vector.transfer_write %149, %2[%146, %147] : vector<8xf32>, memref<256x256xf32, 1> + %c8_50 = arith.constant 8 : index + %150 = affine.apply #map5(%arg6, %arg8, %c8_50)[%c256_13, %c128] + %151 = 
"vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %152 = affine.apply #map6(%150)[%c256_12] + %153 = affine.apply #map7(%150)[%c256_12] + %154 = vector.transfer_read %2[%152, %153], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %155 = arith.addf %154, %151 : vector<8xf32> + vector.transfer_write %155, %2[%152, %153] : vector<8xf32>, memref<256x256xf32, 1> + %c16_51 = arith.constant 16 : index + %156 = affine.apply #map5(%arg6, %arg8, %c16_51)[%c256_13, %c128] + %157 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %158 = affine.apply #map6(%156)[%c256_12] + %159 = affine.apply #map7(%156)[%c256_12] + %160 = vector.transfer_read %2[%158, %159], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %161 = arith.addf %160, %157 : vector<8xf32> + vector.transfer_write %161, %2[%158, %159] : vector<8xf32>, memref<256x256xf32, 1> + %c24_52 = arith.constant 24 : index + %162 = affine.apply #map5(%arg6, %arg8, %c24_52)[%c256_13, %c128] + %163 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %164 = affine.apply #map6(%162)[%c256_12] + %165 = affine.apply #map7(%162)[%c256_12] + %166 = vector.transfer_read %2[%164, %165], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %167 = arith.addf %166, %163 : vector<8xf32> + vector.transfer_write %167, %2[%164, %165] : vector<8xf32>, memref<256x256xf32, 1> + %c32_53 = arith.constant 32 : index + %168 = affine.apply #map5(%arg6, %arg8, %c32_53)[%c256_13, %c128] + %169 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %170 = affine.apply #map6(%168)[%c256_12] + %171 = affine.apply #map7(%168)[%c256_12] + %172 = vector.transfer_read %2[%170, %171], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %173 = arith.addf %172, %169 : vector<8xf32> + vector.transfer_write %173, %2[%170, %171] : vector<8xf32>, memref<256x256xf32, 1> + %c40_54 = 
arith.constant 40 : index + %174 = affine.apply #map5(%arg6, %arg8, %c40_54)[%c256_13, %c128] + %175 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %176 = affine.apply #map6(%174)[%c256_12] + %177 = affine.apply #map7(%174)[%c256_12] + %178 = vector.transfer_read %2[%176, %177], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %179 = arith.addf %178, %175 : vector<8xf32> + vector.transfer_write %179, %2[%176, %177] : vector<8xf32>, memref<256x256xf32, 1> + %c48_55 = arith.constant 48 : index + %180 = affine.apply #map5(%arg6, %arg8, %c48_55)[%c256_13, %c128] + %181 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %182 = affine.apply #map6(%180)[%c256_12] + %183 = affine.apply #map7(%180)[%c256_12] + %184 = vector.transfer_read %2[%182, %183], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %185 = arith.addf %184, %181 : vector<8xf32> + vector.transfer_write %185, %2[%182, %183] : vector<8xf32>, memref<256x256xf32, 1> + %c56_56 = arith.constant 56 : index + %186 = affine.apply #map5(%arg6, %arg8, %c56_56)[%c256_13, %c128] + %187 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %188 = affine.apply #map6(%186)[%c256_12] + %189 = affine.apply #map7(%186)[%c256_12] + %190 = vector.transfer_read %2[%188, %189], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %191 = arith.addf %190, %187 : vector<8xf32> + vector.transfer_write %191, %2[%188, %189] : vector<8xf32>, memref<256x256xf32, 1> + %c64_57 = arith.constant 64 : index + %192 = affine.apply #map5(%arg6, %arg8, %c64_57)[%c256_13, %c128] + %193 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %194 = affine.apply #map6(%192)[%c256_12] + %195 = affine.apply #map7(%192)[%c256_12] + %196 = vector.transfer_read %2[%194, %195], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %197 = arith.addf %196, %193 : vector<8xf32> + 
vector.transfer_write %197, %2[%194, %195] : vector<8xf32>, memref<256x256xf32, 1> + %c72_58 = arith.constant 72 : index + %198 = affine.apply #map5(%arg6, %arg8, %c72_58)[%c256_13, %c128] + %199 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %200 = affine.apply #map6(%198)[%c256_12] + %201 = affine.apply #map7(%198)[%c256_12] + %202 = vector.transfer_read %2[%200, %201], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %203 = arith.addf %202, %199 : vector<8xf32> + vector.transfer_write %203, %2[%200, %201] : vector<8xf32>, memref<256x256xf32, 1> + %c80_59 = arith.constant 80 : index + %204 = affine.apply #map5(%arg6, %arg8, %c80_59)[%c256_13, %c128] + %205 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %206 = affine.apply #map6(%204)[%c256_12] + %207 = affine.apply #map7(%204)[%c256_12] + %208 = vector.transfer_read %2[%206, %207], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %209 = arith.addf %208, %205 : vector<8xf32> + vector.transfer_write %209, %2[%206, %207] : vector<8xf32>, memref<256x256xf32, 1> + %c88_60 = arith.constant 88 : index + %210 = affine.apply #map5(%arg6, %arg8, %c88_60)[%c256_13, %c128] + %211 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %212 = affine.apply #map6(%210)[%c256_12] + %213 = affine.apply #map7(%210)[%c256_12] + %214 = vector.transfer_read %2[%212, %213], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %215 = arith.addf %214, %211 : vector<8xf32> + vector.transfer_write %215, %2[%212, %213] : vector<8xf32>, memref<256x256xf32, 1> + %c96_61 = arith.constant 96 : index + %216 = affine.apply #map5(%arg6, %arg8, %c96_61)[%c256_13, %c128] + %217 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %218 = affine.apply #map6(%216)[%c256_12] + %219 = affine.apply #map7(%216)[%c256_12] + %220 = vector.transfer_read %2[%218, %219], 
%cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %221 = arith.addf %220, %217 : vector<8xf32> + vector.transfer_write %221, %2[%218, %219] : vector<8xf32>, memref<256x256xf32, 1> + %c104_62 = arith.constant 104 : index + %222 = affine.apply #map5(%arg6, %arg8, %c104_62)[%c256_13, %c128] + %223 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %224 = affine.apply #map6(%222)[%c256_12] + %225 = affine.apply #map7(%222)[%c256_12] + %226 = vector.transfer_read %2[%224, %225], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %227 = arith.addf %226, %223 : vector<8xf32> + vector.transfer_write %227, %2[%224, %225] : vector<8xf32>, memref<256x256xf32, 1> + %c112_63 = arith.constant 112 : index + %228 = affine.apply #map5(%arg6, %arg8, %c112_63)[%c256_13, %c128] + %229 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %230 = affine.apply #map6(%228)[%c256_12] + %231 = affine.apply #map7(%228)[%c256_12] + %232 = vector.transfer_read %2[%230, %231], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %233 = arith.addf %232, %229 : vector<8xf32> + vector.transfer_write %233, %2[%230, %231] : vector<8xf32>, memref<256x256xf32, 1> + %c120_64 = arith.constant 120 : index + %234 = affine.apply #map5(%arg6, %arg8, %c120_64)[%c256_13, %c128] + %235 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32> + %236 = affine.apply #map6(%234)[%c256_12] + %237 = affine.apply #map7(%234)[%c256_12] + %238 = vector.transfer_read %2[%236, %237], %cst_15 : memref<256x256xf32, 1>, vector<8xf32> + %239 = arith.addf %238, %235 : vector<8xf32> + vector.transfer_write %239, %2[%236, %237] : vector<8xf32>, memref<256x256xf32, 1> + } {inner_loop = true} + } {inner_loop = true} + } {inner_loop = true} + } {accumulation_loop = true, subtile_loop = "k"} + affine.for %arg5 = 0 to 1 { + } {inner_loop = false} + %3 = affine.apply #map(%arg3, %arg4) + %c1_0 = arith.constant 
1 : index + memref.dma_start %2[%c0, %c0], %arg2[%3], %c3, %alloc[%c0], %c1_0, %c1 : memref<256x256xf32, 1>, memref<65536xf32>, memref<1xi32> {dram_stride = [256, 1], padding = 0 : i64, sram_stride = [1, 256]} + } {outer_loop = true, subtile_loop = "n"} + } {outer_loop = true, subtile_loop = "m"} + return + } + func.func @wrapper_kernel(%arg0: memref<65536xf32>, %arg1: memref<65536xf32>, %arg2: memref<65536xf32>) { + call @kernel(%arg0, %arg1, %arg2) : (memref<65536xf32>, memref<65536xf32>, memref<65536xf32>) -> () + return + } +} diff --git a/tests/test_togsim_emitc.py b/tests/test_togsim_emitc.py new file mode 100644 index 00000000..b0bd2d8e --- /dev/null +++ b/tests/test_togsim_emitc.py @@ -0,0 +1,152 @@ +"""Tests for the C4 emitc lowering + compiled .so trace producer (P2). + +The pipeline under test (docs/design/togsim_cpp_trace.md, sec 5-7): + + post-vcix .mlir --build_skeleton--> skeleton+API + --lower_to_emitc--> EmitC module + --mlir-translate--> C++ + --g++ -shared----> trace .so (exports togsim_kernel; + togsim_* left undefined) + +`test_build_trace_so` builds the .so and checks the EmitC/symbol-table shape. +`test_trace_so_runs` additionally dlopens it against a stub runtime and confirms +the producer executes and emits a non-empty deterministic trace. + +Both are skipped unless the MLIR bindings, `mlir-translate` (from +TORCHSIM_LLVM_PATH), a host C++ compiler, AND a post-vcix `.mlir` fixture (via +`TOGSIM_SKELETON_FIXTURE`) are available -- the same fixture used by +test_togsim_skeleton.py. 
+""" +import importlib.util +import os +import pathlib +import shutil +import subprocess +import sys +import tempfile + +import pytest + +_ROOT = pathlib.Path(__file__).resolve().parents[1] +_CXX = os.environ.get("CXX", "g++") +_INCLUDE = _ROOT / "TOGSim" / "include" + + +def _mlir_translate(): + return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"), + "mlir-translate") + + +def _tools_ready(): + return (importlib.util.find_spec("mlir") is not None + and os.path.isfile(_mlir_translate()) + and shutil.which(_CXX) is not None) + + +def _fixture(): + fix = os.environ.get("TOGSIM_SKELETON_FIXTURE") + if not fix or not os.path.isfile(fix): + pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir") + return fix + + +_HARNESS = r''' +#include +#include +#include +#include +#include "togsim_runtime.h" +static int n_dma=0, n_membar=0, n_compute=0, n_core=0, bad=0; +extern "C" { +void togsim_dma(EmitCtx*, int32_t, int32_t, uint64_t, int32_t, + const int64_t*, const int64_t*, int32_t, int32_t, + int32_t, uint64_t, const int64_t*, int32_t, + const int64_t*, int32_t){ ++n_dma; } +void togsim_compute(EmitCtx*, uint64_t, int32_t, int32_t, const int64_t*, + const int64_t*, int32_t, const int64_t*, int32_t){ ++n_compute; } +void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t, const int64_t*, int32_t){ + ++n_membar; if(tag_id<0) ++bad; } // tag_id pairs it with its async dma +void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n){ + ++n_core; fn(ctx, iv, n); } // count a work-item + run its (outlined) body +void togsim_compute_barrier(EmitCtx*){} +} +int main(int argc, char** argv){ + void* h = dlopen(argv[1], RTLD_NOW | RTLD_GLOBAL); + if(!h){ printf("dlopen failed: %s\n", dlerror()); return 2; } + auto emit = (void(*)(EmitCtx*, int64_t*, int32_t))dlsym(h, "togsim_kernel"); + if(!emit){ printf("dlsym failed: %s\n", dlerror()); return 3; } + emit(nullptr, nullptr, 0); + printf("TRACE core=%d dma=%d membar=%d compute=%d 
bad=%d\n", + n_core, n_dma, n_membar, n_compute, bad); + return 0; +} +''' + + +@pytest.mark.skipif(not _tools_ready(), + reason="need mlir bindings + mlir-translate + C++ compiler") +def test_build_trace_so(): + fix = _fixture() + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4 + + with tempfile.TemporaryDirectory() as d: + so = os.path.join(d, "trace.so") + emitc_text = c4.build_trace_so(fix, so) + assert os.path.isfile(so) + + # EmitC form: one entry func, dma/memory_barrier/compute as call_opaque targets. + assert "emitc.func" in emitc_text + assert ("@%s" % c4.ENTRY) in emitc_text + assert 'emitc.call_opaque "togsim_dma"' in emitc_text + assert 'emitc.call_opaque "togsim_memory_barrier"' in emitc_text + assert 'emitc.call_opaque "togsim_compute"' in emitc_text + + # Symbol table: entry exported (defined, text), runtime hooks undefined + # so the TOGSim loader resolves them at dlopen. + nm = subprocess.run(["nm", "-D", so], capture_output=True, text=True).stdout + syms = {parts[-1]: parts[-2] for parts in + (ln.split() for ln in nm.splitlines()) if len(parts) >= 2} + assert syms.get("togsim_kernel") == "T", nm + assert syms.get("togsim_dma") == "U", nm + assert syms.get("togsim_dispatch") == "U", nm + assert syms.get("togsim_memory_barrier") == "U", nm + # The per-work-item dispatch wrapper is emitted (outlined tile fn). 
+ assert 'emitc.call_opaque "togsim_dispatch"' in emitc_text + + +@pytest.mark.skipif(not _tools_ready(), + reason="need mlir bindings + mlir-translate + C++ compiler") +def test_trace_so_runs(): + fix = _fixture() + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4 + + with tempfile.TemporaryDirectory() as d: + so = os.path.join(d, "trace.so") + c4.build_trace_so(fix, so) + + harness_cpp = os.path.join(d, "harness.cpp") + harness_bin = os.path.join(d, "harness") + with open(harness_cpp, "w") as fh: + fh.write(_HARNESS) + # -rdynamic so the harness's togsim_* are visible to the dlopened .so. + build = subprocess.run( + [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE), + harness_cpp, "-o", harness_bin, "-ldl"], + capture_output=True, text=True) + assert build.returncode == 0, build.stderr + + run = subprocess.run([harness_bin, so], capture_output=True, text=True) + assert run.returncode == 0, run.stdout + run.stderr + out = run.stdout.strip() + assert out.startswith("TRACE "), out + counts = dict(kv.split("=") for kv in out.split()[1:]) + # The producer ran and emitted a real trace, with >=1 work-item (core alloc). + assert int(counts["core"]) >= 1 + assert int(counts["dma"]) >= 1 + assert int(counts["compute"]) >= 1 + # Async loads are synced by explicit memory barriers, each carrying a + # valid (non-negative) tag_id that pairs it with its dma. + assert int(counts["membar"]) >= 1, out + assert int(counts["bad"]) == 0, out diff --git a/tests/test_togsim_runtime.py b/tests/test_togsim_runtime.py new file mode 100644 index 00000000..a5d6cb3d --- /dev/null +++ b/tests/test_togsim_runtime.py @@ -0,0 +1,189 @@ +"""P3 task 5: the TOGSim C6 runtime + loader (togsim_runtime.cc / togsim_loader.h). 
+ +Builds a producer `.so` from a post-vcix fixture, links the real C6 runtime, runs +the loader (`run_producer`) against the `.so`, and checks the recorded trace: +DRAM addresses are resolved (base[arg_id] + offset*elem_bytes), compute cycles +are looked up from the cycle table, and every wait gets a handle a dma minted. + +Uses a checked-in post-vcix `.mlir` fixture (tests/fixtures/), so it is +self-contained; skipped only when the MLIR bindings, `mlir-translate`, or a C++ +compiler are missing. +""" +import importlib.util +import os +import pathlib +import shutil +import subprocess +import sys +import tempfile + +import pytest + +_ROOT = pathlib.Path(__file__).resolve().parents[1] +_CXX = os.environ.get("CXX", "g++") +_INCLUDE = _ROOT / "TOGSim" / "include" +_RUNTIME = _ROOT / "TOGSim" / "src" / "togsim_runtime.cc" + + +def _mlir_translate(): + return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"), + "mlir-translate") + + +def _tools_ready(): + return (importlib.util.find_spec("mlir") is not None + and os.path.isfile(_mlir_translate()) + and shutil.which(_CXX) is not None + and _RUNTIME.is_file()) + + +# Checked-in post-vcix kernel: a 256^3 single-output-tile GEMM (X/W/Y_spad +# 256x256), matching the trace assertions below. Self-contained so the test +# runs wherever the tools are present -- no setup/env needed. +_FIXTURE = pathlib.Path(__file__).resolve().parent / "fixtures" / "gemm256_postvcix.mlir" + + +def _fixture(): + if not _FIXTURE.is_file(): + pytest.skip(f"missing checked-in fixture {_FIXTURE}") + return str(_FIXTURE) + + +# Drives the loader with known tensor bases + a synthetic cycle table, then +# checks the recorded trace. Tailored to a single-output-tile GEMM (256^3): +# 3 dmas A/B/C at offset 0 -> addr == base; args 0/1/2; dirs load/load/store. 
+_MAIN = r''' +#include +#include +#include +#include +#include "togsim_loader.h" +using namespace togsim; +int main(int argc, char** argv) { + uint64_t bases[3] = {0x1000, 0x2000, 0x3000}; + int64_t cyc[3] = {100, 200, 300}; + int64_t ovl[3] = {0, 200, 172}; + int32_t pcores[1] = {0}; // round-robin work-items over core 0 (single-core harness) + RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, pcores, 1); + if (!r.ok) { printf("run failed\n"); return 2; } + int ndisp=0, nd=0, nc=0, nm=0, fail=0; + std::vector dma_a; std::vector dma_arg, dma_dir; + std::vector> async_tags; // (tag_id, tag_slot) of async dmas + for (auto& t : r.trace) { + if (t.kind == TraceRec::TILE_BEGIN) ndisp++; // one per work-item + else if (t.kind == TraceRec::DMA) { + nd++; dma_a.push_back(t.addr); + dma_arg.push_back(t.arg_id); dma_dir.push_back(t.dir); + if (t.is_async) async_tags.push_back({t.tag_id, t.tag_slot}); + } else if (t.kind == TraceRec::COMPUTE) { + nc++; + int64_t want = (t.tile_id < 3) ? cyc[t.tile_id] : -1; + if (t.cycle != want) { printf("compute %lu cyc %ld!=%ld\n", + (unsigned long)t.tile_id, (long)t.cycle, (long)want); fail++; } + } else if (t.kind == TraceRec::MEMORY_BAR) { + nm++; bool ok=false; + for (auto& k : async_tags) if (k.first==t.tag_id && k.second==t.tag_slot) ok=true; + if (!ok) { printf("membar tag (%d,%lu) pairs no async dma\n", + t.tag_id, (unsigned long)t.tag_slot); fail++; } + } + } + const uint64_t exp[3] = {0x1000, 0x2000, 0x3000}; + const int ea[3] = {0,1,2}, ed[3] = {0,0,1}; + for (int i = 0; i < nd && i < 3; ++i) + if (dma_a[i]!=exp[i] || dma_arg[i]!=ea[i] || dma_dir[i]!=ed[i]) { + printf("dma[%d] addr=%#lx arg=%d dir=%d\n", i, + (unsigned long)dma_a[i], dma_arg[i], dma_dir[i]); fail++; + } + printf("dispatch=%d dma=%d compute=%d membar=%d fail=%d\n", ndisp, nd, nc, nm, fail); + printf(fail ? "RESULT FAIL\n" : "RESULT PASS\n"); + return fail ? 
1 : 0; +} +''' + + +@pytest.mark.skipif(not _tools_ready(), + reason="need mlir bindings + mlir-translate + C++ compiler + runtime") +def test_runtime_loads_and_records(): + fix = _fixture() + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4 + + with tempfile.TemporaryDirectory() as d: + so = os.path.join(d, "trace.so") + c4.build_trace_so(fix, so) + + main_cpp = os.path.join(d, "main.cpp") + binp = os.path.join(d, "runtime_test") + with open(main_cpp, "w") as fh: + fh.write(_MAIN) + build = subprocess.run( + [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE), + main_cpp, str(_RUNTIME), "-o", binp, "-ldl"], + capture_output=True, text=True) + assert build.returncode == 0, build.stderr + + run = subprocess.run([binp, so], capture_output=True, text=True) + out = run.stdout + assert "RESULT PASS" in out, out + run.stderr + assert run.returncode == 0, out + # at least the GEMM's 3 dmas were recorded with resolved addresses. + line = [l for l in out.splitlines() if l.startswith("dispatch=")][0] + counts = dict(kv.split("=") for kv in line.split()) + assert int(counts["dma"]) >= 1 + assert int(counts["compute"]) >= 1 + assert int(counts["fail"]) == 0 + + +_SIM_MAIN = r''' +#include +#include +#include "togsim_loader.h" +using namespace togsim; +int main(int argc, char** argv) { + uint64_t bases[3] = {0x1000, 0x2000, 0x3000}; + int64_t cyc[3] = {100, 200, 300}; + int64_t ovl[3] = {0, 200, 172}; + int32_t pcores[1] = {0}; // round-robin work-items over core 0 (single-core harness) + RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, pcores, 1); + if (!r.ok) { printf("run failed\n"); return 2; } + TimingParams p; p.dma_latency = 100; + SimResult s = simulate(r, p); + // serial baseline: no overlap at all. 
+ uint64_t serial = 0; + for (auto& t : r.trace) { + if (t.kind == TraceRec::DMA) serial += p.dma_latency; + else if (t.kind == TraceRec::COMPUTE) serial += (uint64_t)t.cycle; + } + printf("SIM total=%lu compute=%d dma=%d serial=%lu\n", + (unsigned long)s.total_cycle, s.n_compute, s.n_dma, (unsigned long)serial); + // The trace is schedulable into cycles; overlap (dma||compute, compute + // pipelining) makes it no worse than the fully-serial baseline. + bool ok = s.total_cycle > 0 && s.n_compute > 0 && s.total_cycle <= serial; + printf(ok ? "RESULT PASS\n" : "RESULT FAIL\n"); + return ok ? 0 : 1; +} +''' + + +@pytest.mark.skipif(not _tools_ready(), + reason="need mlir bindings + mlir-translate + C++ compiler + runtime") +def test_simulate_produces_cycles(): + fix = _fixture() + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4 + + with tempfile.TemporaryDirectory() as d: + so = os.path.join(d, "trace.so") + c4.build_trace_so(fix, so) + main_cpp = os.path.join(d, "sim.cpp") + binp = os.path.join(d, "sim_test") + with open(main_cpp, "w") as fh: + fh.write(_SIM_MAIN) + build = subprocess.run( + [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE), + main_cpp, str(_RUNTIME), "-o", binp, "-ldl"], + capture_output=True, text=True) + assert build.returncode == 0, build.stderr + run = subprocess.run([binp, so], capture_output=True, text=True) + assert "RESULT PASS" in run.stdout, run.stdout + run.stderr + assert run.returncode == 0, run.stdout diff --git a/tests/test_togsim_skeleton.py b/tests/test_togsim_skeleton.py new file mode 100644 index 00000000..56601966 --- /dev/null +++ b/tests/test_togsim_skeleton.py @@ -0,0 +1,184 @@ +"""Tests for the C++ trace-generation front-end pieces (docs/design/togsim_cpp_trace.md). + +Two layers: + +* `test_togsim_ops_contract` runs anywhere (no MLIR bindings, no torch). 
It pins + the skeleton+API vocabulary (`togsim_ops.py`) and checks it stays in lockstep + with the runtime ABI header (`togsim_runtime.h`) -- the single thing most + likely to silently drift. +* `test_build_skeleton_on_fixture` exercises the real `build_skeleton` pass, and + is skipped unless the MLIR bindings are importable AND a post-vcix `.mlir` + fixture is supplied via the `TOGSIM_SKELETON_FIXTURE` env var. (A valid + build_tog-compatible fixture is hard to hand-write reliably; point this at a + kernel dump from a real run.) +""" +import os +import importlib.util +import pathlib + +import pytest + +_ROOT = pathlib.Path(__file__).resolve().parents[1] +_OPS_PY = _ROOT / "PyTorchSimFrontend" / "mlir" / "passes" / "togsim_ops.py" +_HEADER = _ROOT / "TOGSim" / "include" / "togsim_runtime.h" + + +def _load_togsim_ops(): + spec = importlib.util.spec_from_file_location("togsim_ops", _OPS_PY) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def test_togsim_ops_contract(): + ts = _load_togsim_ops() + header = _HEADER.read_text() + + # Every op maps to a callee, and every callee is the header's free function. + assert set(ts.EMITC_CALLEE) == set(ts.OP_NAMES) + for callee in ts.EMITC_CALLEE.values(): + assert callee in header, f"{callee} missing from togsim_runtime.h" + + # Entry point symbol agrees with the header. + assert ts.ENTRY_SYMBOL == "togsim_kernel" + assert ts.ENTRY_SYMBOL in header + + # Runtime callee emitted directly by lower_to_emitc: the work-item dispatch + # wrapper. (The outlined tile fn TILE_SYMBOL is producer-generated.) + assert ts.DISPATCH_CALLEE in header + + # Direction enum agrees with the header's togsim_dma_dir. 
+ assert (ts.DIR_LOAD, ts.DIR_STORE) == (0, 1) + assert "TOGSIM_DMA_LOAD = 0" in header + assert "TOGSIM_DMA_STORE = 1" in header + + +def _mlir_available(): + return importlib.util.find_spec("mlir") is not None + + +@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed") +def test_build_skeleton_on_fixture(): + fixture = os.environ.get("TOGSIM_SKELETON_FIXTURE") + if not fixture or not os.path.isfile(fixture): + pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir") + + import sys + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import build_skeleton + + import mlir.ir as ir + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(pathlib.Path(fixture).read_text(), ctx) + report = build_skeleton.build_skeleton(module) + out = str(module) + + # The data-movement ops are gone; the API ops took their place. + assert "memref.dma_start" not in out + assert "memref.dma_wait" not in out + assert "togsim.dma" in out + assert "togsim.memory_barrier" in out # the explicit async-DMA sync (was dma_wait) + assert "event_id" not in out # static pairing replaced by the runtime tag + # Loop skeleton is preserved. + assert ("affine.for" in out) or ("scf.for" in out) + assert module.operation.verify() + print(report) + + +@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed") +def test_strip_accum_terms_drops_reduction_marker(): + """Regression: the dma_wait tag index built by lower_to_vcix carries a `-d_i` + term for each accumulation (reduction) loop var -- a sentinel marker, not an + offset. build_skeleton must drop those so a memory_barrier waits on the same + subtile slot the async load wrote; otherwise the producer evaluates `-acc_iv` + to a negative slot at reduction iteration > 0, the recorded barrier slot + diverges from the load slot, and TOGSim aborts with "Key does not exist in ... + tag table" on subtile + multi-tile-K. 
See docs/design/togsim_cpp_trace.md and + legacy TileGraphParser.cc (which skips stride -1 for the same reason).""" + import sys + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import build_skeleton as bs + + import mlir.ir as ir + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx, ir.Location.unknown(ctx): + module = ir.Module.parse( + "func.func @k() {\n" + " %r = arith.constant 1 : index\n" # stand-in reduction iv + " %a = arith.constant 0 : index\n" # subtile dim 1 + " %b = arith.constant 0 : index\n" # subtile dim 2 + " return\n" + "}", ctx) + block = module.body.operations[0].regions[0].blocks[0] + consts = [op.results[0] for op in block.operations if op.name == "arith.constant"] + anchor = [op for op in block.operations if op.name == "func.return"][0] + r, a, b = consts + + def neg_dims(val): + amap = ir.AffineMapAttr(val.owner.attributes["map"]).value + return [p for p in (bs._neg_coeff_dim(s) for s in bs._flatten_add(amap.results[0])) + if p is not None] + + # #map8-style: -d0 (reduction) + d1 + d2 floordiv 2. + d0, d1, d2 = (ir.AffineDimExpr.get(i) for i in range(3)) + expr = d0 * -1 + d1 + ir.AffineExpr.get_floor_div(d2, 2) + with ir.InsertionPoint(anchor): + apply = ir.Operation.create( + "affine.apply", results=[ir.IndexType.get()], operands=[r, a, b], + attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(3, 0, [expr]))}) + tag_in = apply.results[0] + assert neg_dims(tag_in) == [0] # the reduction marker is present + + tag_out = bs._strip_accum_terms(ctx, tag_in, anchor) + assert tag_out is not tag_in # a new, reduced apply was emitted + out_map = ir.AffineMapAttr(tag_out.owner.attributes["map"]).value + assert out_map.n_dims == 2 # the reduction dim was dropped + assert neg_dims(tag_out) == [] # no reduction marker remains + assert list(tag_out.owner.operands) == [a, b] # only the subtile operands survive + + # No-op: an index with no reduction marker is returned unchanged. 
+ plain = d0 + d1 + with ir.InsertionPoint(anchor): + papply = ir.Operation.create( + "affine.apply", results=[ir.IndexType.get()], operands=[a, b], + attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(2, 0, [plain]))}) + pin = papply.results[0] + assert bs._strip_accum_terms(ctx, pin, anchor) is pin + + assert module.operation.verify() + + +@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed") +def test_cycle_table_on_fixture(): + fixture = os.environ.get("TOGSIM_SKELETON_FIXTURE") + if not fixture or not os.path.isfile(fixture): + pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir") + + import sys + sys.path.insert(0, str(_ROOT)) + from PyTorchSimFrontend.mlir.passes import build_skeleton, cycle_table + + import mlir.ir as ir + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(pathlib.Path(fixture).read_text(), ctx) + build_skeleton.build_skeleton(module) + types = cycle_table._compute_types(module) + # synthetic per-tile cycles (gem5 sample-mode is reused at P3 task 5). + cyc = [10 * (i + 1) for i in range(len(types))] + x_off, w_off = 4, 0 + table = cycle_table.build_cycle_table(module, cyc, x_off, w_off) + + assert len(table) == len(types) >= 1 + # cycle is carried verbatim; overlapping_cycle follows the legacy formula. + for (cy, ov), t, raw in zip(table, types, cyc): + assert cy == raw + if t == cycle_table.VECTOR_COMPUTE: + assert ov == 0 + else: + off = w_off if t == cycle_table.MATMUL_PRELOAD else x_off + assert ov == max(raw - off, 0)