PSAL-POSTECH · YWHyuk · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
@@ -1,3 +1,9 @@
+# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
+# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
+# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
+# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
+# current pipeline does not break; to be retired once the trace pipeline (P3+)
+# stabilizes. See docs/design/togsim_cpp_trace.md.
 import os
 import sys
 import importlib.util

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -85,6 +85,7 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi
 Located under `configs/*.yml`:
 
 - `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core`
+- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.)
 - `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits`
 - `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path`
 - `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path`

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
@@ -5,7 +5,7 @@
 import torch
 
 from PyTorchSimFrontend import extension_config
-from torch._inductor.codecache import get_hash, write
+from torch._inductor.codecache import get_hash, write, write_atomic
 from torch._inductor.async_compile import AsyncCompile
 from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
@@ -23,6 +23,13 @@ def get_write_path(src_code):
     return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip())))
 
 
+_HEADER_BY_HASH = {}
+def store_header(src_code, spike_header, gem5_header):
+    _HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header)
+def get_header(src_code):
+    return _HEADER_BY_HASH.get(get_hash(src_code.strip()))
+
+
 def get_lock_path(write_path):
     """Return lock file path for the given write_path (per-source_code lock)."""
     return os.path.join(write_path, ".compile.lock")
@@ -128,40 +135,52 @@ def load(cls, source_code,
         vlen = kwargs['vlen']
         vlenb = vlen // 8
         write_path = get_write_path(source_code)
-        key, input_path = write(source_code, "mlir", specified_dir=write_path)
-        # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
-        # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
-        # (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
+        os.makedirs(write_path, exist_ok=True)
+        global_var_header = kwargs.get("global_var_header")
+        if global_var_header is not None:
+            write_atomic(os.path.join(write_path, "global_var.h"), global_var_header)
+        gem5_global_var_header = kwargs.get("gem5_global_var_header")
+        if gem5_global_var_header is not None:
+            write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header)
+        # The compile rewrites the kernel .mlir in place (run_python_passes) and reads
+        # it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
+        # candidate and the final kernel -- share a write_path, so hold the per-path
+        # lock across the whole build to keep them from interleaving, and skip the
+        # rebuild when a prior build already finished (its tile_graph.onnx exists).
+        from filelock import FileLock
         from PyTorchSimFrontend.mlir.passes import (
             run_python_passes, run_module_passes, POST_OPT_PASSES,
             run_standard_lowering, run_tog,
         )
-        run_python_passes(input_path, vectorlane=vectorlane_size)
-        new_input_path = os.path.splitext(input_path)[0]
-        raw_tog_path = new_input_path + "_tog.py"
         tog_path = os.path.join(write_path, "tile_graph.onnx")
-        sample_mlir_path = new_input_path + "_sample"
-        validation_binary_path = os.path.join(write_path, validation_binary_name)
-        gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
-
-        from filelock import FileLock
-        os.makedirs(write_path, exist_ok=True)
         lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
-
-        if spad_info is not None:
-            link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
-        else:
-            link_option = ""
-        # Generate LLVM kernel calller and binary for validation
-        if extension_config.pytorchsim_functional_mode:
-            # Use custom malloc to avoid size error
-            new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
-            cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
-            opt_pad_cmd = shlex.split(cmds[0])
-            translate_cmd = shlex.split(cmds[1])
-            llc_cmd = shlex.split(cmds[2])
-            llc_asm_cmd = shlex.split(cmds[3])
-            with lock:
+        with lock:
+            key, input_path = write(source_code, "mlir", specified_dir=write_path)
+            if os.path.isfile(tog_path):
+                return key
+            # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
+            # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
+            # (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
+            run_python_passes(input_path, vectorlane=vectorlane_size)
+            new_input_path = os.path.splitext(input_path)[0]
+            raw_tog_path = new_input_path + "_tog.py"
+            sample_mlir_path = new_input_path + "_sample"
+            validation_binary_path = os.path.join(write_path, validation_binary_name)
+            gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
+
+            if spad_info is not None:
+                link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
+            else:
+                link_option = ""
+            # Generate LLVM kernel caller and binary for validation
+            if extension_config.pytorchsim_functional_mode:
+                # Use custom malloc to avoid size error
+                new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
+                cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
+                opt_pad_cmd = shlex.split(cmds[0])
+                translate_cmd = shlex.split(cmds[1])
+                llc_cmd = shlex.split(cmds[2])
+                llc_asm_cmd = shlex.split(cmds[3])
                 try:
                     # loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
                     subprocess.check_call(opt_pad_cmd)
@@ -195,17 +214,11 @@ def load(cls, source_code,
                     )
                     raise SpadOverflowError()
 
-        # Skip if TOG file already exists
-        if os.path.isfile(tog_path):
-            return key
-
-        # Launch tile graph generator
-        gem5_pad_cmd = shlex.split(gem5_cmds[0])
-        gem5_translate_cmd = shlex.split(gem5_cmds[1])
-        gem5_llc_cmd = shlex.split(gem5_cmds[2])
+            # Launch tile graph generator
+            gem5_pad_cmd = shlex.split(gem5_cmds[0])
+            gem5_translate_cmd = shlex.split(gem5_cmds[1])
+            gem5_llc_cmd = shlex.split(gem5_cmds[2])
 
-        lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
-        with lock:
             try:
                 # mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix
                 # and writes the post-vcix IR. The tile-operation-graph pass is ported
@@ -241,8 +254,19 @@ def load(cls, source_code,
             # Run cyclesim
             cyclesim = CycleSimulator()
             cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
+            # Snapshot for the P3-trace hook below: generate_tile_graph consumes
+            # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
+            cycle_list_for_trace = list(cycle_list)
 
             # Create TOG
+            # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
+            # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
+            # is being superseded by the C++ trace pipeline (build_skeleton +
+            # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
+            # per-tile cycle_list / x_offset / w_offset computed here are exactly
+            # what cycle_table.build_cycle_table will reuse, so both paths stay
+            # cycle-consistent during the transition. Kept live (pipeline must not
+            # break); to be retired once the trace pipeline (P3+) stabilizes.
             w_offset, x_offset = vectorlane_size, vectorlane_size
             if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
                 x_offset = kwargs['loop_size'][-3]
@@ -258,6 +282,33 @@ def load(cls, source_code,
                 w_offset=w_offset, # FIXME.
                 vector_lane=vectorlane_size
             )
+
+            # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
+            # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
+            # is the default simulation path (the C++ TOG). The legacy ONNX TOG is
+            # DEPRECATED and opt-in via TORCHSIM_LEGACY_TOG=1; in that case the .so is
+            # unused, so emission is skipped. Best-effort: never breaks the compile.
+            if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
+                try:
+                    import mlir.ir as ir
+                    from PyTorchSimFrontend.mlir.passes import (
+                        build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
+                    pv = sample_mlir_path + "_postvcix.mlir"
+                    _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
+                    with _ctx:
+                        _mod = ir.Module.parse(open(pv).read(), _ctx)
+                        _bs.build_skeleton(_mod)
+                        _ntiles = len(_ct._compute_types(_mod))
+                        # align lengths: gem5 gives one numCycles per compute node;
+                        # pad with the last value / truncate if it disagrees.
+                        _cl = list(cycle_list_for_trace)
+                        if _cl and len(_cl) != _ntiles:
+                            _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
+                        _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
+                    _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
+                    _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
+                except Exception as e:
+                    logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
         return key
 
 class CustomAsyncCompile(AsyncCompile):

diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -54,7 +54,7 @@ def __str__(self) -> str:
     def make_run_fn(
         self, input_tensors: torch.Tensor, output_tensors: torch.Tensor
     ) -> Callable[[], None]:
-        from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile
+        from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile, get_header
         custom_async_compile = CustomAsyncCompile()
 
         # Check already cached result.
@@ -80,12 +80,15 @@ def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs):
                 return cached_run_fn
 
         # Run a candidate code
+        _headers = get_header(self.source_code)
+        _header_kwargs = {} if _headers is None else {
+            "global_var_header": _headers[0], "gem5_global_var_header": _headers[1]}
         run_method = custom_async_compile.mlir(
             self.source_code, vectorlane_size=self.extra_args["vector_lane"],
             loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"],
             vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
             origins=self.extra_args["origins"], silent_mode=True,
-            autotune=self.extra_args['autotune'])
+            autotune=self.extra_args['autotune'], **_header_kwargs)
 
         args = [
             tensor

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -17,7 +17,6 @@
 from torch._inductor.codegen import cpp, wrapper, common, memory_planning
 from torch._inductor.ir import GraphPartitionSignature
 from torch._inductor.virtualized import V, _ops as ops
-from torch._inductor.codecache import write_atomic
 from torch._inductor.utils import (
     IndentedBuffer,
     is_welford_reduction,
@@ -1120,28 +1119,23 @@ def codegen_nodes(self, nodes, kernel_name):
         src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
         self._prepare_simulator_headers(src_code)
         if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode:
-            optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2]
+            # Use temporaries: autotune returns [None, None, None] when it cannot
+            # autotune (e.g. a size-1 pointwise kernel with ranges == [1]), and
+            # unpacking into meta_code would clobber the valid arg_attributes that
+            # the fall-through below returns.
+            optimal_src_code, optimal_meta_code = self.autotune(nodes, kernel_name)[:2]
             if optimal_src_code is not None:
-                return optimal_src_code, meta_code
+                return optimal_src_code, optimal_meta_code
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
-        from filelock import FileLock
-
-        write_path = extension_codecache.get_write_path(src_code)
-        os.makedirs(write_path, exist_ok=True)
-
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-
         spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n"
         spad_section_end_symbol = (
             f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
         )
-        lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
-        with lock:
-            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
-            write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        spike_content = self.header.getvalue() + spad_end_symbol + spad_section_end_symbol
+        gem5_content = self.gem5_header.getvalue()
+        extension_codecache.store_header(src_code, spike_content, gem5_content)
 
     def get_arg_info(self, name):
         arg_info = dict()

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -5,6 +5,7 @@
 import operator
 from sympy import symbols, sympify
 from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend import extension_codecache
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from torch.utils._ordered_set import OrderedSet
@@ -333,6 +334,10 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info
             codecache_def.writeline(f"spad_info={spad_info},")
             codecache_def.writeline(f"origins={origins},")
             codecache_def.writeline(f"arg_attributes={meta_code},")
+            headers = extension_codecache.get_header(src_code)
+            if headers is not None:
+                codecache_def.writeline(f"global_var_header='''{headers[0]}''',")
+                codecache_def.writeline(f"gem5_global_var_header='''{headers[1]}''',")
             codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})")
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False)
         return kernel_name

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -21,7 +21,6 @@
 from torch._inductor.autotune_process import TensorMeta
 from torch._inductor.virtualized import V, NullHandler, _ops as ops
 from torch._inductor.utils import IndentedBuffer
-from torch._inductor.codecache import write_atomic
 
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
@@ -613,22 +612,11 @@ def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes,
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
-        from filelock import FileLock
-
         spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
         spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
-
-        write_path = extension_codecache.get_write_path(src_code)
-        os.makedirs(write_path, exist_ok=True)
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-
-        lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
-        with lock:
-            if not os.path.exists(spike_write_path):
-                write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol)
-            if not os.path.exists(gem5_write_path):
-                write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        spike_content = self.header.getvalue()+spad_end_symbol+spad_section_end_symbol
+        gem5_content = self.gem5_header.getvalue()
+        extension_codecache.store_header(src_code, spike_content, gem5_content)
 
     def codegen_prologue_body(self):
         body = IndentedBuffer()

diff --git a/PyTorchSimFrontend/mlir/passes/__init__.py b/PyTorchSimFrontend/mlir/passes/__init__.py
@@ -76,8 +76,12 @@ def run_module_passes(in_path, out_path, passes, **opts):
             p.run(module, **opts)
         out = str(module)
 
-    with open(out_path, "w") as f:
-        f.write(out)
+    # Atomic write: run_python_passes rewrites the kernel .mlir in place. Do not rely
+    # on the caller holding load()'s FileLock: a concurrent compile of the same source
+    # must never see a truncated file -- mlir-opt would parse it to an empty module and
+    # silently drop the kernel (-> undefined reference to wrapper_kernel at link).
+    from torch._inductor.codecache import write_atomic
+    write_atomic(out_path, out)
     return True