diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index a12460e3..0de76246 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -1,3 +1,9 @@
+# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
+# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
+# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
+# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
+# current pipeline does not break; to be retired once the trace pipeline (P3+)
+# stabilizes. See docs/design/togsim_cpp_trace.md.
 import os
 import sys
 import importlib.util
diff --git a/CLAUDE.md b/CLAUDE.md
index 12d48082..5a3a47cd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -85,6 +85,7 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi
 Located under `configs/*.yml`:
 
 - `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core`
+- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.)
 - `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits`
 - `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path`
 - `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path`
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 492133a3..785a3d95 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -5,7 +5,7 @@
 import torch
 
 from PyTorchSimFrontend import extension_config
-from torch._inductor.codecache import get_hash, write
+from torch._inductor.codecache import get_hash, write, write_atomic
 from torch._inductor.async_compile import AsyncCompile
 from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
@@ -23,6 +23,13 @@ def get_write_path(src_code):
     return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip())))
 
 
+_HEADER_BY_HASH = {}  # in-memory: get_hash(stripped source) -> (spike_header, gem5_header)
+def store_header(src_code, spike_header, gem5_header):  # record both header texts for src_code
+    _HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header)
+def get_header(src_code):  # -> stored (spike_header, gem5_header) tuple, or None if absent
+    return _HEADER_BY_HASH.get(get_hash(src_code.strip()))
+
+
 def get_lock_path(write_path):
     """Return lock file path for the given write_path (per-source_code lock)."""
     return os.path.join(write_path, ".compile.lock")
@@ -128,40 +135,52 @@ def load(cls, source_code,
         vlen = kwargs['vlen']
         vlenb = vlen // 8
         write_path = get_write_path(source_code)
-        key, input_path = write(source_code, "mlir", specified_dir=write_path)
-        # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
-        # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
-        # (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
+        os.makedirs(write_path, exist_ok=True)
+        global_var_header = kwargs.get("global_var_header")
+        if global_var_header is not None:
+            write_atomic(os.path.join(write_path, "global_var.h"), global_var_header)
+        gem5_global_var_header = kwargs.get("gem5_global_var_header")
+        if gem5_global_var_header is not None:
+            write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header)
+        # The compile rewrites the kernel .mlir in place (run_python_passes) and reads
+        # it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
+        # candidate and the final kernel -- share a write_path, so hold the per-path
+        # lock across the whole build to keep them from interleaving, and skip the
+        # rebuild when a prior build already finished (its tile_graph.onnx exists).
+        from filelock import FileLock
         from PyTorchSimFrontend.mlir.passes import (
             run_python_passes, run_module_passes, POST_OPT_PASSES,
             run_standard_lowering, run_tog,
         )
-        run_python_passes(input_path, vectorlane=vectorlane_size)
-        new_input_path = os.path.splitext(input_path)[0]
-        raw_tog_path = new_input_path + "_tog.py"
         tog_path = os.path.join(write_path, "tile_graph.onnx")
-        sample_mlir_path = new_input_path + "_sample"
-        validation_binary_path = os.path.join(write_path, validation_binary_name)
-        gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
-
-        from filelock import FileLock
-        os.makedirs(write_path, exist_ok=True)
         lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
-
-        if spad_info is not None:
-            link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
-        else:
-            link_option = ""
-        # Generate LLVM kernel calller and binary for validation
-        if extension_config.pytorchsim_functional_mode:
-            # Use custom malloc to avoid size error
-            new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
-            cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
-            opt_pad_cmd = shlex.split(cmds[0])
-            translate_cmd = shlex.split(cmds[1])
-            llc_cmd = shlex.split(cmds[2])
-            llc_asm_cmd = shlex.split(cmds[3])
-            with lock:
+        with lock:
+            key, input_path = write(source_code, "mlir", specified_dir=write_path)
+            if os.path.isfile(tog_path):
+                return key
+            # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
+            # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
+            # (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
+            run_python_passes(input_path, vectorlane=vectorlane_size)
+            new_input_path = os.path.splitext(input_path)[0]
+            raw_tog_path = new_input_path + "_tog.py"
+            sample_mlir_path = new_input_path + "_sample"
+            validation_binary_path = os.path.join(write_path, validation_binary_name)
+            gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
+
+            if spad_info is not None:
+                link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
+            else:
+                link_option = ""
+            # Generate LLVM kernel caller and binary for validation
+            if extension_config.pytorchsim_functional_mode:
+                # Use custom malloc to avoid size error
+                new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
+                cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
+                opt_pad_cmd = shlex.split(cmds[0])
+                translate_cmd = shlex.split(cmds[1])
+                llc_cmd = shlex.split(cmds[2])
+                llc_asm_cmd = shlex.split(cmds[3])
                 try:
                     # loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
                     subprocess.check_call(opt_pad_cmd)
@@ -195,17 +214,11 @@ def load(cls, source_code,
                     )
                     raise SpadOverflowError()
 
-        # Skip if TOG file already exists
-        if os.path.isfile(tog_path):
-            return key
-
-        # Launch tile graph generator
-        gem5_pad_cmd = shlex.split(gem5_cmds[0])
-        gem5_translate_cmd = shlex.split(gem5_cmds[1])
-        gem5_llc_cmd = shlex.split(gem5_cmds[2])
+            # Launch tile graph generator
+            gem5_pad_cmd = shlex.split(gem5_cmds[0])
+            gem5_translate_cmd = shlex.split(gem5_cmds[1])
+            gem5_llc_cmd = shlex.split(gem5_cmds[2])
 
-        lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
-        with lock:
             try:
                 # mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix
                 # and writes the post-vcix IR. The tile-operation-graph pass is ported
@@ -241,8 +254,19 @@ def load(cls, source_code,
             # Run cyclesim
             cyclesim = CycleSimulator()
             cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
+            # Snapshot for the P3-trace hook below: generate_tile_graph consumes
+            # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
+            cycle_list_for_trace = list(cycle_list)
 
             # Create TOG
+            # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
+            # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
+            # is being superseded by the C++ trace pipeline (build_skeleton +
+            # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
+            # per-tile cycle_list / x_offset / w_offset computed here are exactly
+            # what cycle_table.build_cycle_table will reuse, so both paths stay
+            # cycle-consistent during the transition. Kept live (pipeline must not
+            # break); to be retired once the trace pipeline (P3+) stabilizes.
             w_offset, x_offset = vectorlane_size, vectorlane_size
             if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
                 x_offset = kwargs['loop_size'][-3]
@@ -258,6 +282,33 @@ def load(cls, source_code,
                 w_offset=w_offset, # FIXME.
                 vector_lane=vectorlane_size
             )
+
+            # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
+            # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
+            # is the default simulation path (the C++ TOG); the legacy ONNX TOG is
+            # DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the
+            # .so is unused so skip emitting it. Best-effort: never breaks the compile.
+            if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
+                try:
+                    import mlir.ir as ir
+                    from PyTorchSimFrontend.mlir.passes import (
+                        build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
+                    pv = sample_mlir_path + "_postvcix.mlir"
+                    _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
+                    with open(pv) as _pv_file, _ctx:  # file handle is closed on exit
+                        _mod = ir.Module.parse(_pv_file.read(), _ctx)
+                        _bs.build_skeleton(_mod)
+                        _ntiles = len(_ct._compute_types(_mod))
+                        # align lengths: gem5 gives one numCycles per compute node;
+                        # pad with the last value / truncate if it disagrees.
+                        _cl = list(cycle_list_for_trace)
+                        if _cl and len(_cl) != _ntiles:
+                            _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
+                        _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
+                    _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
+                    _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
+                except Exception as e:
+                    logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
         return key
 
 class CustomAsyncCompile(AsyncCompile):
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index 396396f3..e4876b5b 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -54,7 +54,7 @@ def __str__(self) -> str:
     def make_run_fn(
         self, input_tensors: torch.Tensor, output_tensors: torch.Tensor
     ) -> Callable[[], None]:
-        from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile
+        from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile, get_header
         custom_async_compile = CustomAsyncCompile()
 
         # Check already cached result.
@@ -80,12 +80,15 @@ def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs):
                 return cached_run_fn
 
         # Run a candidate code
+        _headers = get_header(self.source_code)
+        _header_kwargs = {} if _headers is None else {
+            "global_var_header": _headers[0], "gem5_global_var_header": _headers[1]}
         run_method = custom_async_compile.mlir(
             self.source_code, vectorlane_size=self.extra_args["vector_lane"],
             loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"],
             vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
             origins=self.extra_args["origins"], silent_mode=True,
-            autotune=self.extra_args['autotune'])
+            autotune=self.extra_args['autotune'], **_header_kwargs)
 
         args = [
             tensor
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 725e0dc6..8f695395 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -17,7 +17,6 @@
 from torch._inductor.codegen import cpp, wrapper, common, memory_planning
 from torch._inductor.ir import GraphPartitionSignature
 from torch._inductor.virtualized import V, _ops as ops
-from torch._inductor.codecache import write_atomic
 from torch._inductor.utils import (
     IndentedBuffer,
     is_welford_reduction,
@@ -1120,28 +1119,23 @@ def codegen_nodes(self, nodes, kernel_name):
         src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
         self._prepare_simulator_headers(src_code)
         if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode:
-            optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2]
+            # Use temporaries: autotune returns [None, None, None] when it cannot
+            # autotune (e.g. a size-1 pointwise kernel with ranges == [1]), and
+            # unpacking into meta_code would clobber the valid arg_attributes that
+            # the fall-through below returns.
+            optimal_src_code, optimal_meta_code = self.autotune(nodes, kernel_name)[:2]
             if optimal_src_code is not None:
-                return optimal_src_code, meta_code
+                return optimal_src_code, optimal_meta_code
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
-        from filelock import FileLock
-
-        write_path = extension_codecache.get_write_path(src_code)
-        os.makedirs(write_path, exist_ok=True)
-
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-
         spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n"
         spad_section_end_symbol = (
             f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
         )
-        lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
-        with lock:
-            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
-            write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        spike_content = self.header.getvalue() + spad_end_symbol + spad_section_end_symbol
+        gem5_content = self.gem5_header.getvalue()
+        extension_codecache.store_header(src_code, spike_content, gem5_content)
 
     def get_arg_info(self, name):
         arg_info = dict()
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 41ec61af..8520596c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -5,6 +5,7 @@
 import operator
 from sympy import symbols, sympify
 from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend import extension_codecache
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from torch.utils._ordered_set import OrderedSet
@@ -333,6 +334,10 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info
             codecache_def.writeline(f"spad_info={spad_info},")
             codecache_def.writeline(f"origins={origins},")
             codecache_def.writeline(f"arg_attributes={meta_code},")
+            headers = extension_codecache.get_header(src_code)
+            if headers is not None:  # !r emits a valid literal even with quotes/backslashes
+                codecache_def.writeline(f"global_var_header={headers[0]!r},")
+                codecache_def.writeline(f"gem5_global_var_header={headers[1]!r},")
             codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})")
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False)
         return kernel_name
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 529a49b5..2b8a0676 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -21,7 +21,6 @@
 from torch._inductor.autotune_process import TensorMeta
 from torch._inductor.virtualized import V, NullHandler, _ops as ops
 from torch._inductor.utils import IndentedBuffer
-from torch._inductor.codecache import write_atomic
 
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
@@ -613,22 +612,11 @@ def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes,
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
-        from filelock import FileLock
-
         spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
         spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
-
-        write_path = extension_codecache.get_write_path(src_code)
-        os.makedirs(write_path, exist_ok=True)
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-
-        lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
-        with lock:
-            if not os.path.exists(spike_write_path):
-                write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol)
-            if not os.path.exists(gem5_write_path):
-                write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        spike_content = self.header.getvalue()+spad_end_symbol+spad_section_end_symbol
+        gem5_content = self.gem5_header.getvalue()
+        extension_codecache.store_header(src_code, spike_content, gem5_content)
 
     def codegen_prologue_body(self):
         body = IndentedBuffer()
diff --git a/PyTorchSimFrontend/mlir/passes/__init__.py b/PyTorchSimFrontend/mlir/passes/__init__.py
index 82cadc2f..ab3cdcd3 100644
--- a/PyTorchSimFrontend/mlir/passes/__init__.py
+++ b/PyTorchSimFrontend/mlir/passes/__init__.py
@@ -76,8 +76,12 @@ def run_module_passes(in_path, out_path, passes, **opts):
             p.run(module, **opts)
         out = str(module)
 
-    with open(out_path, "w") as f:
-        f.write(out)
+    # Atomic write: run_python_passes rewrites the kernel .mlir in place outside
+    # load()'s FileLock, so a concurrent compile of the same source must never see a
+    # truncated file -- mlir-opt would parse it to an empty module and silently drop
+    # the kernel (-> undefined reference to wrapper_kernel at link).
+    from torch._inductor.codecache import write_atomic
+    write_atomic(out_path, out)
     return True
 
 
diff --git a/PyTorchSimFrontend/mlir/passes/_mlir_util.py b/PyTorchSimFrontend/mlir/passes/_mlir_util.py
new file mode 100644
index 00000000..e39f9d6f
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/_mlir_util.py
@@ -0,0 +1,87 @@
+"""Small, dependency-light helpers shared across the MLIR passes.
+
+Every pass had its own copy of the same op-walk generator (named variously
+`_iter_ops` / `_walk` / `_walk_ops`) and the same one-line attribute builders
+(`_i32` / `_i64` / ...). This module is the single source for both.
+
+Import-safety: `walk_ops` is pure block/op attribute access and needs no MLIR
+bindings, so this module does NOT import `mlir.ir` at top level -- some passes
+(e.g. lower_vlane_idx, decompose_transfer) are deliberately importable without
+the bindings present and only touch `mlir.ir` inside their run functions. The
+attribute builders therefore import `mlir.ir` lazily; they require an active
+MLIR context (the caller's `with ctx:`), exactly as the per-pass copies did.
+"""
+
+
+def walk_ops(block):
+    """Generate each op beneath `block` in program order, descending into regions.
+
+    Every block's operation list is copied before it is walked, so a caller may
+    erase ops while iterating."""
+    for current in list(block.operations):
+        yield current
+        nested = (b for region in current.operation.regions for b in region.blocks)
+        for inner in nested:
+            yield from walk_ops(inner)
+
+
+def _ir():
+    from mlir import ir as _bindings  # deferred: importing this module needs no MLIR
+    return _bindings
+
+
+def i32(v):
+    """Build a signless 32-bit IntegerAttr from `int(v)` in the active MLIR context."""
+    mlir = _ir()
+    return mlir.IntegerAttr.get(mlir.IntegerType.get_signless(32), int(v))
+
+
+def i64(v):
+    """Build a signless 64-bit IntegerAttr from `int(v)`."""
+    mlir = _ir()
+    return mlir.IntegerAttr.get(mlir.IntegerType.get_signless(64), int(v))
+
+
+def i64_array(vals):
+    """Build an ArrayAttr holding one `i64` IntegerAttr per item of `vals`."""
+    mlir = _ir()
+    elem_type = mlir.IntegerType.get_signless(64)
+    return mlir.ArrayAttr.get([mlir.IntegerAttr.get(elem_type, int(item)) for item in vals])
+
+
+def str_attr(v):
+    """Build a StringAttr whose text is `str(v)`."""
+    mlir = _ir()
+    return mlir.StringAttr.get(str(v))
+
+
+# ---------------------------------------------------------------------------
+# attribute readers -- accept an OpView or an Operation; `default` is returned
+# when `key` is absent (callers that want the strict "must be present" behaviour
+# simply never pass an absent key).
+# ---------------------------------------------------------------------------
+def _attrs(op):  # attribute map of `op`; goes through `.operation` when `op` has one
+    return getattr(op, "operation", op).attributes
+
+
+def attr_int(op, key, default=None):
+    """Return the integer stored in attribute `key` of `op`; `default` when missing."""
+    mlir = _ir()
+    attributes = _attrs(op)
+    return default if key not in attributes else mlir.IntegerAttr(attributes[key]).value
+
+
+def attr_bool(op, key, default=False):
+    """Return attribute `key` of `op` as a Python bool; `default` when missing."""
+    mlir = _ir()
+    attributes = _attrs(op)
+    return default if key not in attributes else bool(mlir.BoolAttr(attributes[key]).value)
+
+
+def attr_i64_array(op, key, default=None):
+    """Return attribute `key` of `op` (an ArrayAttr of integers) as a Python list;
+    `default` when the key is missing (use `default=[]` for "missing -> empty")."""
+    mlir = _ir()
+    attributes = _attrs(op)
+    return (default if key not in attributes else
+            [mlir.IntegerAttr(item).value for item in mlir.ArrayAttr(attributes[key])])
diff --git a/PyTorchSimFrontend/mlir/passes/build_skeleton.py b/PyTorchSimFrontend/mlir/passes/build_skeleton.py
new file mode 100644
index 00000000..4c3d89cb
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/build_skeleton.py
@@ -0,0 +1,566 @@
+"""build_skeleton pass (C2): reduce a kernel's post-vcix MLIR to the
+*skeleton + API* form, in place.
+
+The trace pipeline (docs/design/togsim_cpp_trace.md) compiles a kernel to a
+shape-parametric C++ trace producer. The producer is just the kernel's loop
+skeleton with the data computation replaced by calls to the event-based runtime
+API. This pass performs that reduction at the MLIR level:
+
+  * `memref.dma_start`  -> `togsim.dma(...) {tag_id, is_async, ...}` carrying the
+                            runtime tag index operand (`%tag[%idx]`).
+  * `memref.dma_wait`   -> `togsim.memory_barrier(tag_idx) {tag_id, write_bufs}`,
+                            the explicit async-DMA sync. It pairs with its dma by
+                            the RUNTIME tag slot (tag_id + the tag index), not a
+                            compile-time id: one static dma op runs once per loop
+                            iteration with a different `%tag[%idx]`, so only the
+                            runtime slot can pair iteration i's dma with its wait.
+  * each compute node   -> a single `togsim.compute {tile_id, compute_type}`
+  * everything else      -> removed by a use-based DCE, keeping the loops and the
+                            index/address arithmetic the survivors depend on.
+
+It reuses build_tog's traversal (`TogBuilder` / `_build`): loops, DMAs and
+compute blocks are already identified there, each with a back-pointer to its
+MLIR op(s), so this pass only adds the *rewrite*. Keeping a single traversal
+guarantees the skeleton and the legacy TOG see the same structure.
+
+Counterpart to `build_tog.build_tog_and_mutate`.
+
+The DCE is safe by construction: it never erases an op whose results still have
+uses, so at worst it leaves extra ops in the dump (visible for diagnosis) rather
+than producing invalid IR.
+
+Requires the MLIR Python bindings (importing `build_tog` pulls in `mlir.ir`).
+"""
+
+from . import togsim_ops as ts
+from ._mlir_util import walk_ops, i32, i64, i64_array, str_attr
+from .build_tog import (
+    ir,
+    TogBuilder,
+    _build,
+    _reset_ids,
+    _find_kernel,
+    _value_key,
+    TOGDMANode,
+    TOGDMAWaitNode,
+    _COMPUTE_TYPE_NAME,
+)
+
+#: Marker op names for the passes/__init__ fast-path (skip parsing if absent).
+MARKERS = ("memref.dma_start", "memref.dma_wait")
+
+#: Ops the DCE must never remove (loops, terminators, our API ops).
+_KEEP = {
+    "affine.for", "scf.for", "scf.while",
+    "affine.yield", "scf.yield", "func.return",
+    ts.DMA, ts.COMPUTE, ts.MEMORY_BAR,
+}
+
+
+def _kernel_block(module):
+    """Entry block of the func returned by `_find_kernel(module)`, or None without one."""
+    kernel = _find_kernel(module)
+    # regions[0].blocks[0]: first block of the function's first region.
+    return None if kernel is None else kernel.regions[0].blocks[0]
+
+
+# ---------------------------------------------------------------------------
+# op construction
+# ---------------------------------------------------------------------------
+def _arg_id_of(base_addr):
+    """Tensor func-arg ordinal from a build_tog base name ("arg3" -> 3); -1 unless it
+    is "arg" + decimal digits (isdecimal rejects e.g. "²", which int() cannot parse)."""
+    s = str(base_addr)
+    return int(s[3:]) if s.startswith("arg") and s[3:].isdecimal() else -1
+
+
+def _emit_dma(ctx, dma_node, tag_id, dram_index, tag_index, read_bufs, write_bufs):
+    """Create a `togsim.dma` op immediately ahead of the node's `memref.dma_start`.
+
+    `tag_id` names the tag memref this DMA uses. An async DMA is matched with its
+    `togsim.memory_barrier` (the former dma_wait) through the RUNTIME tag slot
+    -- (tag_id, tag_index) -- rather than a compile-time identifier, because a
+    single static dma op executes once per loop iteration with a different
+    runtime `%tag[%idx]` slot each time.
+
+    `dram_index` is the linear DRAM index Value of the original
+    `memref.dma_start` (the `affine.apply` result indexing the tensor). Passing
+    it as an operand keeps the address arithmetic alive through the DCE so the C4
+    lowering can form `base_addr = base[arg_id] + index*elem` (P3, approach A).
+
+    `tag_index` is the original SRAM tag index Value (`%tag[%idx]`), the second
+    operand: the runtime tag slot used for barrier pairing and for the
+    double-buffer / SRAM-capacity (WAR) model. `read_bufs` / `write_bufs` become
+    the op's buffer-list attributes. Operands: [dram_index, tag_index], None skipped."""
+    start_op = dma_node.op
+    direction = ts.DIR_STORE if dma_node.is_write else ts.DIR_LOAD
+    attributes = {
+        ts.ATTR_DIR: i32(direction),
+        ts.ATTR_DIMS: i64_array(dma_node.tile_size),
+        ts.ATTR_STRIDES: i64_array(dma_node.tile_stride),
+        ts.ATTR_ELEM_BITS: i32(dma_node.element_size),
+        ts.ATTR_IS_ASYNC: ir.BoolAttr.get(bool(dma_node.is_async)),
+        ts.ATTR_TAG_ID: i32(tag_id),
+        ts.ATTR_ARG_ID: i32(_arg_id_of(dma_node.base_addr)),
+        "base": str_attr(dma_node.base_addr),
+        # SRAM spad this DMA touches (load writes it, store reads it) -- sec 10.
+        ts.ATTR_READ_BUFS: i64_array(read_bufs),
+        ts.ATTR_WRITE_BUFS: i64_array(write_bufs),
+    }
+    live_operands = [value for value in (dram_index, tag_index) if value is not None]
+    ir.Operation.create(
+        ts.DMA,
+        results=[],
+        operands=live_operands,
+        attributes=attributes,
+        loc=ir.Location.unknown(ctx),
+        ip=ir.InsertionPoint(start_op))
+
+
+def _emit_memory_bar(ctx, anchor_op, tag_id, tag_index, write_bufs):
+    """Create a `togsim.memory_barrier` ahead of `anchor_op`, standing in for the
+    original `memref.dma_wait` as the explicit async-DMA sync. The barrier is
+    matched to its async `togsim.dma` through the RUNTIME tag slot (tag_id +
+    tag_index) and records the SRAM buffer that dma loaded, so consumers gate on
+    data arrival rather than on the async dma's issue-complete."""
+    barrier_attrs = {ts.ATTR_TAG_ID: i32(tag_id), ts.ATTR_WRITE_BUFS: i64_array(write_bufs)}
+    barrier_operands = [] if tag_index is None else [tag_index]
+    ir.Operation.create(
+        ts.MEMORY_BAR, results=[],
+        operands=barrier_operands,
+        attributes=barrier_attrs,
+        loc=ir.Location.unknown(ctx),
+        ip=ir.InsertionPoint(anchor_op))
+
+
+def _flatten_add(expr):
+    """List the top-level additive summands of an AffineExpr, left to right.
+    `.lhs`/`.rhs` come back typed as the base AffineExpr, hence the bindings'
+    `isinstance`/cast pattern instead of Python isinstance."""
+    if not ir.AffineAddExpr.isinstance(expr):
+        return [expr]
+    add = ir.AffineAddExpr(expr)
+    return [*_flatten_add(add.lhs), *_flatten_add(add.rhs)]
+
+
+def _neg_coeff_dim(summand):
+    """Position of the dim in `summand` when it is `dim * c` with a negative
+    constant `c`; None otherwise. lower_to_vcix tags each accumulation (reduction)
+    loop var with coefficient -1 in the dma_wait tag index -- a SENTINEL marking
+    the reduction axis, not an arithmetic offset (legacy TileGraphParser skips
+    stride -1 for the same reason)."""
+    if not ir.AffineMulExpr.isinstance(summand):
+        return None
+    product = ir.AffineMulExpr(summand)
+    factors = (product.lhs, product.rhs)
+    dim = next((f for f in factors if ir.AffineDimExpr.isinstance(f)), None)
+    const = next((f for f in factors if ir.AffineConstantExpr.isinstance(f)), None)
+    if dim is not None and const is not None and ir.AffineConstantExpr(const).value < 0:
+        return ir.AffineDimExpr(dim).position
+    return None
+
+
+def _strip_accum_terms(ctx, tag_index, anchor_op):
+    """Return a tag-index Value with the accumulation-marked (-1 coefficient) terms
+    dropped, so a memory_barrier waits on the SAME subtile slot its async load
+    wrote.
+
+    The wait tag index built by lower_to_vcix carries `-acc_iv` for each reduction
+    loop var; the matching load index (dma_fine_grained) is subtile-only. Without
+    this, at reduction iteration > 0 the producer EVALUATES `-acc_iv` to a negative
+    slot, so the recorded barrier slot diverges from the load slot and the runtime
+    tag pairing fails (TOGSim aborts with "Key does not exist in ... tag table").
+    Dropping the -1 terms mirrors legacy TileGraphParser.cc, which skips stride -1
+    and routes the reduction axis to a separate accum tag component; here the
+    per-iteration tag alloc (dma_fine_grained) already separates the reductions, so
+    the barrier only needs the subtile slot.
+
+    Falls through (returns `tag_index` unchanged) for anything that is not an
+    affine.apply whose single result carries such a term -- e.g. the single-tile
+    case, whose index has no reduction term."""
+    if tag_index is None:  # no tag operand to rewrite
+        return None
+    try:
+        apply_op = tag_index.owner
+        if apply_op.name != "affine.apply":
+            return tag_index
+        amap = ir.AffineMapAttr(apply_op.attributes["map"]).value
+    except Exception:  # best-effort: any failure inspecting the producer keeps the index
+        return tag_index
+    if amap.n_dims == 0 or amap.n_symbols != 0 or len(amap.results) != 1:  # dim-only, 1 result
+        return tag_index
+    expr = amap.results[0]
+    dropped = sorted({p for p in (_neg_coeff_dim(s) for s in _flatten_add(expr))
+                      if p is not None})  # dim positions with a negative constant coefficient
+    if not dropped:  # no marked term: the original index is returned as-is
+        return tag_index
+    n = amap.n_dims
+    kept = [i for i in range(n) if i not in dropped]
+    new_pos = {old: i for i, old in enumerate(kept)}
+    # compose the original expr with a selector that sends each dropped dim to 0
+    # and renumbers the kept dims 0..k-1.
+    sel = [ir.AffineConstantExpr.get(0) if i in dropped
+           else ir.AffineDimExpr.get(new_pos[i]) for i in range(n)]
+    new_expr = expr.compose(ir.AffineMap.get(len(kept), 0, sel))
+    new_map = ir.AffineMap.get(len(kept), 0, [new_expr])
+    operands = list(apply_op.operands)
+    new_operands = [operands[i] for i in kept]  # operands of the surviving dims only
+    new_apply = ir.Operation.create(
+        "affine.apply",
+        results=[ir.IndexType.get(ctx)],
+        operands=new_operands,
+        attributes={"map": ir.AffineMapAttr.get(new_map)},
+        loc=ir.Location.unknown(ctx),
+        ip=ir.InsertionPoint(anchor_op),  # the new apply is created right before anchor_op
+    )
+    return new_apply.results[0]
+
+
+def _emit_compute(ctx, compute_node, tile_id, read_bufs, write_bufs):
+    """Insert one result-less, operand-less togsim.compute marker in front of the
+    node's first op. The marker carries the tile_id, the compute type (int code
+    plus readable name) and the ids of the SRAM buffers read/written."""
+    anchor = compute_node.operations[0]
+    kind = compute_node.compute_type
+    marker_attrs = {}
+    marker_attrs[ts.ATTR_TILE_ID] = i64(tile_id)
+    # int code (0 vector / 1 matmul / 2 preload) consumed by the C4 lowering;
+    # maps directly to the Core compute-unit enum. The readable name rides along.
+    marker_attrs[ts.ATTR_COMPUTE_TYPE] = i32(int(kind))
+    marker_attrs["compute_type_name"] = str_attr(_COMPUTE_TYPE_NAME[kind])
+    # SRAM buffer ids read/written (sec 10 dataflow); the bridge builds the
+    # dependency DAG by last-writer per buffer.
+    marker_attrs[ts.ATTR_READ_BUFS] = i64_array(read_bufs)
+    marker_attrs[ts.ATTR_WRITE_BUFS] = i64_array(write_bufs)
+    ir.Operation.create(
+        ts.COMPUTE, results=[], operands=[], attributes=marker_attrs,
+        loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(anchor),
+    )
+
+
+
+# ---------------------------------------------------------------------------
+# DCE
+# ---------------------------------------------------------------------------
+def _has_nonempty_region(op):
+    """True iff some block in some region of `op` holds at least one operation."""
+    return any(
+        len(list(blk.operations)) > 0
+        for reg in op.operation.regions for blk in reg.blocks
+    )
+
+
+def _results_unused(op):
+    """True iff no result of `op` has a use (vacuously True for a result-less op)."""
+    return not any(
+        len(list(res.uses)) > 0 for res in op.operation.results
+    )
+
+
+def _strip_loop_iter_args(block):
+    """Remove the loop-carried values (iter_args) of affine.for/scf.for loops.
+
+    The skeleton needs only the loop STRUCTURE (iteration counts) and the
+    togsim.* markers, not the data flowing through the loop. Reduction kernels
+    carry a *vector* accumulator as an iter_arg, which EmitC/C++ cannot represent
+    as a loop-carried value, so the trace .so emission fails. The trace is
+    timing-only (values come from the recorded run), so each such loop is rebuilt
+    without iter_args: body uses of an iter_arg become its init value, the loop
+    result becomes its init, and the orphaned accumulate ops are left for _dce.
+    """
+    # Only a loop whose RESULTS are unused (dead for the trace) is stripped: its
+    # carried value goes nowhere live, so dropping it is safe. A loop whose result
+    # still feeds a kept op (e.g. an index accumulator consumed by a togsim.dma
+    # address) is left untouched. Run after _dce so the result store is already
+    # gone; nested reductions then free inner results round by round (outer first).
+    def _strippable(op):
+        o = op.operation
+        return (o.name in ("affine.for", "scf.for") and len(o.results) > 0
+                and _results_unused(op))
+
+    # re-walk from scratch after each rebuild: the rewrite erases the loop op
+    while True:
+        victim = next((op for op in walk_ops(block) if _strippable(op)), None)
+        if victim is None:
+            return
+        _rebuild_loop_no_iter(victim)
+
+
+def _rebuild_loop_no_iter(op):
+    """Replace a result-carrying affine.for/scf.for by a copy without iter_args:
+    every iter_arg use and loop-result use is rewired to the matching init
+    operand, the body (minus the old yield) is moved over, the old op erased."""
+    old = op.operation
+    every_operand = list(old.operands)
+    n_bound = len(every_operand) - len(old.results)   # trailing operands are inits
+    bound_operands, inits = every_operand[:n_bound], every_operand[n_bound:]
+    old_body = old.regions[0].blocks[0]
+    old_args = list(old_body.arguments)  # [iv, *iter_args]
+
+    attrs = {named.name: named.attr for named in old.attributes}
+    # affine.for tags its operand groups; zero the iter-arg group (last entry).
+    if "operandSegmentSizes" in attrs:
+        sizes_text = str(attrs["operandSegmentSizes"]).split(":")[1].strip(" >")
+        sizes = [int(tok) for tok in sizes_text.split(",")]
+        sizes[-1] = 0
+        attrs["operandSegmentSizes"] = ir.Attribute.parse(
+            "array<i32: " + ", ".join(str(s) for s in sizes) + ">")
+    loc = ir.Location.unknown(old.context)
+    with loc:                                           # default loc for new block args
+        fresh = ir.Operation.create(old.name, results=[], operands=bound_operands,
+                                    attributes=attrs, regions=1, loc=loc,
+                                    ip=ir.InsertionPoint(old))
+        body = fresh.regions[0].blocks.append(old_args[0].type)  # iv arg only
+        old_args[0].replace_all_uses_with(body.arguments[0])     # iv
+        for carried, init in zip(old_args[1:], inits):           # iter-arg uses -> init
+            carried.replace_all_uses_with(init)
+        for result, init in zip(old.results, inits):             # loop result -> init
+            result.replace_all_uses_with(init)
+        yield_name = "affine.yield" if old.name == "affine.for" else "scf.yield"
+        with ir.InsertionPoint(body):
+            ir.Operation.create(yield_name, results=[], operands=[], loc=loc)
+        terminator = list(body.operations)[0]
+        for moved in list(old_body.operations)[:-1]:             # move body (drop old yield)
+            moved.operation.move_before(terminator)
+        old.erase()
+
+
+def _dce(block):
+    """Repeatedly erase dead ops under `block` until a sweep erases nothing.
+
+    An op is dead when its name is not in _KEEP, it has no non-empty region, and
+    none of its results is used -- so an op with live SSA uses is never touched."""
+    while True:
+        doomed = [
+            op for op in walk_ops(block)
+            if op.operation.name not in _KEEP
+            and not _has_nonempty_region(op)
+            and _results_unused(op)
+        ]
+        erased_any = False
+        for op in doomed:
+            try:
+                op.operation.erase()
+            except Exception:
+                # Still referenced via something we will erase next round; retry.
+                continue
+            erased_any = True
+        if not erased_any:
+            return
+
+
+# ---------------------------------------------------------------------------
+# driver
+# ---------------------------------------------------------------------------
+def _collect_dma_nodes(builder):
+    """Walk the built node tree and map id(op.operation) -> its DMA/DMAWait node."""
+    by_op = {}
+    visited = set()
+    # explicit-stack pre-order DFS; children are pushed reversed so they pop in
+    # their original order (same visit order as a recursive walk).
+    for root in builder.loop_nodes:
+        pending = [root]
+        while pending:
+            node = pending.pop()
+            if id(node) in visited:
+                continue
+            visited.add(id(node))
+            if isinstance(node, (TOGDMANode, TOGDMAWaitNode)) and node.op is not None:
+                by_op[id(node.op.operation)] = node
+            pending.extend(list(node.children)[::-1])
+    return by_op
+
+
+class _BufferIds:
+    """Hands out a stable small int id per SRAM buffer name, shared by DMA and
+    compute so the bridge can match a reader to its buffer's writer (sec 10).
+    Ids are dense and assigned on first sight -- the virtual SA_WEIGHTS buffer
+    (preload -> matmul) included. `None` (a non-buffer base) maps to -1."""
+
+    def __init__(self):
+        self._ids = {}
+
+    def of(self, name):
+        if name is not None and name not in self._ids:
+            self._ids[name] = len(self._ids)
+        return -1 if name is None else self._ids[name]
+
+
+class _TagIds:
+    """Maps the identity of a DMA's tag memref to a stable small int, and to the
+    SRAM buffer id bound with it. An async dma and its memory_barrier (the
+    original dma_wait) share a tag memref; binding it yields a tag_id (so the
+    runtime can pair them by the runtime tag slot) and records the buffer so the
+    barrier can release it to consumers. Pairing is by tag, never a static id."""
+
+    def __init__(self):
+        self._ids = {}   # tag value-key -> tag_id
+        self._buf = {}   # tag value-key -> SRAM buffer id the dma loads
+
+    def bind(self, key, buf):
+        if key not in self._ids:
+            self._ids[key] = len(self._ids)
+        self._buf[key] = buf     # a later bind of the same key replaces the buffer
+        return self._ids[key]
+
+    def lookup(self, key):
+        """(tag_id, buffer) for a tag memref, or None if no dma used it."""
+        tag_id = self._ids.get(key)
+        return None if tag_id is None else (tag_id, self._buf[key])
+
+
+def _emit_computes(ctx, builder, bufs):
+    """Step 1: one togsim.compute per non-empty compute node, carrying its tile_id (its
+    index in builder.compute_nodes) and its read/written SRAM buffer ids. Returns the count."""
+    from . import dep_analysis as dep  # lazy: dep_analysis imports build_skeleton
+    n = 0
+    for tile_id, cn in enumerate(builder.compute_nodes):
+        if not cn.operations:
+            continue
+        reads, writes = dep.compute_buffers(cn)  # sets: walk sorted, bufs.of() numbers on first sight
+        _emit_compute(ctx, cn, tile_id,
+                      sorted(bufs.of(b) for b in sorted(reads)),
+                      sorted(bufs.of(b) for b in sorted(writes)))
+        n += 1
+    return n
+
+
+def _emit_one_dma(ctx, op, node, builder, bufs, tags):
+    """Rewrite one memref.dma_start as togsim.dma. A load reads DRAM and writes
+    its SRAM spad, a store reads the spad and writes DRAM; that choice sets the
+    read/write buffer driving the dependency edge (sec 10). The tag memref is
+    bound to a tag_id (with the spad buffer id) for the paired memory_barrier."""
+    from . import dep_analysis as dep  # lazy: dep_analysis imports build_skeleton
+    fields = builder._dma_start_fields(op)
+    # the spad is the SRAM side of the copy: dst for a load, src for a store.
+    if node.is_write:
+        spad_val, dram_indices = fields["src"], fields["dst_indices"]
+    else:
+        spad_val, dram_indices = fields["dst"], fields["src_indices"]
+    tag_indices = fields["tag_indices"]
+    spad_id = bufs.of(dep._global_of(spad_val))
+    read_bufs, write_bufs = ([spad_id], []) if node.is_write else ([], [spad_id])
+    tag_id = tags.bind(_value_key(fields["tag"]), spad_id)
+    _emit_dma(ctx, node, tag_id, dram_indices[0] if dram_indices else None,
+              tag_indices[0] if tag_indices else None, read_bufs, write_bufs)
+
+
+def _emit_one_wait(ctx, op, tags):
+    """Rewrite one memref.dma_wait as togsim.memory_barrier -- the explicit
+    async-DMA sync already in the IR -- paired with its dma through the tag memref
+    (tag_id) and the runtime tag index, and carrying the buffer bound to that tag.
+    Returns True iff emitted (a wait whose tag no dma used is dropped)."""
+    wait_operands = list(op.operation.operands)
+    binding = tags.lookup(_value_key(wait_operands[0]))
+    if binding is None:
+        return False
+    tag_id, buf = binding
+    # NOTE(review): operand 1 is taken as the tag index; confirm no wait reaches
+    # here with only (tag, num_elements), where it would be the element count.
+    raw_index = wait_operands[1] if len(wait_operands) >= 2 else None
+    # honor lower_to_vcix's -1 accumulation marker: strip the reduction terms so
+    # the barrier slot equals the subtile slot the paired async load wrote.
+    _emit_memory_bar(ctx, op, tag_id, _strip_accum_terms(ctx, raw_index, op), [buf])
+    return True
+
+
+def _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs):
+    """Step 2: rewrite memref.dma_start -> togsim.dma and memref.dma_wait ->
+    togsim.memory_barrier, visiting the ops in program order. An async dma and
+    its barrier are paired by the RUNTIME tag slot (tag_id + tag index), not by a
+    compile-time id: one static dma op runs per loop iteration with a different
+    `%tag[%idx]`, so only the runtime slot can pair iteration i's dma with
+    iteration i's wait. Returns (originals to erase, #dma emitted, #wait emitted);
+    every wait is listed for erasure, emitted or not."""
+    tags = _TagIds()
+    replaced = []
+    n_dma = n_wait = 0
+    for op in list(walk_ops(block)):
+        kind = op.operation.name
+        if kind == "memref.dma_wait":
+            if _emit_one_wait(ctx, op, tags):
+                n_wait += 1
+            replaced.append(op)
+        elif kind == "memref.dma_start":
+            node = dma_by_op.get(id(op.operation))
+            if node is not None:      # a dma_start without a TOG node stays in place
+                _emit_one_dma(ctx, op, node, builder, bufs, tags)
+                replaced.append(op)
+                n_dma += 1
+    return replaced, n_dma, n_wait
+
+
+def build_skeleton(module):
+    """Reduce `func.func @kernel` in `module` to the skeleton+API form, in place.
+
+    Four steps: analyze the kernel into loop/compute/DMA nodes, emit a
+    togsim.compute per compute node, rewrite memref.dma_start/dma_wait to
+    togsim.dma/togsim.memory_barrier, then DCE the leftover data computation.
+    Returns a short text report (counts), or "no @kernel found" if none exists."""
+    _reset_ids()
+    builder = TogBuilder()
+    _build(module, builder)  # populates loop/compute nodes + op back-pointers
+
+    block = _kernel_block(module)
+    if block is None:
+        return "no @kernel found"
+    ctx = module.context
+    dma_by_op = _collect_dma_nodes(builder)
+    bufs = _BufferIds()  # one id space shared by the compute and DMA emission below
+
+    n_compute = _emit_computes(ctx, builder, bufs)
+    originals, n_dma, n_wait = _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs)
+
+    # erase the now-replaced originals (result-less -> safe), then strip the
+    # leftover data computation.
+    for op in originals:
+        try:
+            op.operation.erase()
+        except Exception:  # best-effort: an original that cannot be erased stays in the IR
+            pass
+    _dce(block)                    # drop dead consumers (e.g. the result store) first,
+    _strip_loop_iter_args(block)   # so a now-unused loop result lets us strip its iter_args
+    _dce(block)                    # then clean the orphaned accumulate ops
+
+    return ("skeleton: compute=%d dma=%d wait=%d (unpaired waits dropped)"
+            % (n_compute, n_dma, n_wait))
+
+
+def run(module, vectorlane=128):
+    """Pass-protocol entry (passes/__init__): build the skeleton in place; `vectorlane` is unused (parity only)."""
+    _report = build_skeleton(module)  # the text report is not returned
+
+
+def run_skeleton(in_path, out_path=None):
+    """Read post-vcix MLIR at `in_path`, reduce to skeleton+API, write it out.
+    `out_path` defaults to `in_path` (rewrite in place). Returns the
+    build_skeleton report string. Requires the MLIR bindings."""
+    out_path = in_path if out_path is None else out_path
+    with open(in_path) as fh:     # read fully and close before any in-place write
+        text = fh.read()
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(text, ctx)
+        report = build_skeleton(module)
+        with open(out_path, "w") as fh:
+            fh.write(str(module))
+    return report
+
+
+def main(argv):
+    """CLI entry: `build_skeleton.py INPUT [--out PATH]`; the report goes to stderr."""
+    import argparse
+    import sys
+
+    cli = argparse.ArgumentParser(prog="build_skeleton.py")
+    cli.add_argument("input")
+    cli.add_argument("--out", default=None)
+    opts = cli.parse_args(argv[1:])
+    sys.stderr.write(run_skeleton(opts.input, opts.out) + "\n")
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    raise SystemExit(main(sys.argv))  # same effect as sys.exit(main(...))
diff --git a/PyTorchSimFrontend/mlir/passes/cycle_table.py b/PyTorchSimFrontend/mlir/passes/cycle_table.py
new file mode 100644
index 00000000..40dd3459
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/cycle_table.py
@@ -0,0 +1,103 @@
+"""cycle_table (C3): the precomputed tile_id -> (cycle, overlapping_cycle) table
+the C++ trace pipeline looks up at runtime (docs/design/togsim_cpp_trace.md sec
+6, sec 9.8 task 4).
+
+A `togsim.compute(tile_id=...)` in the trace says *which* tile to compute, not
+how long it takes. Because tiles are fixed size, each tile's cost is invariant
+(only the trip count varies with shape), so it is sampled once and stored here,
+keyed by `tile_id`. Two numbers per tile, mirroring the legacy TOG:
+
+  * `cycle`            -- full compute latency, sampled by gem5 sample-mode
+                          (the existing measurement: `_rewrite_loop_steps` +
+                          `_insert_compute_markers` in build_tog, run through
+                          CycleSimulator -> the per-tile `cycle_list`).
+  * `overlapping_cycle` -- the portion that overlaps the previous instruction in
+                          the systolic pipeline; the timing core uses it as
+                          `finish = prev.finish + cycle - overlapped` (Core.cc).
+                          Derived exactly as the legacy path does
+                          (tog_generator.generate_tile_graph):
+                              type 0 (VectorCompute)  -> 0
+                              type 1 (MatmulCompute)  -> max(cycle - x_offset, 0)
+                              type 2 (MatmulPreload)  -> max(cycle - w_offset, 0)
+
+This module only *builds/serializes* the table from a cycle_list; obtaining the
+cycle_list reuses the existing sample-mode + gem5 path (wired in P3 task 5). The
+`tile_id` order matches build_skeleton's `compute_nodes` order, which matches the
+legacy TOG, so the same sampling keys both paths.
+
+Requires the MLIR Python bindings (to read the skeleton's togsim.compute ops).
+"""
+
+import json
+
+from . import togsim_ops as ts
+from ._mlir_util import walk_ops
+from .build_tog import (
+    ir,
+    VECTOR_COMPUTE,
+    MATMUL_COMPUTE,   # noqa: F401 (documents the type enum used by the formula)
+    MATMUL_PRELOAD,
+)
+
+
+def overlapping_cycle(cycle, compute_type, x_offset, w_offset):
+    """Pipeline-overlapped (hideable) part of `cycle`, never negative; same rule as
+    tog_generator.generate_tile_graph. Preload subtracts w_offset, other systolic types x_offset."""
+    if compute_type > VECTOR_COMPUTE:            # systolic op: part of it can hide
+        fill = x_offset if compute_type != MATMUL_PRELOAD else w_offset
+        return max(int(cycle) - int(fill), 0)
+    return 0                                     # VectorCompute: no systolic overlap
+
+
+def _compute_types(skeleton_module):
+    """compute_type ints of the skeleton's togsim.compute ops, sorted by tile_id
+    (ties, if any, by compute_type -- the pairs are sorted as tuples)."""
+    def _int_attr(op, key):
+        return ir.IntegerAttr(op.operation.attributes[key]).value
+
+    pairs = sorted(
+        (_int_attr(op, ts.ATTR_TILE_ID), _int_attr(op, ts.ATTR_COMPUTE_TYPE))
+        for op in walk_ops(skeleton_module.body)
+        if op.operation.name == ts.COMPUTE
+    )
+    return [ctype for _tid, ctype in pairs]
+
+
+def build_cycle_table(skeleton_module, cycle_list, x_offset, w_offset):
+    """Return `[(cycle, overlapping_cycle), ...]`, one entry per togsim.compute op in
+    ascending tile_id order. `cycle_list` is the per-tile gem5 measurement and must
+    hold exactly one entry per compute op (ValueError otherwise); `x_offset` /
+    `w_offset` are the systolic-fill offsets. NOTE(review): entry i is tile_id i
+    only if the skeleton's tile ids are dense from 0 -- confirm with the producer."""
+    types = _compute_types(skeleton_module)
+    n_cycles, n_tiles = len(cycle_list), len(types)
+    if n_cycles != n_tiles:
+        raise ValueError("cycle_list (%d) does not match #compute tiles (%d)"
+                         % (n_cycles, n_tiles))
+    return [(int(cyc), overlapping_cycle(cyc, ctype, x_offset, w_offset))
+            for cyc, ctype in zip(cycle_list, types)]
+
+
+def dump_cycle_table(table, path, x_offset=None, w_offset=None):
+    """Write the table plus both offsets as a sidecar JSON (meant to sit next to the trace
+    `.so`, for the P3 C6 loader's compute_cycle/overlapping_cycle) and return `path`."""
+    with open(path, "w") as fh:
+        payload = {"x_offset": x_offset, "w_offset": w_offset,
+                   "table": [list(entry) for entry in table]}
+        json.dump(payload, fh)
+    return path
+
+
+def load_cycle_table(path):
+    with open(path) as fh:   # returns the parsed JSON document stored at `path`
+        return json.loads(fh.read())
+
+
+def dump_cycle_table_tsv(table, path):
+    """Write one `cycle<TAB>overlapping` line per entry, in table order, and return
+    `path` -- the trivial format the C++ `--cycle_table` loader (main.cc, P3 trace
+    pipeline) reads with ifstream (no JSON dependency in TOGSim)."""
+    with open(path, "w") as fh:
+        fh.writelines("%d\t%d\n" % (int(cycle), int(overlapping))
+                      for cycle, overlapping in table)
+    return path
diff --git a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py
index c0e82b66..10b2edfb 100644
--- a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py
+++ b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py
@@ -32,13 +32,7 @@
 OP_NAME = "togsim.transfer"
 MARKERS = (OP_NAME,)
 
-
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
+from ._mlir_util import walk_ops
 
 
 def _int_array(attr):
@@ -92,7 +86,7 @@ def run(module, vectorlane=128, **_):
     targets = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for op in _iter_ops(b):
+            for op in walk_ops(b):
                 if op.operation.name == OP_NAME:
                     targets.append(op.operation)
 
diff --git a/PyTorchSimFrontend/mlir/passes/dep_analysis.py b/PyTorchSimFrontend/mlir/passes/dep_analysis.py
new file mode 100644
index 00000000..06d8270d
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/dep_analysis.py
@@ -0,0 +1,234 @@
+"""dep_analysis.py -- dependency-edge analysis for the C++ trace pipeline (P3, sec 10).
+
+The current TOG pass does NO dependency analysis (it emits a lexical loop tree +
+runtime tags). This module derives the producer->consumer edges that the explicit
+dataflow trace needs, from two sources available on the post-vcix IR (before
+build_skeleton collapses the compute regions):
+
+  1. SRAM access: each DMA/compute's read/write SRAM buffer(s), recovered by
+     following SSA (a vcix.iv's input vector -> its vector.transfer_read -> the
+     memref -> @global), and the DMA's spad operand. Edge: a reader depends on
+     the last node that wrote the same buffer.
+  2. vcix preload/matmul pairing: a matmul (vcix opcode 0) consumes the weights a
+     preceding preload (opcode 1) loaded into the systolic array -- an SA-internal
+     dependency NOT visible as a memref access, so it comes from the opcode order.
+
+This is a node-level analysis (one node per build_tog compute/DMA node); the loops
+replay the nodes, so loop-carried edges (the Y_spad accumulator) are materialized
+per iteration downstream. First cut: buffer granularity (slot-level value matching
+is a later refinement). Output is an edge list for validation / to drive emit.
+"""
+import sys
+import os
+
+from .build_tog import TogBuilder, ir, _reset_ids
+from . import build_skeleton as _bs
+
+
+def _global_of(memref_val):
+    """memref SSA value -> @global symbol name (e.g. 'X_spad'), or None."""
+    owner = memref_val.owner
+    op = owner if isinstance(owner, ir.Operation) else getattr(owner, "operation", None)
+    if op is None:
+        return None
+    if op.name == "memref.get_global":
+        return str(op.attributes["name"]).strip('@" ')
+    if not op.operands:
+        return None
+    # walk through view-like ops (subview/cast) to their source
+    try:
+        return _global_of(op.operands[0])
+    except Exception:
+        return None
+
+
+# Ops that touch SRAM-buffer DATA, by category. A view op (subview/reinterpret_cast)
+# instead PRODUCES a memref -- pure address computation, skipped here; the real access
+# is the load/store using it, whose memref operand _global_of traces back through the
+# view to the @global. Anything else carrying a memref operand raises, so a NEW fusion
+# pattern is caught at compile time rather than as a silent runtime deadlock.
+_LOAD_OPS = {"vector.transfer_read", "affine.vector_load", "vector.load",
+             "memref.load", "affine.load"}
+_STORE_OPS = {"vector.transfer_write", "affine.vector_store", "vector.store",
+              "memref.store", "affine.store"}
+_IGNORE_OPS = {"memref.dealloc"}   # lifetime, not a data access
+
+
+def _is_memref(v):
+    try:  # best-effort probe: any failure reading or classifying v.type counts as "no"
+        return ir.MemRefType.isinstance(v.type)
+    except Exception:
+        return False
+
+
+def _walk_compute_ops(cn):
+    """Yield every op of the compute node, descending into nested regions (loop
+    bodies). A fused epilogue (BatchNorm/ReLU) keeps its ops inside an un-unrolled
+    affine.for, so scanning only cn.operations would see the loop and miss them."""
+    for top in cn.operations:
+        pending = [top]
+        while pending:
+            current = pending.pop()
+            yield current
+            pending.extend(inner for region in current.operation.regions
+                           for block in region.blocks
+                           for inner in block.operations)
+
+
+def _rw_buffers_of_compute(cn):
+    """(reads, writes): the @global SRAM buffers a compute node reads/writes.
+
+    Walks every op of the node, nested regions included, and classifies each op
+    that takes a memref operand. An op that PRODUCES a memref (view/cast/alloc)
+    is address computation only and is skipped. A memref that cannot be traced
+    to a @global contributes nothing.
+
+    Raises RuntimeError for a memref-consuming op of an unclassified kind."""
+    reads, writes = set(), set()
+
+    def _record(bucket, value):
+        symbol = _global_of(value)
+        if symbol:
+            bucket.add(symbol)
+
+    # classify by op name; the name sets are disjoint, so branch order is free
+    for op in _walk_compute_ops(cn):
+        if any(_is_memref(res) for res in op.results):
+            continue                                   # view/cast/alloc -- address only
+        mem_operands = [v for v in op.operands if _is_memref(v)]
+        if not mem_operands:
+            continue
+        name = op.name
+        if name in _LOAD_OPS or name in _STORE_OPS:
+            # every memref operand is recorded (for a store: the target memref)
+            bucket = reads if name in _LOAD_OPS else writes
+            for v in mem_operands:
+                _record(bucket, v)
+        elif name == "memref.copy":
+            _record(reads, mem_operands[0])            # first memref: copied from
+            _record(writes, mem_operands[-1])          # last memref: copied into
+        elif name.startswith("linalg."):               # DPS: ins read, outs read+write
+            for v in filter(_is_memref, op.inputs):
+                _record(reads, v)
+            for v in filter(_is_memref, op.outputs):
+                _record(reads, v)
+                _record(writes, v)
+        elif name not in _IGNORE_OPS:                  # ignored ops: lifetime, not data
+            raise RuntimeError(
+                f"dep_analysis: unclassified memref op '{name}' in a compute node -- "
+                f"it touches an SRAM buffer; classify it in _LOAD_OPS/_STORE_OPS")
+    return reads, writes
+
+
+def _dma_buffer(builder, dma_node):
+    """@global name of the SRAM spad a DMA touches (src for a store, dst for a
+    load); None when the dma_start fields cannot be read or no global is found."""
+    try:
+        fields = builder._dma_start_fields(dma_node.op)
+    except Exception:
+        return None
+    return _global_of(fields["src" if dma_node.is_write else "dst"])
+
+
+# Virtual buffer for the systolic-array weight registers: a preload writes it,
+# the following matmul reads it. This folds the SA-internal preload->matmul
+# dependency (not a memref access) into the uniform "last-writer per buffer" rule.
+SA_WEIGHTS = "__SA_WEIGHTS__"
+
+
+def compute_buffers(cn):
+    """(read_buffers, write_buffers) of one compute node: its real SRAM accesses
+    plus the virtual SA_WEIGHTS edge -- a preload writes it, a matmul reads it."""
+    reads, writes = _rw_buffers_of_compute(cn)
+    # compute_type codes: 1 = MATMUL (consumes the weights), 2 = PRELOAD (loads them)
+    virtual_user = {1: reads, 2: writes}.get(cn.compute_type)
+    if virtual_user is not None:
+        virtual_user.add(SA_WEIGHTS)
+    return reads, writes
+
+
+def analyze(module):
+    """Return (nodes, edges) for the kernel in `module`.
+
+    nodes: dicts in program order with "kind", "reads"/"writes" (sets of @global
+    buffer names) and "node" (the build_tog node). edges: (consumer_idx,
+    producer_idx, reason) tuples indexing `nodes`. A DMA whose spad cannot be
+    traced to a @global gets empty read/write sets, so unresolved buffers never
+    alias each other under the last-writer rule."""
+    _reset_ids()
+    builder = TogBuilder()
+    _bs._build(module, builder)
+
+    nodes = []
+    # DMA nodes only (the map also contains TOGDMAWaitNode; keep real DMAs).
+    dma_nodes = [dn for dn in dict.fromkeys(_bs._collect_dma_nodes(builder).values())
+                 if hasattr(dn, "is_write")]
+    for dn in dma_nodes:
+        buf = _dma_buffer(builder, dn)
+        touched = set() if buf is None else {buf}    # None = untraceable spad
+        nodes.append({
+            "kind": "STORE" if dn.is_write else "LOAD",
+            "buf": buf, "arg": str(dn.base_addr),
+            "reads": touched if dn.is_write else set(),
+            "writes": set() if dn.is_write else touched,
+            "node": dn,
+        })
+    for cn in builder.compute_nodes:
+        if not cn.operations:
+            continue
+        ct = {0: "VECTOR", 1: "MATMUL", 2: "PRELOAD"}.get(cn.compute_type, f"c{cn.compute_type}")
+        creads, cwrites = _rw_buffers_of_compute(cn)
+        nodes.append({"kind": ct, "reads": creads, "writes": cwrites,
+                      "node": cn, "compute_type": cn.compute_type})
+
+    # Order nodes by program position (last-writer needs program order: e.g. the
+    # store reads Y_spad written by the matmul, which lexically precedes it).
+    pos = {}
+    def _index(op):
+        pos[op] = len(pos)          # pre-order visit number
+        for r in op.regions:
+            for b in r.blocks:
+                for o in b.operations:
+                    _index(o)
+    _index(module.operation)
+    def _key(n):
+        node = n["node"]
+        op = getattr(node, "op", None) or (node.operations[0] if getattr(node, "operations", None) else None)
+        return pos.get(op, 1 << 30)  # unknown ops sort last
+    nodes.sort(key=_key)
+
+    # Edges: (1) buffer last-writer, (2) preload->matmul.
+    edges = []
+    last_writer = {}  # buffer -> node idx
+    prev_preload = None
+    for i, n in enumerate(nodes):
+        for b in sorted(n["reads"]):
+            if b in last_writer:
+                edges.append((i, last_writer[b], f"reads {b}"))
+        if n["kind"] == "MATMUL" and prev_preload is not None:
+            edges.append((i, prev_preload, "uses preloaded weights (vcix op1->op0)"))
+        for b in n["writes"]:
+            last_writer[b] = i
+        if n["kind"] == "PRELOAD":
+            prev_preload = i
+    return nodes, edges
+
+
+def _main():
+    """CLI driver: print the nodes and consumer -> producer edges of the MLIR file sys.argv[1]."""
+    ctx = ir.Context(); ctx.allow_unregistered_dialects = True
+    with open(sys.argv[1]) as fh, ctx:   # the input handle is closed on exit
+        module = ir.Module.parse(fh.read(), ctx)
+        nodes, edges = analyze(module)
+    print("=== nodes ===")
+    for i, n in enumerate(nodes):
+        r = ",".join(sorted(str(b) for b in n["reads"])) or "-"    # str(): tolerate a None entry
+        w = ",".join(sorted(str(b) for b in n["writes"])) or "-"
+        print(f"  #{i:<2} {n['kind']:<8} reads[{r}] writes[{w}]")
+    print("=== edges (consumer -> producer) ===")
+    for c, p, why in edges:
+        print(f"  #{c} ({nodes[c]['kind']}) -> #{p} ({nodes[p]['kind']})   [{why}]")
+
+
+if __name__ == "__main__":
+    _main()  # script entry: run the CLI driver, which reads the input path from sys.argv[1]
diff --git a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py
index 3f583ef2..d7571d2b 100644
--- a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py
+++ b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py
@@ -21,6 +21,7 @@
 
 Pipeline entry point: run_fine_grained(in_path, out_path, vectorlane).
 """
+import itertools
 import os
 import sys
 
@@ -30,6 +31,8 @@
 
 import mlir.ir as ir  # noqa: E402
 
+from ._mlir_util import walk_ops, attr_i64_array
+
 MARKERS = ("subtile_size",)   # only subtile DMAs are split
 
 MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3
@@ -54,12 +57,6 @@ def _const_int(value, default=-1):
         return default
 
 
-def _int_array_attr(op, key):
-    if key not in op.attributes:
-        return []
-    return [ir.IntegerAttr(a).value for a in ir.ArrayAttr(op.attributes[key])]
-
-
 def _is_block_arg(v):
     return isinstance(v, ir.BlockArgument)
 
@@ -106,13 +103,13 @@ def tile_shape(self):
         return list(mt.shape)
 
     def subtile_size(self):
-        return _int_array_attr(self.op, "subtile_size")
+        return attr_i64_array(self.op, "subtile_size", default=[])
 
     def sram_stride(self):
-        return _int_array_attr(self.op, "sram_stride")
+        return attr_i64_array(self.op, "sram_stride", default=[])
 
     def dram_stride(self):
-        return _int_array_attr(self.op, "dram_stride")
+        return attr_i64_array(self.op, "dram_stride", default=[])
 
     def is_async(self):
         a = self.op.attributes
@@ -244,6 +241,27 @@ def _const_index(v, ip):
                             ir.IntegerAttr.get(ir.IndexType.get(), v), ip=ip).result
 
 
+def _fresh_tag(dma):
+    """Give this DMA a fresh tag memref.alloc right BEFORE the (pre-split) coarse
+    dma_start, and rewire every use of the old tag -- the dma_start re-emitted
+    below AND its dma_wait -- to it. The coarse dma sits at the reduction-loop body
+    level (it has not been wrapped in a subtile load nest yet), so the alloc there
+    dominates both the load nest fine-grained is about to build and the sibling
+    wait nest. Each reduction iteration thus allocates its own tag -> successive
+    iterations are distinct (multi-tile-K / conv) and the per-iteration tag
+    semantics is in the IR, not reconstructed downstream. Old alloc becomes dead."""
+    old = dma.tag
+    new_tag = ir.Operation.create("memref.alloc", results=[old.type],
+                                  operands=[], ip=ir.InsertionPoint(dma.op)).results[0]
+    old.replace_all_uses_with(new_tag)
+    dma.tag = new_tag
+    # the old (func-entry, per-tensor unique) alloc is now dead -- erase it.
+    try:
+        old.owner.erase()
+    except Exception:
+        pass
+
+
 # ---------------------------------------------------------------------------
 # Loop-nest construction
 # ---------------------------------------------------------------------------
@@ -293,20 +311,12 @@ def _reaches(value, target):
 # ---------------------------------------------------------------------------
 # Pass driver
 # ---------------------------------------------------------------------------
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
-
-
 def _run_func(func, vectorlane):
     from mlir.dialects import linalg
     # First matmul only.
     matmul = None
     dmas = []
-    for op in _iter_ops(func.regions[0].blocks[0]):
+    for op in walk_ops(func.regions[0].blocks[0]):
         name = op.operation.name
         if name == "linalg.matmul" and matmul is None:
             matmul = op
@@ -363,16 +373,30 @@ def _run_func(func, vectorlane):
     for d, f in enumerate(fuse["w_to_fused"]):
         bounds[f] = w_counts[d]
 
+    # Give each load a fresh per-iteration tag alloc just before its coarse dma
+    # (rewiring its dma_wait via the old tag's uses), so the tag is distinct per
+    # reduction iteration -- positioned to match the per-iteration tag semantics.
+    _fresh_tag(mvin_input)
+    _fresh_tag(mvin_weight)
+
     # Insert the fused nest at the weight DMA (the later of the two): both DMAs'
     # original DRAM base indices (src_idx[0], computed in the enclosing loops) must
     # dominate the nest. Codegen emits input before weight, matching the C++ pass
     # which fuses after the weight subtile loop.
     ip = ir.InsertionPoint(mvin_weight.op)
-    fused_ivs, body_ip = _build_for_nest(bounds, ip)
-    in_ivs = [fused_ivs[fuse["in_to_fused"][d]] for d in range(rank)]
-    w_ivs = [fused_ivs[fuse["w_to_fused"][d]] for d in range(rank)]
-    _emit_dma(mvin_input, in_ivs, vectorlane, body_ip)
-    _emit_dma(mvin_weight, w_ivs, vectorlane, body_ip)
+    # Unroll the fused nest, emitting each distinct input/weight subtile ONCE (a load
+    # is invariant to the other operand's dims, so the cross-product re-emits it
+    # identically). Dedup by the operand's own coords; keep the fused issue order.
+    seen_in, seen_w = set(), set()
+    for it in itertools.product(*[range(b) for b in bounds]):
+        in_key = tuple(it[fuse["in_to_fused"][d]] for d in range(rank))
+        if in_key not in seen_in:
+            seen_in.add(in_key)
+            _emit_dma(mvin_input, [_const_index(c, ip) for c in in_key], vectorlane, ip)
+        w_key = tuple(it[fuse["w_to_fused"][d]] for d in range(rank))
+        if w_key not in seen_w:
+            seen_w.add(w_key)
+            _emit_dma(mvin_weight, [_const_index(c, ip) for c in w_key], vectorlane, ip)
     mvin_input.op.erase()
     mvin_weight.op.erase()
 
diff --git a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py
index f5b841bb..998a6db5 100644
--- a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py
+++ b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py
@@ -22,6 +22,8 @@
 WAIT_NAME = "memref.dma_wait"
 MARKERS = (OP_NAME, WAIT_NAME)
 
+from ._mlir_util import attr_i64_array
+
 # func7 instruction codes (CustomDMAAttribute.h)
 CONFIG, CONFIG2, CONFIG3, CONFIG4 = 0, 4, 5, 6
 MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3
@@ -124,8 +126,8 @@ def elem_addr_i64(memref_val, indices, mtype, elem_bytes):
         tile_shape = _subtile(op)
         if tile_shape is None:
             tile_shape = list(dst_ty.shape) if is_mvin else list(src_ty.shape)
-        dram_strides = _int_array(op, "dram_stride")
-        spad_strides = _int_array(op, "sram_stride")
+        dram_strides = attr_i64_array(op, "dram_stride")
+        spad_strides = attr_i64_array(op, "sram_stride")
         assert len(tile_shape) == len(dram_strides) == len(spad_strides), \
             f"shape/stride rank mismatch: {tile_shape} {dram_strides} {spad_strides}"
 
@@ -180,11 +182,6 @@ def _subtile(op):
     return [IntegerAttr(a).value for a in ArrayAttr(op.attributes["subtile_size"])]
 
 
-def _int_array(op, name):
-    from mlir.ir import ArrayAttr, IntegerAttr
-    return [IntegerAttr(a).value for a in ArrayAttr(op.attributes[name])]
-
-
 def _elem_bytes(elem_type):
     from mlir.ir import IntegerType, FloatType
     bits = (IntegerType(elem_type).width if IntegerType.isinstance(elem_type)
diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py
new file mode 100644
index 00000000..3d1f7cde
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py
@@ -0,0 +1,607 @@
+"""lower_to_emitc pass (C4): skeleton+API MLIR -> EmitC -> C++ -> trace `.so`.
+
+Second stage of the C++ trace pipeline (docs/design/togsim_cpp_trace.md, sec
+5-7). Takes the skeleton+API module from `build_skeleton` (loop nest +
+`togsim.*` ops) and produces an EmitC module whose single entry function
+
+    extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n)
+
+mirrors the loop skeleton, with every `togsim.*` op as an `emitc.call_opaque`
+to the matching `togsim_runtime.h` free function (`togsim_ops.EMITC_CALLEE`).
+`mlir-translate --mlir-to-cpp` renders it to C++, compiled to a `.so` that
+exports `togsim_kernel` and leaves `togsim_dma/wait/compute/signal` undefined for
+the TOGSim loader to resolve at `dlopen`.
+
+How the lowering is done -- it drives the *upstream* EmitC conversion passes and
+adds only the glue they cannot do:
+
+  1. (python) Rewrite the unregistered `togsim.*` ops to `emitc.call_opaque`.
+     Unregistered ops have no registered conversion patterns, so this must be a
+     custom rewrite (design sec 8). Also rewrite the kernel's signature to the
+     ABI form (drop the memref tensor args -- the trace producer never touches
+     tensor data; base addresses are deferred to P3) and drop the aux
+     globals / wrapper func.
+  2. (upstream passes, in-process PassManager)
+        convert-vector-to-scf -> lower-affine -> lower-vector-multi-reduction
+        -> convert-scf-to-emitc -> convert-arith-to-emitc -> convert-func-to-emitc
+     This is the EmitC infrastructure: it lowers the affine/scf loop nest to
+     `emitc.for`, the index/arith (loop bounds, and in P3 the address
+     arithmetic) to EmitC, and the func to `emitc.func`.
+  3. (python) Two small fixups the passes leave behind in this LLVM 20 build:
+       * `convert-scf-to-emitc` emits `emitc.for` with `index`-typed bounds, so
+         `convert-arith-to-emitc` (which makes constants `!emitc.size_t`) leaves
+         `builtin.unrealized_conversion_cast` on the bounds that nothing folds
+         and `mlir-to-cpp` cannot print (design sec 8 "EmitC coverage" risk).
+         `_retype_for_to_size_t` rewrites those casts away.
+       * add the `extern "C"` specifier so `dlsym` finds the entry unmangled.
+
+Requires the MLIR Python bindings (incl. `mlir.passmanager`); the .cpp/.so
+steps additionally require `mlir-translate` (TORCHSIM_LLVM_PATH) and a host C++
+compiler.
+"""
+
+import os
+import subprocess
+
+from mlir.passmanager import PassManager
+
+from . import togsim_ops as ts
+from ._mlir_util import walk_ops, i32, i64, attr_int, attr_i64_array
+from .build_tog import ir, _find_kernel
+
+#: name given to the emitted entry function (== ts.ENTRY_SYMBOL == "togsim_kernel").
+ENTRY = ts.ENTRY_SYMBOL
+
+#: textual EmitC type of the opaque EmitCtx* threaded through every call.
+CTX_TYPE = '!emitc.ptr<!emitc.opaque<"EmitCtx">>'
+
+#: upstream pass pipeline run by lower_to_emitc() after the python-side rewrite.
+_PIPELINE = ("builtin.module("
+             "convert-vector-to-scf{full-unroll=true},"
+             "func.func(lower-affine),"
+             "func.func(lower-vector-multi-reduction),"
+             "convert-scf-to-emitc,"
+             "convert-arith-to-emitc,"
+             "convert-func-to-emitc)")
+
+#: prepended to the mlir-to-cpp output by emitc_to_cpp(): size_t/intN_t + the ABI header.
+_PRELUDE = (
+    "#include <cstddef>\n"
+    "#include <cstdint>\n"
+    "using std::size_t;\n"
+    '#include "togsim_runtime.h"\n'
+)
+
+
+# ---------------------------------------------------------------------------
+# attribute builders / readers
+# ---------------------------------------------------------------------------
+def _idx(v):   # `index`-typed IntegerAttr holding int(v)
+    return ir.IntegerAttr.get(ir.IndexType.get(), int(v))
+
+
+def _opaque(ctx, text):   # parses #emitc.opaque<"text">; `text` is interpolated verbatim (no escaping)
+    return ir.Attribute.parse('#emitc.opaque<"%s">' % text, ctx)
+
+
+def _arr(ctx, vals):
+    """A C compound-literal `(const int64_t[]){...}` arg, or `nullptr` if empty
+    (the call site decays it to a `const int64_t*`)."""
+    vals = list(vals)
+    if not vals:
+        return _opaque(ctx, "nullptr")
+    return _opaque(ctx, "(const int64_t[]){%s}" % ", ".join(str(int(v)) for v in vals))
+
+
+def _attr_bool(op, key):
+    return 1 if ir.BoolAttr(op.operation.attributes[key]).value else 0
+
+
+# ---------------------------------------------------------------------------
+# step 1: rewrite signature + togsim.* ops (the unregistered-op glue)
+# ---------------------------------------------------------------------------
+def _strip_aux(module):
+    """Erase memref.global decls and every func except @kernel (the wrapper)."""
+    victims = []
+    for op in module.body.operations:
+        name = op.operation.name
+        if name == "memref.global":
+            victims.append(op)
+        elif name == "func.func":
+            if ir.StringAttr(op.operation.attributes["sym_name"]).value != "kernel":
+                victims.append(op)
+    for op in victims:
+        op.operation.erase()
+
+
+def _rewrite_signature(kernel, ctx):
+    """Replace @kernel's memref tensor args with the ABI args
+    (EmitCtx*, int64_t* shape_args, int32_t n) and rename it to togsim_kernel.
+    Returns the ctx Value."""
+    block = kernel.regions[0].blocks[0]
+    for arg in block.arguments:
+        if len(list(arg.uses)) > 0:
+            raise ValueError(
+                "kernel arg still used after build_skeleton; cannot drop it "
+                "(expected the DCE to have removed all tensor-data ops)")
+    # erase existing (memref) args high-to-low, then append the ABI args.
+    for i in reversed(range(len(block.arguments))):
+        block.erase_argument(i)
+    ptr = ir.Type.parse(CTX_TYPE, ctx)
+    i64ptr = ir.Type.parse("!emitc.ptr<i64>", ctx)
+    i32 = ir.IntegerType.get_signless(32)
+    loc = ir.Location.unknown(ctx)
+    block.add_argument(ptr, loc)
+    block.add_argument(i64ptr, loc)
+    block.add_argument(i32, loc)
+    kernel.operation.attributes["function_type"] = ir.TypeAttr.get(
+        ir.FunctionType.get([ptr, i64ptr, i32], []))
+    kernel.operation.attributes["sym_name"] = ir.StringAttr.get(ENTRY)
+    return block.arguments[0]
+
+
+def _call(ctx, ctx_val, op, callee, arg_attrs):
+    """Insert emitc.call_opaque <callee>(ctx) {args=[0:index, ...]} before `op`.
+    The leading `0 : index` references operand 0 (ctx); other entries are
+    literal C args (integer attr -> literal, #emitc.opaque -> verbatim)."""
+    ir.Operation.create(
+        "emitc.call_opaque", results=[], operands=[ctx_val],
+        attributes={"callee": ir.StringAttr.get(callee),
+                    "args": ir.ArrayAttr.get([_idx(0)] + arg_attrs)},
+        loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(op))
+
+
+def _innermost_outer_loop(block):
+    """Deepest `affine.for {outer_loop=true}` (the PARALLEL/ACCUMULATION
+    boundary). Returns the op or None if the kernel has no parallel loop."""
+    found = [None]
+
+    def is_outer(op):
+        a = op.operation.attributes
+        return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value
+
+    def walk(b):
+        for op in b.operations:
+            if op.operation.name == "affine.for" and is_outer(op):
+                found[0] = op   # nested outer loops overwrite -> deepest wins
+            for r in op.operation.regions:
+                for bb in r.blocks:
+                    walk(bb)
+
+    walk(block)
+    return found[0]
+
+
+def _is_outer(forop):
+    a = forop.operation.attributes
+    return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value
+
+
+def _parallel_loop_chain(block):
+    """The nested chain of `affine.for {outer_loop}` from `block` inward (one
+    work-item's parallel indices). Empty if the kernel has no parallel loop."""
+    chain = []
+    cur = block
+    while True:
+        nxt = None
+        for op in cur.operations:
+            if op.operation.name == "affine.for" and _is_outer(op):
+                nxt = op
+                break
+        if nxt is None:
+            break
+        chain.append(nxt)
+        cur = nxt.operation.regions[0].blocks[0]
+    return chain
+
+
+def _const_op(value):
+    """The defining arith/emitc constant Operation if `value` is a constant
+    result, else None (block args / other ops)."""
+    owner = value.owner
+    if isinstance(owner, ir.Block):
+        return None
+    return owner if owner.name in ("arith.constant", "emitc.constant") else None
+
+
+def _outline_work_item(ctx, kernel, ctx_val):
+    """Outline the innermost parallel work-item body into a uniform
+    `togsim_kernel_tile(ctx, iv, n)` func, replacing it with a
+    `togsim_dispatch(ctx, togsim_kernel_tile, iv, n)` call (sec 9.3). The
+    work-item SCOPE becomes the function body; the runtime wrapper owns the
+    core-alloc + the TILE_BEGIN/TILE_END boundary (a decorator). One uniform tile
+    signature -> a single general dispatcher serves every kernel.
+
+    Runs after `_rewrite_togsim_ops`, so the moved body holds emitc.call_opaque
+    (not togsim.* ops). Captures handled: ctx, the enclosing parallel induction
+    vars (threaded via the iv array) and constants (cloned into the tile func).
+    NOTE(review): any other captured value is left un-remapped with no error
+    raised here -- unsupported (dynamic shape -> P4). Mutates `kernel` in place."""
+    kblk = kernel.regions[0].blocks[0]
+    chain = _parallel_loop_chain(kblk)
+    if chain:
+        L = chain[-1]
+        Lbody = L.operation.regions[0].blocks[0]
+        ivs = [c.operation.regions[0].blocks[0].arguments[0] for c in chain]
+    else:                       # no parallel loop -> the whole kernel body is one work-item
+        L = None
+        Lbody = kblk
+        ivs = []
+
+    i64 = ir.IntegerType.get_signless(64)
+    i32 = ir.IntegerType.get_signless(32)
+    idxty = ir.IndexType.get()
+    ctxty = ir.Type.parse(CTX_TYPE, ctx)
+    i64ptr = ir.Type.parse("!emitc.ptr<i64>", ctx)
+    loc = ir.Location.unknown(ctx)
+
+    # --- the outlined tile function (before the kernel so C defines it first) ---
+    tile = ir.Operation.create(
+        "func.func", results=[], regions=1,
+        attributes={
+            "function_type": ir.TypeAttr.get(ir.FunctionType.get([ctxty, i64ptr, i32], [])),
+            "sym_name": ir.StringAttr.get(ts.TILE_SYMBOL),
+            "sym_visibility": ir.StringAttr.get("private")},
+        loc=loc, ip=ir.InsertionPoint(kernel))
+    with loc:
+        tblk = tile.regions[0].blocks.append(ctxty, i64ptr, i32)
+    ctx2, iv2, _n2 = tblk.arguments
+    with ir.InsertionPoint(tblk):
+        tret = ir.Operation.create("func.return", results=[], operands=[], loc=loc)
+
+    # in the tile fn: recover each parallel index = index_cast(iv[k]).
+    idx_vals = []
+    with ir.InsertionPoint(tret):
+        for k in range(len(ivs)):
+            kc = ir.Operation.create("emitc.constant", results=[i64],
+                    attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0]
+            elem = ir.Operation.create("emitc.subscript", results=[i64],
+                    operands=[iv2, kc], loc=loc).results[0]
+            idx_vals.append(ir.Operation.create("arith.index_cast", results=[idxty],
+                    operands=[elem], loc=loc).results[0])
+
+    # move the work-item body into the tile fn (terminators stay behind).
+    for op in [o for o in Lbody.operations
+               if o.operation.name not in ("affine.yield", "func.return")]:
+        op.operation.move_before(tret)
+
+    # remap captures (Value `==` is identity): ctx -> ctx2, each parallel IV ->
+    # its index_cast, each external constant -> a clone inside the tile fn. A
+    # constant defined inside the tile fn (moved/read) is internal -> left alone.
+    caps = [(ctx_val, ctx2)] + list(zip(ivs, idx_vals))
+    internal_consts = []
+    def _collect_internal(block):
+        for op in block.operations:
+            c = _const_op(op.operation.results[0]) if len(op.operation.results) == 1 else None
+            if c is not None:
+                internal_consts.append(op.operation.results[0])
+            for rg in op.operation.regions:
+                for b in rg.blocks:
+                    _collect_internal(b)
+    _collect_internal(tblk)
+    const_clones = []
+    ext_consts = []
+    def _find_ext_consts(block):
+        for op in block.operations:
+            for opnd in op.operation.operands:
+                if _const_op(opnd) is None:
+                    continue
+                if any(opnd == ic for ic in internal_consts):
+                    continue
+                if any(opnd == e for e in ext_consts):
+                    continue
+                ext_consts.append(opnd)
+            for rg in op.operation.regions:
+                for b in rg.blocks:
+                    _find_ext_consts(b)
+    _find_ext_consts(tblk)
+    top = ir.InsertionPoint(tblk.operations[0])
+    for e in ext_consts:
+        c = _const_op(e)
+        clone = ir.Operation.create(c.name, results=[e.type],
+                    attributes={"value": c.attributes["value"]}, loc=loc, ip=top).results[0]
+        const_clones.append((e, clone))
+
+    allcaps = caps + const_clones
+    def _remap(block):
+        for op in block.operations:
+            for i in range(len(op.operation.operands)):
+                cur = op.operation.operands[i]
+                for orig, new in allcaps:
+                    if cur == orig:
+                        op.operation.operands[i] = new
+                        break
+            for rg in op.operation.regions:
+                for b in rg.blocks:
+                    _remap(b)
+    _remap(tblk)
+
+    # --- the dispatcher: marshal the IVs and hand the tile fn to togsim_dispatch ---
+    term = [o for o in Lbody.operations
+            if o.operation.name in ("affine.yield", "func.return")][0]
+    fn_ref = _opaque(ctx, ts.TILE_SYMBOL)   # function name -> verbatim pointer in C
+    with ir.InsertionPoint(term):
+        if ivs:
+            arrty = ir.Type.parse("!emitc.array<%dxi64>" % len(ivs), ctx)
+            arr = ir.Operation.create("emitc.variable", results=[arrty],
+                    attributes={"value": _opaque(ctx, "")}, loc=loc).results[0]
+            for k, iv in enumerate(ivs):
+                kc = ir.Operation.create("emitc.constant", results=[i64],
+                        attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0]
+                v64 = ir.Operation.create("arith.index_cast", results=[i64],
+                        operands=[iv], loc=loc).results[0]
+                sub = ir.Operation.create("emitc.subscript", results=[i64],
+                        operands=[arr, kc], loc=loc).results[0]
+                # emitc.assign operands are (lvalue dest, value).
+                ir.Operation.create("emitc.assign", results=[], operands=[sub, v64], loc=loc)
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=[ctx_val, arr],
+                attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE),
+                            "args": ir.ArrayAttr.get(
+                                [_idx(0), fn_ref, _idx(1), ir.IntegerAttr.get(i32, len(ivs))])},
+                loc=loc)
+        else:
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=[ctx_val],
+                attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE),
+                            "args": ir.ArrayAttr.get(
+                                [_idx(0), fn_ref, _opaque(ctx, "nullptr"), ir.IntegerAttr.get(i32, 0)])},
+                loc=loc)
+
+
+def _rewrite_togsim_ops(ctx, kernel, ctx_val):
+    """Replace each ts.DMA / ts.MEMORY_BAR / ts.COMPUTE op under @kernel by an
+    `emitc.call_opaque` inserted just before it; the old ops are erased at the end."""
+    block = kernel.regions[0].blocks[0]
+    victims = []
+    for op in walk_ops(block):
+        name = op.operation.name
+        ipo = ir.InsertionPoint(op)
+        if name == ts.DMA:
+            dims = attr_i64_array(op, ts.ATTR_DIMS)
+            # Operands carried by build_skeleton: [dram_index, tag_index] (each
+            # optional; dram_index is the original affine index, kept live by
+            # build_skeleton). Each present operand becomes a call operand -- so
+            # convert-arith-to-emitc lowers its arithmetic into the producer (P3
+            # approach A) -- and is referenced from `args` by operand position;
+            # an absent one is passed as the literal 0. offset -> DRAM address
+            # (runtime adds the tensor base); tag_slot -> the SRAM tile slot.
+            ins = list(op.operation.operands)
+            dram_operand = ins[0] if len(ins) >= 1 else None
+            tag_operand = ins[1] if len(ins) >= 2 else None
+            operands = [ctx_val]
+            offset_arg = i64(0)
+            tag_arg = i64(0)
+            if dram_operand is not None:
+                operands.append(dram_operand)
+                offset_arg = _idx(len(operands) - 1)
+            if tag_operand is not None:
+                operands.append(tag_operand)
+                tag_arg = _idx(len(operands) - 1)
+            args = [_idx(0),
+                    i32(attr_int(op, ts.ATTR_DIR)),
+                    i32(attr_int(op, ts.ATTR_ARG_ID)),
+                    offset_arg,
+                    i32(len(dims)),
+                    _arr(ctx, dims),
+                    _arr(ctx, attr_i64_array(op, ts.ATTR_STRIDES)),
+                    i32(attr_int(op, ts.ATTR_ELEM_BITS)),
+                    i32(_attr_bool(op, ts.ATTR_IS_ASYNC)),
+                    i32(attr_int(op, ts.ATTR_TAG_ID)),
+                    tag_arg]
+            _rb = attr_i64_array(op, ts.ATTR_READ_BUFS)
+            _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS)
+            args += [_arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))]
+            # togsim_dma is void: the dma is paired with its barrier by the runtime
+            # (tag_id, tag_slot), not a returned handle.
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=operands,
+                attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.DMA]),
+                            "args": ir.ArrayAttr.get(args)},
+                loc=ir.Location.unknown(ctx), ip=ipo)
+            victims.append(op)
+        elif name == ts.MEMORY_BAR:
+            # explicit async-DMA sync (the original dma_wait) ->
+            # togsim_memory_barrier(ctx, tag_id, tag_slot, write_bufs). The tag
+            # index operand (if any) is the runtime tag slot.
+            ins = list(op.operation.operands)
+            operands = [ctx_val]
+            tag_arg = i64(0)
+            if ins:
+                operands.append(ins[0])
+                tag_arg = _idx(len(operands) - 1)
+            _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS)
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=operands,
+                attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.MEMORY_BAR]),
+                            "args": ir.ArrayAttr.get(
+                                [_idx(0), i32(attr_int(op, ts.ATTR_TAG_ID)), tag_arg,
+                                 _arr(ctx, _wb), i32(len(_wb))])},
+                loc=ir.Location.unknown(ctx), ip=ipo)
+            victims.append(op)
+        elif name == ts.COMPUTE:
+            # skeleton compute carries no dims (cost is keyed by tile_id) -> 0/null.
+            _rb = attr_i64_array(op, ts.ATTR_READ_BUFS)
+            _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS)
+            _call(ctx, ctx_val, op, ts.EMITC_CALLEE[ts.COMPUTE],
+                  [i64(attr_int(op, ts.ATTR_TILE_ID)),
+                   i32(attr_int(op, ts.ATTR_COMPUTE_TYPE)),
+                   i32(0), _opaque(ctx, "nullptr"),
+                   _arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))])
+            victims.append(op)
+    for op in victims:
+        op.operation.erase()
+
+
+# ---------------------------------------------------------------------------
+# step 3: post-conversion fixups
+# ---------------------------------------------------------------------------
+def _retype_for_to_size_t(module):
+    """Make every `emitc.for` use `!emitc.size_t` bounds + induction variable,
+    then drop the `index`<->`!emitc.size_t` `unrealized_conversion_cast` ops that
+    `convert-scf-to-emitc` / `convert-arith-to-emitc` leave behind (mlir-to-cpp
+    cannot print them; --reconcile cannot fold them).
+
+    `emitc.for` accepts `size_t` bounds with the explicit type, and a `size_t` IV
+    makes the lowered address arithmetic (`convert-arith-to-emitc`, which works
+    in `size_t`) cast-free. So: set each IV to size_t, then for every
+    index<->size_t cast replace its result with its source (every consumer here
+    -- `emitc.for` bounds, `emitc.call_opaque` operands, `emitc` arith -- accepts
+    either, and after the IV retype each such cast bridges equal types)."""
+    idx = ir.IndexType.get()
+    st = ir.Type.parse("!emitc.size_t", module.context)
+
+    for op in list(walk_ops(module.body)):
+        if op.operation.name == "emitc.for":
+            op.operation.regions[0].blocks[0].arguments[0].set_type(st)
+
+    dead = []
+    for op in list(walk_ops(module.body)):
+        if op.operation.name != "builtin.unrealized_conversion_cast":
+            continue
+        res = op.results[0]
+        src = list(op.operation.operands)[0]
+        # idx<->size_t bridges (incl. the size_t->size_t identities left after
+        # the IV retype): every consumer here accepts either, so fold to source.
+        if src.type in (idx, st) and res.type in (idx, st):
+            res.replace_all_uses_with(src)
+            dead.append(op)
+    for d in dead:
+        try:
+            d.operation.erase()
+        except Exception:
+            pass
+
+
+def _add_extern_c(module, ctx):
+    for op in module.body.operations:
+        if (op.operation.name == "emitc.func"
+                and ir.StringAttr(op.operation.attributes["sym_name"]).value == ENTRY):
+            op.operation.attributes["specifiers"] = ir.ArrayAttr.get(
+                [ir.StringAttr.get('extern "C"')])
+            return
+    raise ValueError("emitc.func @%s not found after conversion" % ENTRY)
+
+
+# ---------------------------------------------------------------------------
+# driver
+# ---------------------------------------------------------------------------
+def lower_to_emitc(skeleton_module):
+    """Lower a skeleton+API module (in place) to an EmitC module with the ENTRY
+    function. Returns the same module; ValueError if _find_kernel finds no kernel."""
+    ctx = skeleton_module.context
+    kernel = _find_kernel(skeleton_module)
+    if kernel is None:
+        raise ValueError("no @kernel found in skeleton module")
+
+    _strip_aux(skeleton_module)                       # drop memref.global + non-kernel funcs
+    ctx_val = _rewrite_signature(kernel, ctx)         # ABI args; ctx_val is the EmitCtx* arg
+    _rewrite_togsim_ops(ctx, kernel, ctx_val)         # togsim.* -> emitc.call_opaque
+    _outline_work_item(ctx, kernel, ctx_val)          # work-item body -> togsim_kernel_tile + dispatch
+
+    PassManager.parse(_PIPELINE, ctx).run(skeleton_module.operation)   # upstream conversions
+
+    _retype_for_to_size_t(skeleton_module)            # fold the leftover index<->size_t casts
+    _add_extern_c(skeleton_module, ctx)               # unmangled entry symbol
+    return skeleton_module
+
+
+# ---------------------------------------------------------------------------
+# C++ / .so backend
+# ---------------------------------------------------------------------------
+def _mlir_translate_bin():
+    return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"),
+                        "mlir-translate")
+
+
+def emitc_to_cpp(emitc_module, mlir_translate=None):
+    """Render `emitc_module` to C++ source (prelude + mlir-to-cpp body)."""
+    mlir_translate = mlir_translate or _mlir_translate_bin()
+    proc = subprocess.run(
+        [mlir_translate, "--mlir-to-cpp"],
+        input=str(emitc_module), capture_output=True, text=True)
+    if proc.returncode != 0:
+        raise RuntimeError("mlir-translate --mlir-to-cpp failed:\n" + proc.stderr)
+    return _PRELUDE + proc.stdout
+
+
+def compile_so(cpp_text, so_path, include_dir, cxx=None):
+    """Compile producer C++ to `so_path`. `include_dir` must hold
+    togsim_runtime.h. togsim_* symbols are left undefined (resolved at dlopen)."""
+    cxx = cxx or os.environ.get("CXX", "g++")
+    cpp_path = os.path.splitext(so_path)[0] + ".cpp"
+    with open(cpp_path, "w") as fh:
+        fh.write(cpp_text)
+    proc = subprocess.run(
+        [cxx, "-shared", "-fPIC", "-std=gnu++17", "-O2",
+         "-I", include_dir, cpp_path, "-o", so_path],
+        capture_output=True, text=True)
+    if proc.returncode != 0:
+        raise RuntimeError("%s failed:\n%s" % (cxx, proc.stderr))
+    return so_path
+
+
+def _default_include_dir():
+    root = os.environ.get("TORCHSIM_DIR")
+    if not root:
+        root = os.path.dirname(os.path.dirname(os.path.dirname(
+            os.path.dirname(os.path.abspath(__file__)))))
+    return os.path.join(root, "TOGSim", "include")
+
+
+def skeleton_to_so(skeleton_module, so_path, include_dir=None):
+    """skeleton module -> EmitC -> C++ -> compiled trace `.so`. Returns the
+    EmitC module text (for inspection / caching)."""
+    emitc = lower_to_emitc(skeleton_module)
+    cpp = emitc_to_cpp(emitc)
+    compile_so(cpp, so_path, include_dir or _default_include_dir())
+    return str(emitc)
+
+
+def build_trace_so(postvcix_path, so_path, include_dir=None):
+    """Full P2 path from a post-vcix kernel .mlir to a trace `.so`."""
+    from . import build_skeleton as bs
+
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(open(postvcix_path).read(), ctx)
+        bs.build_skeleton(module)
+        return skeleton_to_so(module, so_path, include_dir)
+
+
+def main(argv):
+    import argparse
+
+    parser = argparse.ArgumentParser(prog="lower_to_emitc.py")
+    parser.add_argument("input", help="post-vcix kernel .mlir")
+    parser.add_argument("--so", required=True, help="output .so path")
+    parser.add_argument("--include-dir", default=None,
+                        help="dir holding togsim_runtime.h (default: TOGSim/include)")
+    parser.add_argument("--emit-cpp", default=None,
+                        help="also write the generated C++ here")
+    parser.add_argument("--emit-mlir", default=None,
+                        help="also write the EmitC MLIR here")
+    args = parser.parse_args(argv[1:])
+
+    from . import build_skeleton as bs
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(open(args.input).read(), ctx)
+        bs.build_skeleton(module)
+        emitc = lower_to_emitc(module)
+        if args.emit_mlir:
+            open(args.emit_mlir, "w").write(str(emitc))
+        cpp = emitc_to_cpp(emitc)
+        if args.emit_cpp:
+            open(args.emit_cpp, "w").write(cpp)
+        compile_so(cpp, args.so, args.include_dir or _default_include_dir())
+    import sys
+    sys.stderr.write("wrote %s\n" % args.so)
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main(sys.argv))
diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py
index ac93ebc8..df124d00 100644
--- a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py
+++ b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py
@@ -29,6 +29,8 @@
 
 import mlir.ir as ir  # noqa: E402
 
+from ._mlir_util import walk_ops, i32, i64, attr_bool
+
 MARKERS = ("linalg.matmul", "math.exp", "math.erf", "math.tanh", "math.sin", "math.cos")
 
 # math op name -> (opcode, imm) for the vcix.v.iv lowering (mirror Math*ToVCIX).
@@ -80,20 +82,12 @@ def _legalize_vector_type(vt, vlen):
     return n, ir.VectorType.get([elt_count >> (n - 1)], elt_ty, scalable=[True])
 
 
-def _i64(v):
-    return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), v)
-
-
-def _i32(v):
-    return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), v)
-
-
 def _viv(operand, result_ty, opcode, imm, rvl=None):
     """Create an unregistered vcix.v.iv (vcix::BinaryImmOp) op at the current IP."""
     operands = [operand] if rvl is None else [operand, rvl]
     return ir.Operation.create(
         "vcix.v.iv", results=[result_ty], operands=operands,
-        attributes={"opcode": _i64(opcode), "imm": _i32(imm)}).results[0]
+        attributes={"opcode": i64(opcode), "imm": i32(imm)}).results[0]
 
 
 def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm):
@@ -104,7 +98,7 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm):
     scalable = legal_ty.scalable
     rvl = None
     if scalable:
-        rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), _i64(9)).result
+        rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), i64(9)).result
     if n == 1:
         return _viv(vec, legal_ty, opcode, imm, rvl)
     elt_ty = legal_ty.element_type
@@ -119,24 +113,16 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm):
         for i in range(total // elt_count):
             ext = vector.ExtractStridedSliceOp(
                 legal_ty, vec,
-                ir.ArrayAttr.get([_i64(i * elt_count)]),
-                ir.ArrayAttr.get([_i64(elt_count)]),
-                ir.ArrayAttr.get([_i64(1)])).result
+                ir.ArrayAttr.get([i64(i * elt_count)]),
+                ir.ArrayAttr.get([i64(elt_count)]),
+                ir.ArrayAttr.get([i64(1)])).result
             v = _viv(ext, legal_ty, opcode, imm, rvl)
             res = vector.InsertStridedSliceOp(
-                v, res, ir.ArrayAttr.get([_i64(i * elt_count)]),
-                ir.ArrayAttr.get([_i64(1)])).result
+                v, res, ir.ArrayAttr.get([i64(i * elt_count)]),
+                ir.ArrayAttr.get([i64(1)])).result
     return res
 
 
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
-
-
 # ---------------------------------------------------------------------------
 # matmul lowering helpers (mirror MatmulOpLowering)
 # ---------------------------------------------------------------------------
@@ -146,11 +132,6 @@ def _elt_bits(elt_ty):
     return ir.FloatType(elt_ty).width
 
 
-def _bool_attr_true(op, key):
-    a = op.attributes
-    return key in a and ir.BoolAttr(a[key]).value
-
-
 def _enclosing_loops(op):
     """Walk ancestor ops; return (accumulation, outer, inner) affine.for lists,
     outermost-first (mirror the C++ insert-at-begin)."""
@@ -158,11 +139,11 @@ def _enclosing_loops(op):
     parent = op.operation.parent
     while parent is not None:
         if parent.name == "affine.for":
-            if _bool_attr_true(parent, "accumulation_loop"):
+            if attr_bool(parent, "accumulation_loop"):
                 acc.insert(0, parent)
-            if _bool_attr_true(parent, "outer_loop"):
+            if attr_bool(parent, "outer_loop"):
                 outer.insert(0, parent)
-            if _bool_attr_true(parent, "inner_loop"):
+            if attr_bool(parent, "inner_loop"):
                 inner.insert(0, parent)
         parent = parent.parent
     return acc, outer, inner
@@ -200,7 +181,7 @@ def _scan_conv_offsets(ow_loop, o_h, k_h, o_w, k_w):
     """Mirror the heuristic offset scan: find affine.apply(o_h,k_h)/(o_w,k_w) in the
     o_w loop and read the constant in its map (default 1)."""
     offset_h = offset_w = 1
-    for o in _iter_ops(ow_loop.regions[0].blocks[0]):
+    for o in walk_ops(ow_loop.regions[0].blocks[0]):
         if o.operation.name != "affine.apply":
             continue
         ops = list(o.operation.operands)
@@ -391,7 +372,7 @@ def _root(v):
                 return owner.operands[0]
         return v
     rootA, rootB = _root(A), _root(B)
-    for o in _iter_ops(outer[-1].regions[0].blocks[0]):
+    for o in walk_ops(outer[-1].regions[0].blocks[0]):
         if o.operation.name == "affine.vector_store":
             dest = _root(o.operation.operands[1])
             if dest == rootA:
@@ -488,6 +469,14 @@ def _root(v):
         # --- B dma_wait ---
         nacc = len(acc)
         acc_ivs = [_loop_iv(l) for l in acc]
+        # LEGACY behavior: coefficient -1 on each accumulation (reduction) loop var
+        # is a SENTINEL marking "this tag dim is the reduction axis", not an
+        # arithmetic offset. The legacy TOG path (TileGraphParser.cc) honors it by
+        # routing those vars to a separate accum tag component and skipping stride
+        # -1. The C++ trace path does NOT honor it: build_skeleton._strip_accum_terms
+        # drops these -1 terms so the memory_barrier slot stays subtile-only and
+        # pairs with its async load. Kept here for byte-identity with the C++
+        # -test-pytorchsim-to-vcix pass; remove (do not flag) once legacy retires.
         bexpr = ir.AffineDimExpr.get(0) * -1
         for i in range(1, nacc):
             bexpr = bexpr + ir.AffineDimExpr.get(i) * -1
@@ -544,6 +533,10 @@ def _root(v):
 
     with body_ip:
         # --- A dma_wait ---
+        # LEGACY behavior (see the B dma_wait above): the -1 coefficients mark the
+        # reduction axis for the legacy TOG path; the trace path strips them in
+        # build_skeleton._strip_accum_terms. Kept for byte-identity with the C++
+        # -test-pytorchsim-to-vcix pass; remove once legacy retires.
         aexpr = ir.AffineDimExpr.get(0) * -1
         for i in range(1, nacc):
             aexpr = aexpr + ir.AffineDimExpr.get(i) * -1
@@ -617,7 +610,7 @@ def run(module, vectorlane=128, vlen=128, **_):
     mms = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for o in _iter_ops(b):
+            for o in walk_ops(b):
                 if o.operation.name == "linalg.matmul":
                     mms.append(o.operation)
     for o in mms:
@@ -625,7 +618,7 @@ def run(module, vectorlane=128, vlen=128, **_):
     targets = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for op in _iter_ops(b):
+            for op in walk_ops(b):
                 if op.operation.name in _MATH_VIV:
                     targets.append(op.operation)
     for op in targets:
diff --git a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py
index 76e30cb3..3ed0a394 100644
--- a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py
+++ b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py
@@ -24,13 +24,7 @@
 OP_NAME = "torchsim.vlane_idx"
 MARKERS = (OP_NAME,)
 
-
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
+from ._mlir_util import walk_ops
 
 
 def run(module, **_):
@@ -46,7 +40,7 @@ def run(module, **_):
     targets = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for op in _iter_ops(b):
+            for op in walk_ops(b):
                 if op.operation.name == OP_NAME:
                     targets.append(op.operation)
 
diff --git a/PyTorchSimFrontend/mlir/passes/togsim_ops.py b/PyTorchSimFrontend/mlir/passes/togsim_ops.py
new file mode 100644
index 00000000..21983da0
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/togsim_ops.py
@@ -0,0 +1,102 @@
+"""Shared vocabulary for the skeleton+API MLIR form (C1).
+
+The trace pipeline (docs/design/togsim_cpp_trace.md) reduces a kernel's MLIR to
+a *loop skeleton + API calls*: native `affine.for`/`scf.for` loops (bounds kept
+as-is, symbolic preserved) plus a handful of `togsim.*` ops that stand for the
+runtime API. This module is the single source of truth for those op names and
+attribute keys, shared by:
+
+  * build_skeleton (C2) -- produces the skeleton+API MLIR, and
+  * togsim->emitc lowering (C4) -- rewrites each op to an `emitc.call_opaque`.
+
+The ops are kept *unregistered* (like the existing `togsim.transfer`), so there
+is no C++ dialect to register; C4 is a custom rewrite, not a registered
+ConversionPass.
+
+Grammar (each op lowers 1:1 to a `togsim_runtime.h` free function):
+
+    "togsim.dma"(%dram_idx, %tag_idx) {         -> togsim_dma(ctx, dir, arg_id,
+            dir = 0 | 1,            # LOAD|STORE      offset, ndim, dims, strides,
+            dims = [..], strides = [..],                elem_bits, is_async,
+            elem_bits = i32, is_async = bool,           tag_id, tag_slot,
+            tag_id = i32, arg_id = i32,                 read_bufs, write_bufs)
+            read_bufs = [..], write_bufs = [..]
+         } : (index, index) -> ()
+
+    "togsim.compute"() {                        -> togsim_compute(ctx, tile_id,
+            tile_id = i64, compute_type = i32,          compute_type, ndim, dims,
+            read_bufs = [..], write_bufs = [..]         read_bufs, write_bufs)
+         } : () -> ()
+
+    "togsim.memory_barrier"(%tag_idx) {         -> togsim_memory_barrier(ctx,
+            tag_id = i32, write_bufs = [..]             tag_id, tag_slot, write_bufs)
+         } : (index) -> ()
+
+How an async dma pairs with its sync point: NOT by a compile-time id. One static
+`togsim.dma` op runs once per loop iteration, each with a different RUNTIME tag
+slot `%tag[%idx]`, so the pairing must be a runtime key. `togsim.dma` carries a
+`tag_id` (its tag memref identity) and the runtime `%tag[%idx]` operand; the
+original `memref.dma_wait` becomes an explicit `togsim.memory_barrier` carrying
+the same `tag_id` + tag index. They pair at runtime by `(tag_id, tag_slot)` via
+the Core's tag table (the dma signals the tag at data-arrival; the barrier waits
+it). `tag_id` (which tag memref) is distinct from `tag_slot` (the SRAM tile slot,
+used for the double-buffer / capacity model). A sync (non-async) dma is blocking,
+so it needs no barrier. (Supersedes the earlier static `event_id` + `togsim.wait`
+design, which could not express per-iteration pairing.)
+
+Keep this in lockstep with TOGSim/include/togsim_runtime.h (TOGSIM_ABI_VERSION).
+"""
+
+# ---- op names -------------------------------------------------------------
+DMA    = "togsim.dma"                   # tile DMA; the `dir` attr selects load vs. store
+COMPUTE = "togsim.compute"              # compute op; its `tile_id` keys the cycle table
+MEMORY_BAR = "togsim.memory_barrier"    # explicit async-DMA sync (the original dma_wait); tag-keyed
+
+#: every op this module owns (for matchers / DCE roots in C2).
+OP_NAMES = (DMA, COMPUTE, MEMORY_BAR)
+
+#: op name -> the togsim_runtime.h symbol C4 lowers it to.
+EMITC_CALLEE = {
+    DMA:     "togsim_dma",
+    COMPUTE: "togsim_compute",
+    MEMORY_BAR: "togsim_memory_barrier",
+}
+
+#: producer entry-point symbol the TOGSim loader resolves (see togsim_runtime.h).
+ENTRY_SYMBOL = "togsim_kernel"
+
+#: outlined per-work-item function the dispatcher hands to togsim_dispatch
+#: (uniform signature (ctx, int64* iv, i32 n); see togsim_cpp_trace.md sec 9.3).
+TILE_SYMBOL = "togsim_kernel_tile"
+
+#: runtime callees emitted directly by lower_to_emitc (not skeleton ops), kept in
+#: lockstep with togsim_runtime.h. DISPATCH_CALLEE is the higher-order wrapper the
+#: dispatcher loop calls per work-item (round-robins a core + TILE_BEGIN/END);
+#: TILE_SYMBOL is passed to it as the function pointer.
+DISPATCH_CALLEE = "togsim_dispatch"
+
+# ---- attribute keys -------------------------------------------------------
+ATTR_DIR       = "dir"        # i32: DIR_LOAD | DIR_STORE
+ATTR_DIMS      = "dims"       # i64 array: tile extents
+ATTR_STRIDES   = "strides"    # i64 array: tile strides
+ATTR_ELEM_BITS = "elem_bits"  # i32
+ATTR_IS_ASYNC  = "is_async"   # bool
+ATTR_TILE_ID   = "tile_id"    # i64: key into the precomputed tile_id->cycle table
+ATTR_COMPUTE_TYPE = "compute_type"  # i32: 0 vector / 1 matmul / 2 preload (Core enum)
+ATTR_READ_BUFS  = "read_bufs"   # i64 array: SRAM buffer ids this op reads  (sec 10 dataflow)
+ATTR_WRITE_BUFS = "write_bufs"  # i64 array: SRAM buffer ids this op writes (sec 10 dataflow)
+ATTR_TAG_ID    = "tag_id"     # i32: identity of the DMA's tag memref; pairs an async dma with
+                              #      its memory_barrier by the RUNTIME tag slot (tag_id + tag index)
+ATTR_ARG_ID    = "arg_id"     # i32: which tensor (func arg) this DMA's base is
+
+# Must match togsim_dma_dir in togsim_runtime.h.
+DIR_LOAD  = 0   # `dir` attr value for a load
+DIR_STORE = 1   # `dir` attr value for a store
+
+
+def is_togsim_op(op):
+    """True if `op` (an Operation or a wrapping view) is one of ours."""
+    name = getattr(op, "name", None)
+    if name is None:
+        name = getattr(getattr(op, "operation", None), "name", None)
+    return name in OP_NAMES
diff --git a/README.md b/README.md
index f0bdc772..c2298376 100644
--- a/README.md
+++ b/README.md
@@ -385,6 +385,7 @@ num_cores: 1
 core_freq_mhz: 940
 core_stats_print_period_cycles: 10000
 num_systolic_array_per_core: 2
+sa_weight_buffer_depth: 2   # per-SA resident weight slots; must be > 0 (default 2). Raise to loosen the preload throttle.
 # Optional: one entry per core, default ws_mesh
 # core_type: [ws_mesh, ws_mesh]
 # Optional STONNE cores: stonne_config_path, num_stonne_per_core, num_stonne_port
@@ -453,7 +454,7 @@ codegen_compiler_optimization: all    # all | none | list of option names
 
 One-line meaning for each group (details in the YAML block above).
 
-- **Core (`num_cores`, `core_freq_mhz`, `core_stats_print_period_cycles`, `num_systolic_array_per_core`, optional `core_type`, STONNE keys)**: how many cores, their clock, stats cadence, systolic count per core, and optional non-default mesh vs STONNE mix.
+- **Core (`num_cores`, `core_freq_mhz`, `core_stats_print_period_cycles`, `num_systolic_array_per_core`, `sa_weight_buffer_depth`, optional `core_type`, STONNE keys)**: how many cores, their clock, stats cadence, systolic count per core, the per-SA resident weight-slot count (must be > 0; bounds preload run-ahead—raise it to loosen the throttle), and optional non-default mesh vs STONNE mix.
 - **VPU (`vpu_*`)**: vector lane count, per-lane scratchpad (KB), and vector register width—**compiler** uses these for tiling/codegen.
 - **DRAM (`dram_type`, `dram_channels`, …)**: `ramulator2` uses `ramulator_config_path`; `simple` uses fixed latency and optional bandwidth caps (`dram_bandwidth_gbps_*`, `dram_freq_mhz` when capped). `dram_num_partitions` splits channels for NUMA-style addressing.
 - **Interconnect (`icnt_*`, `booksim_config_path`)**: `simple` adds fixed hop latency (`icnt_latency_cycles`); `booksim2` points at a BookSim2 topology file.
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 2b9f05be..a4517285 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -560,7 +560,23 @@ def run_standalone(
             os.fsync(trace_file.fileno())
 
         try:
-            cmd = f"{TOGSimulator.get_togsim_command(config_path, togsim_path)} --models_list {trace_file_path}"
+            # The C++ TOG (trace) path is the DEFAULT: drive the simulation from the
+            # emitted trace.so. The legacy ONNX TOG is the opt-in fallback via
+            # TORCHSIM_LEGACY_TOG=1. Each autotune candidate compiles to its own
+            # write_path (keyed by its retiled source), so its trace.so/cycle_table sit
+            # next to its tile_graph.onnx -- benchmark it through the trace path too.
+            # Fall back to legacy only if the .so was not emitted.
+            trace_so = os.path.join(os.path.dirname(str(model_path)), "trace.so")
+            cycle_tsv = os.path.join(os.path.dirname(str(model_path)), "trace_cycles.tsv")
+            base_cmd = TOGSimulator.get_togsim_command(config_path, togsim_path)
+            use_trace = (os.environ.get("TORCHSIM_LEGACY_TOG") != "1"
+                         and os.path.exists(trace_so))
+            if os.environ.get("TORCHSIM_LEGACY_TOG") == "1":
+                logger.warning("TORCHSIM_LEGACY_TOG=1 selects the DEPRECATED legacy ONNX TOG path")
+            if use_trace:
+                cmd = f"{base_cmd} --trace_so {trace_so} --cycle_table {cycle_tsv}"
+            else:  # DEPRECATED: legacy ONNX TOG path
+                cmd = f"{base_cmd} --models_list {trace_file_path}"
             if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
                 cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
 
diff --git a/TOGSim/include/Core.h b/TOGSim/include/Core.h
index 286feb5f..75ad9cf4 100644
--- a/TOGSim/include/Core.h
+++ b/TOGSim/include/Core.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <robin_hood.h>
 #include <unordered_set>
+#include <map>
 #include <memory>
 #include <vector>
 #include <fmt/core.h>
@@ -19,11 +20,22 @@ enum class InstFinishTraceTag {
   DmaRespComplete,
 };
 
+// A timed effect due at a cycle: free a weight slot, or wake a MEMORY_BAR.
+struct DueAction {
+  enum Kind { FreeWeightSlot, WakeBar } kind;
+  std::shared_ptr<WeightToken> token;
+  std::shared_ptr<Instruction> bar;
+};
+
 class Core {
  public:
   Core(uint32_t id, SimulationConfig config);
   ~Core()=default;
   virtual bool running();
+  // True if this core has work actively in flight (DMA / compute pipeline / queues)
+  // that will produce a future finish event -- i.e. running() minus "tiles waiting".
+  // Used by the frozen-state (spad-too-small) guard.
+  bool has_inflight();
   virtual bool can_issue(const std::shared_ptr<Tile>& op);
   virtual void issue(std::shared_ptr<Tile> tile);
   virtual std::shared_ptr<Tile> pop_finished_tile();
@@ -55,6 +67,17 @@ class Core {
   void sa_cycle();
   bool can_issue_compute(std::shared_ptr<Instruction>& inst);
   void update_stats();
+  // SRAM-capacity throttle (sec 10.x): a consumer frees the buffer-versions it
+  // read (refcount -> 0 releases the spad bytes). Called when COMP/MOVOUT issue.
+  void release_sram(const std::shared_ptr<Instruction>& inst);
+  // Occupy inst's buffer-version footprint on issue; false if it would overflow
+  // the spad this cycle (the caller stalls it). True for untracked insts.
+  bool try_occupy_sram(const std::shared_ptr<Instruction>& inst);
+  // SA weight-buffer throttle (sec 10.x): pick a systolic array that has a free
+  // weight slot (round-robin among free); -1 if all full -> the preload stalls.
+  int pick_free_weight_sa();
+  void process_due_events();   // drain _due_events due this cycle
+  void apply_due(const DueAction& a);
 
   /* Core id & config file */
   const uint32_t _id;
@@ -103,4 +126,18 @@ class Core {
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
   uint32_t _waiting_write_reqs;
+
+  // SRAM-capacity throttle (sec 10.x). _sram_used = current per-core spad bytes;
+  // _sram_capacity = limit (0 = disabled); _sram_allocs maps a buffer-version id
+  // to its accumulated footprint bytes (freed when its last reader issues).
+  size_t _sram_used = 0;
+  size_t _sram_capacity = 0;
+  std::unordered_map<int64_t, size_t> _sram_allocs;
+
+  // SA weight-buffer throttle (sec 10.x). _weight_slots_used[s] = weights resident
+  // on SA s (loaded by a preload, not yet freed by their last matmul);
+  // _weight_slot_depth = per-SA weight-slot capacity (must be > 0).
+  std::vector<int> _weight_slots_used;
+  uint32_t _weight_slot_depth = 0;
+  std::multimap<cycle_type, DueAction> _due_events;
 };
\ No newline at end of file
diff --git a/TOGSim/include/Instruction.h b/TOGSim/include/Instruction.h
index bb62a440..24659791 100644
--- a/TOGSim/include/Instruction.h
+++ b/TOGSim/include/Instruction.h
@@ -6,13 +6,25 @@
 #include <list>
 #include <numeric>
 
+#include <array>
 #include <set>
 #include <cassert>
 #include <cstdint>
 #include <memory>
 #include <vector>
 
-enum class Opcode { MOVIN, MOVOUT, COMP, BAR, COUNT};
+// MEMORY_BAR: the DMA/memory barrier (waits a DMA tag in the tag table).
+enum class Opcode { MOVIN, MOVOUT, COMP, MEMORY_BAR, COUNT};
+
+// A dependency edge releases its consumer on one of the producer's lifecycle
+// events: ISSUE (occupancy -- the consumer overlaps the producer on the SA
+// pipeline) or DONE (latency -- the consumer needs the producer's result).
+enum class DepEvent : uint8_t { ISSUE = 0, DONE = 1, COUNT = 2 };
+
+// One weight slot on systolic array `sa` (sec 10.x). A preload sets refcount =
+// the matmuls reusing the weight; each frees it at its streaming-end, the last
+// one releases the slot. Shared (shared_ptr) by the preload's matmul consumers.
+struct WeightToken { int sa; int refcount; };
 
 typedef uint64_t addr_type;
 typedef uint64_t cycle_type;
@@ -28,7 +40,28 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
               std::vector<int64_t> accum_tag_idx_list);
   Instruction(Opcode opcode);
   void finish_instruction();
-  void add_child(std::shared_ptr<Instruction> child);
+  // Subscribe `c` to this op's `on` event (ISSUE=occupancy, DONE=latency). The set
+  // dedups, so ready_counter is bumped only on a new edge (a producer writing
+  // several buffers one consumer reads links the pair once per buffer).
+  void add_dep(std::shared_ptr<Instruction> c, DepEvent on) {
+    if (_deps[static_cast<size_t>(on)].insert(c).second) c->inc_ready_counter();
+  }
+  // Release every subscriber of `e` (decrement its ready_counter) and clear.
+  void fire(DepEvent e) {
+    for (auto& c : _deps[static_cast<size_t>(e)]) c->dec_ready_counter();
+    _deps[static_cast<size_t>(e)].clear();
+  }
+  const std::set<std::shared_ptr<Instruction>>& get_deps(DepEvent e) {
+    return _deps[static_cast<size_t>(e)];
+  }
+  void set_assigned_sa(int s) { _assigned_sa = s; }
+  int get_assigned_sa() const { return _assigned_sa; }
+  void set_weight_token(const std::shared_ptr<WeightToken>& t) { _weight_token = t; }
+  const std::shared_ptr<WeightToken>& get_weight_token() const { return _weight_token; }
+  // Trace-only: which work-item (togsim_dispatch tile) this op belongs to, for
+  // grouping/coloring in the timeline. Set by the bridge per TILE_BEGIN.
+  void set_tile_group(int g) { _tile_group = g; }
+  int get_tile_group() const { return _tile_group; }
   bool check_ready() { return ready_counter == 0; }
   const Opcode get_opcode() { return opcode; }
   bool is_dma_read() { return opcode == Opcode::MOVIN; }
@@ -51,6 +84,9 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   void inc_waiting_request();
   void dec_waiting_request();
   size_t get_waiting_request() { return _nr_waiting_request; }
+  // trace: log only the FIRST DRAM response of a load (when data starts arriving).
+  bool got_first_response() const { return _got_first_response; }
+  void mark_first_response() { _got_first_response = true; }
   std::vector<size_t>& get_tile_size() { return tile_size; }
   std::vector<int>& get_tile_stride() { return tile_stride; }
   void set_overlapping_cycle(cycle_type cycle) { overlapping_cycle = cycle; }
@@ -83,15 +119,33 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   void prepare_tag_key();
   bool is_sparse_inst() { return _is_sparse_inst; }
   void set_sparse_state(bool state) { _is_sparse_inst = state; }
-  std::set<std::shared_ptr<Instruction>>& get_child_inst() { return child_inst; }
   uint64_t get_global_inst_id() const { return _global_inst_id; }
 
-  cycle_type start_cycle;
-  cycle_type finish_cycle;
+  // SRAM-capacity model (sec 10.x). A load contributes its footprint to a
+  // buffer-version allocation; the version is freed when its LAST consumer (the
+  // program-order-last reader, tagged by the bridge) issues. The bridge fills
+  // these; Core enforces them.
+  //   _sram_alloc_id      : which buffer-version this load fills (-1 = untracked)
+  //   _sram_release_allocs: versions this consumer frees on issue (tagged only on
+  //                         each version's last reader)
+  void set_sram_alloc(int64_t id) { _sram_alloc_id = id; }
+  int64_t get_sram_alloc() const { return _sram_alloc_id; }
+  void add_sram_release(int64_t id) { _sram_release_allocs.push_back(id); }
+  const std::vector<int64_t>& get_sram_release() const { return _sram_release_allocs; }
+  // bytes this instruction's buffer occupies in the spad. A DMA derives it from
+  // the tile it moves; a compute output gets it set explicitly by the bridge (the
+  // buffer's size is known from the DMA records that touch the same buffer).
+  void set_sram_footprint(size_t b) { _sram_footprint_override = b; }
+  size_t sram_footprint() const {
+    return _sram_footprint_override ? _sram_footprint_override
+                                    : _tile_numel * (_elem_bits / 8);
+  }
+
+  cycle_type finish_cycle = 0;
   cycle_type bubble_cycle=0;
 
   bool finished=false;
-  int subgraph_id;
+  int subgraph_id = 0;
  private:
   uint64_t _global_inst_id = 0;
   static uint64_t _next_global_inst_id;
@@ -99,16 +153,22 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   void *_owner = nullptr;
   std::list<std::shared_ptr<Instruction>>* _owner_ready_queue_ref = nullptr;
   Opcode opcode;
-  cycle_type compute_cycle;
-  cycle_type overlapping_cycle;
-  size_t ready_counter;
-  std::set<std::shared_ptr<Instruction>> child_inst;
+  cycle_type compute_cycle = 0;
+  cycle_type overlapping_cycle = 0;
+  size_t ready_counter = 0;   // parents not yet finished; the minimal Instruction(Opcode)
+                              // ctor (barriers) relies on this default + inc_ready_counter
+  // Per-event subscriber sets: _deps[ISSUE] released at issue (occupancy),
+  // _deps[DONE] released at finish (latency). std::set dedups + keeps a stable
+  // iteration order (byte-identical release order).
+  std::array<std::set<std::shared_ptr<Instruction>>,
+             static_cast<size_t>(DepEvent::COUNT)> _deps;
   std::vector<size_t> tile_size;
   std::vector<int> tile_stride;
-  size_t _tile_numel;
+  size_t _tile_numel = 0;
   size_t _nr_waiting_request=0;
+  bool _got_first_response=false;
   size_t _elem_bits = 0;
-  addr_type dram_addr;
+  addr_type dram_addr = 0;
   uint32_t _numa_id = 0; // For DMA instruction
   int _compute_type = 0;
   std::vector<int64_t> _tag_idx_list;
@@ -123,4 +183,12 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   bool _is_indirect_mode=false;
   bool _is_sparse_inst=false;
   std::string _indirect_index_path="";
+  // SRAM-capacity model (see the setters above).
+  int64_t _sram_alloc_id = -1;
+  std::vector<int64_t> _sram_release_allocs;
+  size_t _sram_footprint_override = 0;
+  // SA weight-buffer model (see the setters above).
+  int _assigned_sa = -1;
+  std::shared_ptr<WeightToken> _weight_token;
+  int _tile_group = -1;   // trace-only work-item id (see set_tile_group)
 };
\ No newline at end of file
diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h
index 2ef08618..7785ff7a 100644
--- a/TOGSim/include/SimulationConfig.h
+++ b/TOGSim/include/SimulationConfig.h
@@ -27,6 +27,16 @@ struct SimulationConfig {
   uint32_t num_systolic_array_per_core = 1;
   uint32_t num_stonne_per_core = 1;
   uint32_t num_stonne_port = 1;
+  // Per-core VMEM/spad capacity (KB) for the trace-path DMA throttle (sec 10.x):
+  // a load that would overflow the spad does not issue until a consumer frees a
+  // tile. Provided by the config (the TPU configs set 16384 = 16 MB VMEM). 0 =
+  // unset -> gate disabled (unlimited). Only affects trace-path instructions
+  // (legacy TileGraphParser insts have alloc id -1 -> never gated).
+  uint32_t core_spad_size_kb = 0;
+  // SA weight-buffer depth (sec 10.x): weight tiles a systolic array holds; a
+  // preload stalls until a slot frees (its matmuls finished). 2 = weight
+  // double-buffer (convention default, tunable). Must be > 0: the simulator errors on 0.
+  uint32_t sa_weight_buffer_depth = 2;
 
   /* DRAM config */
   DramType dram_type;
diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h
index e3542d51..91baf5b5 100644
--- a/TOGSim/include/Simulator.h
+++ b/TOGSim/include/Simulator.h
@@ -48,6 +48,9 @@ class Simulator {
   void dram_cycle();
   void icnt_cycle();
   bool running();
+  // Spad-too-small guard: if the sim stays frozen (running() but nothing in
+  // flight) past kWedgeThreshold cycles, error out and exit. Called each cycle.
+  void check_frozen();
   void set_cycle_mask();
   uint32_t get_dest_node(mem_fetch *access);
   SimulationConfig _config;
diff --git a/TOGSim/include/TraceLogTags.h b/TOGSim/include/TraceLogTags.h
index 6c158099..759a4fdb 100644
--- a/TOGSim/include/TraceLogTags.h
+++ b/TOGSim/include/TraceLogTags.h
@@ -24,6 +24,7 @@ inline constexpr const char* kInstructionFinished = "INST_FINISHED";
 inline constexpr const char* kInstructionSkipped = "INST_SKIP";
 
 inline constexpr const char* kAsyncDmaAllRequestsIssued = "ASYNC_DMA_ISSUE";
+inline constexpr const char* kFirstDramResponse = "DRAM_RESP_FIRST";
 inline constexpr const char* kAllDramResponsesReceived = "DRAM_RESP_DONE";
 
 inline constexpr const char* kL2CacheableStatusForAddress = "L2CACHE_STAT";
diff --git a/TOGSim/include/togsim_loader.h b/TOGSim/include/togsim_loader.h
new file mode 100644
index 00000000..17e10b34
--- /dev/null
+++ b/TOGSim/include/togsim_loader.h
@@ -0,0 +1,79 @@
+#pragma once
+// togsim_loader.h
+// -----------------------------------------------------------------------------
+// TOGSim-side loader for the compiled trace producer (C6, P3 task 5). NOT part
+// of the producer ABI (togsim_runtime.h) -- this is the TOGSim half that
+// `dlopen`s a producer `.so`, runs its `togsim_kernel`, and records the emitted
+// instruction stream. See docs/design/togsim_cpp_trace.md sec 5.3 / 9.7.
+//
+// This first cut is the "materializing sink": the callbacks resolve each tile's
+// DRAM address (base[arg_id] + offset*elem_bytes) and per-tile compute cost
+// (the cycle table), mint event handles, and append a TraceRec per modeled
+// instruction. Feeding the recorded stream into the existing timing core
+// (Core/Simulator) for cycle-equivalence vs the build_tog path is the remaining
+// task-5 step.
+// -----------------------------------------------------------------------------
+
+#include <cstdint>
+#include <vector>
+
+#include "togsim_runtime.h"
+
+namespace togsim {
+// One modeled instruction recorded by the runtime callbacks. Scalars default to
+// 0, so the fields a record's kind never sets read as 0 instead of garbage.
+struct TraceRec {
+  enum Kind { TILE_BEGIN, TILE_END, DMA, COMPUTE, MEMORY_BAR } kind = TILE_BEGIN;
+  int32_t  core = 0;      // work-item -> core binding (set by togsim_dispatch)
+  // DMA / MEMORY_BAR
+  int32_t  dir = 0;       // togsim_dma_dir
+  int32_t  arg_id = 0;    // tensor
+  int32_t  elem_bits = 0;
+  int32_t  is_async = 0;
+  uint64_t addr = 0;      // resolved DRAM byte address = base[arg_id] + off*bytes
+  int32_t  tag_id = 0;    // DMA/MEMORY_BAR: tag memref identity; with tag_slot the
+                          // runtime pairing key (an async dma <-> its memory_barrier)
+  uint64_t tag_slot = 0;  // SRAM tile slot (double-buffer / capacity model)
+  std::vector<int64_t> dims;     // tile extents (DMA)
+  std::vector<int64_t> strides;  // tile strides (DMA)
+  std::vector<int64_t> read_bufs;   // SRAM buffer ids read  (sec 10 dataflow DAG)
+  std::vector<int64_t> write_bufs;  // SRAM buffer ids written (MEMORY_BAR: released bufs)
+  // COMPUTE
+  uint64_t tile_id = 0;
+  int32_t  compute_type = 0;  // 0 vector / 1 matmul / 2 preload (Core unit enum)
+  int64_t  cycle = 0;         // looked up from the cycle table
+  int64_t  overlapping = 0;   // looked up from the cycle table
+};
+
+struct RunResult {
+  bool ok = false;
+  std::vector<TraceRec> trace;
+};
+
+// Load `so_path`, run its `togsim_kernel(shape_args, n_shape)` against a freshly
+// built EmitCtx, and return the recorded trace.
+//   tensor_base[arg_id] : DRAM base address of each kernel tensor argument
+//   cyc[tile_id] / ovl[tile_id] : the cycle table (cycle, overlapping_cycle)
+//   partition_cores : the core ids of the partition this kernel is enqueued to;
+//                     dispatch round-robins work-items only over THESE cores (a
+//                     kernel stays within its partition -- other partitions are
+//                     independent). Empty/null -> core 0.
+RunResult run_producer(const char* so_path,
+                       const int64_t* shape_args, int32_t n_shape,
+                       const uint64_t* tensor_base, int32_t n_tensors,
+                       const int64_t* cyc, const int64_t* ovl, int32_t n_tiles,
+                       const int32_t* partition_cores, int32_t n_partition_cores);
+
+// First-order reference timing over a recorded trace, to validate that the
+// stream carries enough to be scheduled (it is NOT the production Core -- no
+// DRAM/NoC/L2 contention; the real cycle-equivalence path feeds Tile/TileGraph
+// into Core). Models, per core: a DMA-engine timeline (DMAs serialize, overlap
+// compute), a compute timeline (serial = reduction accumulate, with the
+// finish = prev.finish + cycle - overlapped pipeline overlap of Core.cc), and
+// data dependencies (a compute waits on the async dmas that its preceding
+// MEMORY_BAR records pair with by (tag_id, tag_slot); there is no togsim_wait()).
+struct TimingParams { uint64_t dma_latency = 100; };
+struct SimResult { uint64_t total_cycle = 0; int n_compute = 0, n_dma = 0; };
+SimResult simulate(const RunResult& run, const TimingParams& params);
+
+}  // namespace togsim
diff --git a/TOGSim/include/togsim_runtime.h b/TOGSim/include/togsim_runtime.h
new file mode 100644
index 00000000..d87c61d5
--- /dev/null
+++ b/TOGSim/include/togsim_runtime.h
@@ -0,0 +1,172 @@
+#pragma once
+// togsim_runtime.h
+// -----------------------------------------------------------------------------
+// Shared C ABI between a compiled, shape-parametric trace producer (`.so`,
+// generated MLIR -> EmitC -> C++) and TOGSim. See docs/design/togsim_cpp_trace.md.
+//
+// The producer keeps loops as native loops (symbolic bounds become function
+// parameters) and calls the functions below; each call emits one trace record =
+// one modeled instruction. TOGSim `dlopen`s the producer, constructs an
+// `EmitCtx`, calls the entry point, records the emitted stream, and feeds it to
+// the existing timing core. The producer carries NO timing model and NO
+// functional compute -- it is a deterministic trace generator only.
+//
+// ABI shape rationale: `mlir-translate --mlir-to-cpp` lowers our `togsim.*` ops
+// (via `emitc.call_opaque`) to *free function* calls, so the contract is a set
+// of `extern "C"` free functions taking an opaque `EmitCtx*` as the first
+// argument. Implementations live in TOGSim and may dispatch internally; the
+// `EmitCtx` is opaque to the producer. `togsim_abi_version()` guards against a
+// producer `.so` built against a stale header.
+//
+// STATUS: ABI v12 (P3). The signatures below match what the C4
+// togsim->emitc lowering (PyTorchSimFrontend/mlir/passes/lower_to_emitc.py)
+// emits as `emitc.call_opaque` targets and what `mlir-translate --mlir-to-cpp`
+// renders. Synchronization is keyed at runtime: an async togsim_dma is paired
+// with its togsim_memory_barrier by (tag_id, tag_slot); the P2 static event-id
+// scheme was dropped in v11. The producer passes (arg_id, element offset) and
+// the runtime adds the tensor base address (v4); no stub base is passed.
+// -----------------------------------------------------------------------------
+
+#include <cstdint>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bump whenever the signatures below change incompatibly. TOGSim refuses to load
+// a producer whose embedded version (a `togsim_producer_abi_version` symbol, or
+// a value passed at the entry point) does not match.
+//   v1 -> v2 (P2): dma takes an event_id and returns void (was: returns a
+//                  handle); togsim_kernel shape_args is non-const to match the
+//                  emitc/mlir-to-cpp output.
+//   v2 -> v3 (P3): add togsim_dispatch (work-item boundary + core binding) and
+//                  togsim_wait_all (join / barrier).
+//   v3 -> v4 (P3): togsim_dma takes (arg_id, element offset) instead of a
+//                  precomputed base_addr; the producer lowers the address
+//                  arithmetic and the runtime adds the tensor base.
+//   v4 -> v5 (P3): event handles. togsim_dma RETURNS a fresh handle (drops the
+//                  event_id arg); the producer parks it in a heap event buffer
+//                  (togsim_event_alloc/free) and togsim_wait takes the handle.
+//   v5 -> v6 (P3): replace togsim_dispatch with togsim_core_alloc (returns a
+//                  core id; no free) -- the runtime owns the core pool, num_cores
+//                  is never baked into the producer.
+//   v6 -> v7 (P3): togsim_dma takes a tag_slot (SRAM tile slot) for the runtime's
+//                  double-buffer / SRAM-capacity model.
+//   v7 -> v8 (P3): togsim_compute takes a compute_type (vector/matmul/preload) so
+//                  the Core routes it to the right compute unit.
+//   v8 -> v9 (P3 sec10): togsim_dma/compute take read_bufs/write_bufs (SRAM buffer
+//                  ids); the loader builds an explicit dependency DAG by
+//                  last-writer per buffer (replaces in-order/tag dependencies).
+//   v9 -> v10 (P3 sec10.7): add togsim_compute_barrier (the explicit compute fence
+//                  before a store; loader -> COMPUTE_BAR instruction).
+//   v10 -> v11 (P3 sec10): replace the static event-id pairing with the RUNTIME
+//                  tag slot. togsim_dma takes a tag_id (its tag memref identity)
+//                  and returns void; the original dma_wait becomes an explicit
+//                  togsim_memory_barrier(tag_id, tag_slot, write_bufs) that pairs
+//                  with its async dma by the runtime (tag_id, tag_slot) -- one
+//                  static dma op runs once per loop iteration with a different
+//                  %tag[%idx], so only a runtime key can pair them. Drops
+//                  togsim_wait/signal/wait_all/event_alloc/event_free + the
+//                  togsim_event handle (no compile-time pairing token).
+//   v11 -> v12 (P3 sec9.3): replace the bare togsim_core_alloc marker with a
+//                  higher-order togsim_dispatch(ctx, tile_fn, iv, n_iv) wrapper.
+//                  The producer outlines each parallel work-item into a uniform
+//                  togsim_kernel_tile(ctx, iv, n) and the dispatcher loop hands it
+//                  to togsim_dispatch, which round-robins a core and brackets the
+//                  call with TILE_BEGIN/TILE_END. The work-item scope is now the
+//                  function call itself (no implicit "until the next core_alloc"
+//                  range); one general dispatcher serves every kernel (uniform
+//                  iv-array ABI). Core alloc + the begin/end boundary are
+//                  runtime-owned.
+#define TOGSIM_ABI_VERSION 12
+int32_t togsim_abi_version(void);
+
+// Opaque per-invocation context owned by TOGSim. Holds the record sink and the
+// tile_id->cycle lookup. Never dereferenced by the producer.
+typedef struct EmitCtx EmitCtx;
+
+// Direction for togsim_dma.
+typedef enum {
+  TOGSIM_DMA_LOAD  = 0,  // DRAM -> SRAM (MOVIN)
+  TOGSIM_DMA_STORE = 1,  // SRAM -> DRAM (MOVOUT)
+} togsim_dma_dir;
+
+// Emit a DMA.
+//   dir       : load/store
+//   arg_id    : which tensor (kernel func arg) this tile lives in
+//   offset    : ELEMENT offset of this tile within that tensor, computed by the
+//               producer from the loop indices (the affine address arithmetic is
+//               lowered into the producer -- P3 approach A). The runtime forms
+//               the DRAM address as base[arg_id] + offset*elem_bytes (only the
+//               runtime knows the tensors' allocation base addresses).
+//   ndim      : rank of the tile
+//   dims      : ndim tile extents
+//   strides   : ndim tile strides (may be null => contiguous)
+//   elem_bits : element width in bits
+//   is_async  : non-zero => issue-complete is the finish; the consumer must be
+//               gated by an explicit togsim_memory_barrier (data arrives later).
+//               Zero => blocking: the dma finishes at data-arrival.
+//   tag_id    : identity of this dma's tag memref. With tag_slot it forms the
+//               RUNTIME pairing key (tag_id, tag_slot) the matching
+//               togsim_memory_barrier waits on -- not a compile-time id, since
+//               one static dma op runs once per loop iteration.
+//   tag_slot  : the SRAM tile slot this tile occupies (the producer's lowered
+//               tag index, evaluated at runtime). Also the double-buffer /
+//               SRAM-capacity slot. Single-buffer kernels pass 0.
+//   read_bufs/n_read, write_bufs/n_write : SRAM buffer ids this op reads/writes
+//   (sec 10 dataflow). The loader builds the dependency DAG by last-writer per
+//   buffer.
+void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id,
+                uint64_t offset, int32_t ndim, const int64_t* dims,
+                const int64_t* strides, int32_t elem_bits,
+                int32_t is_async, int32_t tag_id, uint64_t tag_slot,
+                const int64_t* read_bufs, int32_t n_read,
+                const int64_t* write_bufs, int32_t n_write);
+
+// Emit a fixed-size tile compute. Cost is looked up from the precomputed
+// tile_id->cycle table (annotation pass / sample-mode); `dims` are passed for
+// logging and future remainder-tile handling, not to compute cost here.
+//   compute_type : 0 vector / 1 matmul / 2 preload (maps to the Core unit enum;
+//                  routes the op to the VPU vs the systolic array).
+void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type,
+                    int32_t ndim, const int64_t* dims,
+                    const int64_t* read_bufs, int32_t n_read,
+                    const int64_t* write_bufs, int32_t n_write);
+
+// Explicit async-DMA sync -- the original memref.dma_wait. Pairs with its async
+// togsim_dma by the RUNTIME tag slot (tag_id, tag_slot) and gates consumers on
+// data-arrival (resp-complete), since an async dma's own finish is only
+// issue-complete. `write_bufs` is the SRAM buffer(s) that dma loaded; the loader
+// makes the barrier the last writer of them so consumers depend on it. Sync DMAs
+// need no barrier (they block to data-arrival themselves).
+void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot,
+                           const int64_t* write_bufs, int32_t n_write);
+
+// A parallel work-item body, outlined by the producer (sec 9.3). Uniform across
+// kernels: it takes the EmitCtx, the packed parallel loop indices `iv` (iv[0..
+// n_iv) -- e.g. the (m,n) output-tile indices) and their count. The body emits
+// the work-item's ops (init / reduction / store). One signature => one general
+// dispatcher serves every kernel.
+// (iv is non-const to match the `int64_t*` the EmitC producer emits; the runtime
+// only reads it.)
+typedef void (*togsim_tile_fn)(EmitCtx* ctx, int64_t* iv, int32_t n_iv);
+
+// Dispatch one work-item (sec 9.3). The runtime round-robins a core from the
+// pool, brackets the call with TILE_BEGIN/TILE_END (the work-item boundary), and
+// invokes `fn(ctx, iv, n_iv)` -- so the work-item SCOPE is exactly the function
+// call, not an implicit "ops until the next alloc" range. Core alloc + boundary
+// are runtime-owned; the producer is core-count transparent (never names
+// num_cores or a physical core). Independent work-items land on different cores
+// -> multi-core. A general (kernel-independent) wrapper: it only forwards the
+// opaque iv array to fn.
+void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn,
+                     int64_t* iv, int32_t n_iv);
+
+// Entry point the loader resolves in the producer `.so`. `shape_args` carries
+// the runtime values for the kernel's symbolic dimensions (in a kernel-specific
+// order recorded alongside the cached `.so`); `n_shape_args` is their count.
+void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n_shape_args);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/TOGSim/include/togsim_trace_bridge.h b/TOGSim/include/togsim_trace_bridge.h
new file mode 100644
index 00000000..f0213ef5
--- /dev/null
+++ b/TOGSim/include/togsim_trace_bridge.h
@@ -0,0 +1,18 @@
+#pragma once
+// togsim_trace_bridge.h
+// -----------------------------------------------------------------------------
+// Bridge from the recorded trace (togsim_loader.h RunResult) to a TileGraph the
+// existing Simulator/Core can run, for production cycle-equivalence (P3 task 5;
+// see togsim_cpp_trace.md sec 9.9). First cut: one Tile per work-item (the span
+// togsim_dispatch brackets with TILE_BEGIN/TILE_END), bound to that work-item's
+// core; the DMA/compute records become MOVIN/MOVOUT/COMP Instructions with RAW
+// dependency edges (last writer per SRAM buffer; see togsim_runtime.h, ABI v9).
+// -----------------------------------------------------------------------------
+#include <memory>
+
+#include "TileGraph.h"
+#include "togsim_loader.h"
+
+// Build a TileGraph from a recorded trace. `name` labels the graph.
+std::unique_ptr<TileGraph> trace_to_tilegraph(const togsim::RunResult& run,
+                                              const std::string& name);
diff --git a/TOGSim/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt
index 65cd4dd4..d782d4d1 100644
--- a/TOGSim/src/CMakeLists.txt
+++ b/TOGSim/src/CMakeLists.txt
@@ -12,3 +12,8 @@ file(GLOB_RECURSE SRC_FILES
 
 # build
 add_executable(${LIB_NAME} ${SRC_FILES})
+
+# Export the executable's dynamic symbols (-rdynamic) so a dlopen'd trace
+# producer .so resolves the togsim_* runtime callbacks back into this binary
+# (P3 trace pipeline).
+set_target_properties(${LIB_NAME} PROPERTIES ENABLE_EXPORTS ON)
diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index 3f84d885..6f9a74d7 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -64,6 +64,10 @@ SimulationConfig initialize_config(const YAML::Node& config,
   parsed_config.core_freq_mhz = get_config_value<uint32_t>(config, "core_freq_mhz");
   if (config["num_systolic_array_per_core"])
     parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"].as<uint32_t>();
+  if (config["core_spad_size_kb"])
+    parsed_config.core_spad_size_kb = config["core_spad_size_kb"].as<uint32_t>();
+  if (config["sa_weight_buffer_depth"])
+    parsed_config.sa_weight_buffer_depth = config["sa_weight_buffer_depth"].as<uint32_t>();
   if (config["num_stonne_per_core"])
     parsed_config.num_stonne_per_core = config["num_stonne_per_core"].as<uint32_t>();
   if (config["num_stonne_port"])
diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc
index 9dad8597..915988ce 100644
--- a/TOGSim/src/Core.cc
+++ b/TOGSim/src/Core.cc
@@ -17,11 +17,75 @@ Core::Core(uint32_t id, SimulationConfig config)
   _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
   _stat_inst_count.resize(static_cast<size_t>(Opcode::COUNT), 0);
   _stat_tot_skipped_inst.resize(static_cast<size_t>(Opcode::COUNT), 0);
+  _sram_capacity = (size_t)config.core_spad_size_kb * 1024;  // 0 = throttle disabled
+  _weight_slot_depth = config.sa_weight_buffer_depth;        // per-SA weight slots (>0)
+  if (_weight_slot_depth == 0) {
+    spdlog::error("sa_weight_buffer_depth must be > 0 (raise it to loosen the preload throttle)");
+    exit(EXIT_FAILURE);
+  }
+  _weight_slots_used.resize(_num_systolic_array_per_core, 0);
+}
+
+// Round-robin a systolic array that still has a free weight slot; -1 if all full
+// (the preload must stall). Advances _systolic_array_rr past the chosen SA.
+int Core::pick_free_weight_sa() {
+  for (uint32_t i = 0; i < _num_systolic_array_per_core; i++) {
+    uint32_t s = (_systolic_array_rr + i) % _num_systolic_array_per_core;
+    if (_weight_slots_used[s] < (int)_weight_slot_depth) {
+      _systolic_array_rr = (s + 1) % _num_systolic_array_per_core;
+      return (int)s;
+    }
+  }
+  return -1;
+}
+
+void Core::apply_due(const DueAction& a) {
+  switch (a.kind) {
+    case DueAction::FreeWeightSlot:
+      if (--a.token->refcount <= 0) _weight_slots_used[a.token->sa]--;  // last reader frees the slot
+      break;
+    case DueAction::WakeBar: {
+      auto bar = a.bar;            // async load data arrived -> fire its MEMORY_BAR
+      finish_instruction(bar);
+      break;
+    }
+  }
+}
+
+void Core::process_due_events() {
+  while (!_due_events.empty() && _due_events.begin()->first <= _core_cycle) {
+    apply_due(_due_events.begin()->second);
+    _due_events.erase(_due_events.begin());
+  }
+}
+
+// The LAST reader of a buffer-version issued (bridge tags only that consumer):
+// free the version's bytes back to the per-core spad.
+void Core::release_sram(const std::shared_ptr<Instruction>& inst) {
+  if (!_sram_capacity) return;
+  for (int64_t id : inst->get_sram_release()) {
+    auto it = _sram_allocs.find(id);
+    if (it == _sram_allocs.end()) continue;
+    _sram_used -= it->second;
+    _sram_allocs.erase(it);
+  }
+}
+
+bool Core::try_occupy_sram(const std::shared_ptr<Instruction>& inst) {
+  if (!_sram_capacity || inst->get_sram_alloc() < 0) return true;   // untracked
+  size_t F = inst->sram_footprint();
+  if (_sram_used + F > _sram_capacity) return false;                // would overflow -> stall
+  _sram_used += F;
+  _sram_allocs[inst->get_sram_alloc()] += F;                        // accumulate version footprint
+  return true;
 }
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
-  /* Check SRAM is enough to run tile */
-  return _tiles.size() < 4  && !op->is_stonne_tile();
+  /* Bound concurrent dispatches so their combined spad working set fits: with the
+   * global @buffers each in-flight dispatch piles its own load versions, and too
+   * many at once overflow the spad (versions never free -> wedge). 2 keeps double-
+   * buffering overlap while leaving headroom. */
+  return _tiles.size() < 2  && !op->is_stonne_tile();
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {
@@ -135,7 +199,7 @@ void Core::dma_cycle() {
       finish_instruction(instruction, InstFinishTraceTag::DmaRespComplete);
       for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) {
         _dma.mark_tag_used(instruction->subgraph_id, key);
-        finish_instruction(wait_inst);
+        _due_events.emplace(_core_cycle, DueAction{DueAction::WakeBar, nullptr, wait_inst});
       }
     }
     _dma_finished_queue.erase(_dma_finished_queue.begin());
@@ -154,7 +218,7 @@ void Core::dma_cycle() {
       } else if(!finished_inst->is_dma_read()) {
         core_trace_log::log_error_dma_instruction_invalid(_core_cycle, _id);
         exit(EXIT_FAILURE);
-      } else if (finished_inst->get_opcode() == Opcode::BAR) {
+      } else if (finished_inst->get_opcode() == Opcode::MEMORY_BAR) {
         core_trace_log::trace_instruction_line(_core_cycle,
                                                _id,
                                                TraceLogTag::pad15(TraceLogTag::kInstructionFinished),
@@ -200,6 +264,8 @@ void Core::cycle() {
   /* Increase core cycle counter */
   _core_cycle++;
 
+  process_due_events();  // weight-slot frees + DMA-arrival wakeups due this cycle
+
   /* Iterate tile while an instruction is issued */
   bool issued = false;
 
@@ -207,9 +273,6 @@ void Core::cycle() {
     auto& instructions = _tiles[i]->get_ready_instructions();
     for (auto it=instructions.begin(); it!=instructions.end();) {
       auto& inst = *it;
-      /* Skip instruction is not ready  */
-      //if (!inst->is_ready())
-      //  continue;
 
       switch (inst->get_opcode()) {
         case Opcode::MOVIN:
@@ -240,6 +303,8 @@ void Core::cycle() {
               _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
               break;
             } else {
+              // load occupies its spad bytes on issue; stall (retry next cycle) if full.
+              if (!try_occupy_sram(inst)) break;
               core_trace_log::trace_instruction_line(_core_cycle,
                                                        _id,
                                                        TraceLogTag::pad15(
@@ -254,6 +319,7 @@ void Core::cycle() {
             }
           }
         case Opcode::MOVOUT:
+          release_sram(inst);   // store issued -> free the tiles it drained
           core_trace_log::trace_instruction_line(_core_cycle,
                                                    _id,
                                                    TraceLogTag::pad15(TraceLogTag::kInstructionIssued),
@@ -265,7 +331,41 @@ void Core::cycle() {
           break;
         case Opcode::COMP:
           {
-            auto& target_pipeline = get_compute_pipeline(inst->get_compute_type());
+            const int ct = inst->get_compute_type();
+            // a fresh-output compute occupies its spad bytes on issue; stall if full.
+            if (!try_occupy_sram(inst)) break;
+            // SA selection (sec 10.x): a preload picks an SA with a free weight slot
+            // and pins its matmul consumers there; a matmul runs on its pinned SA.
+            int sa_idx = -1;
+            if (ct == MATMUL || ct == PRELOAD) {
+              if (ct == PRELOAD) {
+                int n_consumers = 0;   // matmuls reusing this weight
+                for (auto& c : inst->get_deps(DepEvent::ISSUE))
+                  if (c->get_compute_type() == MATMUL) n_consumers++;
+                if (n_consumers == 0) {            // weight-slot model needs >=1 consumer
+                  spdlog::error("preload has no matmul consumer (weight-slot model invariant)");
+                  exit(EXIT_FAILURE);
+                }
+                sa_idx = pick_free_weight_sa();
+                if (sa_idx < 0) break;              // all weight slots full -> stall (retry)
+                _weight_slots_used[sa_idx]++;
+                auto tok = std::make_shared<WeightToken>(WeightToken{sa_idx, n_consumers});
+                for (auto& c : inst->get_deps(DepEvent::ISSUE))
+                  if (c->get_compute_type() == MATMUL) {
+                    c->set_assigned_sa(sa_idx);
+                    c->set_weight_token(tok);
+                  }
+              } else {                              // MATMUL
+                sa_idx = inst->get_assigned_sa();   // pinned by its preload
+                if (sa_idx < 0) {                   // unpinned -> no preload set its SA
+                  spdlog::error("matmul was not pinned to an SA by a preload (weight-slot model invariant)");
+                  exit(EXIT_FAILURE);
+                }
+              }
+              inst->set_assigned_sa(sa_idx);         // record the SA actually used (for the trace)
+            }
+            auto& target_pipeline = (ct == VECTOR_UNIT) ? _vu_compute_pipeline
+                                                        : _sa_compute_pipeline.at(sa_idx);
             if (target_pipeline.empty()) {
               inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
               inst->bubble_cycle = inst->get_overlapping_cycle();
@@ -275,7 +375,19 @@ void Core::cycle() {
               inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle;
               inst->bubble_cycle = bubble_cycle;
             }
+            // release the occupancy (ISSUE) dependents so a successor overlaps this op.
+            inst->fire(DepEvent::ISSUE);
+
+            // Release this matmul's weight slot at its streaming-end (finish -
+            // overlapping), not at full finish (the drain tail does not read it).
+            if (ct == MATMUL && inst->get_weight_token()) {
+              cycle_type rel = inst->finish_cycle > inst->get_overlapping_cycle()
+                                 ? inst->finish_cycle - inst->get_overlapping_cycle() : _core_cycle;
+              _due_events.emplace(rel, DueAction{DueAction::FreeWeightSlot,
+                                                 inst->get_weight_token(), nullptr});
+            }
 
+            release_sram(inst);   // free the tiles it read (before the skip path)
             if (inst->get_compute_cycle() == 0) {
               inst->finish_instruction();
               static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
@@ -297,12 +409,12 @@ void Core::cycle() {
             }
           }
           break;
-        case Opcode::BAR:
+        case Opcode::MEMORY_BAR:
           {
             auto& key = inst->get_tag_id();
             uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key);
             if (finished == -1) {
-              for (auto child_inst : inst->get_child_inst()) {
+              for (auto child_inst : inst->get_deps(DepEvent::DONE)) {
                 if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) {
                   child_inst->set_compute_cycle(0);
                 }
@@ -387,6 +499,19 @@ void Core::finish_instruction(std::shared_ptr<Instruction>& inst, InstFinishTrac
                                            core_trace_log::format_instruction_detail_line(*inst));
 }
 
+bool Core::has_inflight() {
+  // running() without the "_tiles.size() > 0" term: work that will produce a
+  // finish event on its own (so the sim is NOT frozen). If this is false but
+  // tiles remain, only stalled ready instructions are left.
+  if (!_vu_compute_pipeline.empty()) return true;
+  for (int i = 0; i < _num_systolic_array_per_core; i++)
+    if (!_sa_compute_pipeline.at(i).empty()) return true;
+  if (!_dma_waiting_queue.empty() || !_dma_finished_queue.empty()) return true;
+  if (!_dma.empty()) return true;
+  if (!_ld_inst_queue.empty() || !_st_inst_queue.empty()) return true;
+  return false;
+}
+
 bool Core::running() {
   bool running = false;
   running = running || _tiles.size() > 0;
@@ -412,6 +537,13 @@ void Core::push_memory_response(mem_fetch* response) {
   Instruction* owner_inst = static_cast<Instruction*>(response->get_custom_data());
   assert(owner_inst->get_waiting_request());
 
+  if (!owner_inst->got_first_response()) {   // first data of this load arrived
+    owner_inst->mark_first_response();
+    core_trace_log::trace_instruction_line(_core_cycle, _id,
+        TraceLogTag::pad15(TraceLogTag::kFirstDramResponse),
+        owner_inst->get_global_inst_id(),
+        core_trace_log::format_instruction_detail_line(*owner_inst));
+  }
   owner_inst->dec_waiting_request();
   if (!owner_inst->get_waiting_request()) {
     auto it = _dma_waiting_queue.find(owner_inst);
diff --git a/TOGSim/src/CoreTraceLog.cc b/TOGSim/src/CoreTraceLog.cc
index ebc31de0..7086893e 100644
--- a/TOGSim/src/CoreTraceLog.cc
+++ b/TOGSim/src/CoreTraceLog.cc
@@ -31,7 +31,7 @@ std::string format_dma_inst_issued_detail(Instruction& inst) {
   }
   return fmt::format(
       "addr_name={} dram=0x{:016x} rank={} elem_bits={} async={} indirect={} tag=0x{:016x} stride=[{}] size=[{}] "
-      "tag_idx=[{}]",
+      "tag_idx=[{}] tile={}",
       inst.get_addr_name(),
       static_cast<uint64_t>(inst.get_base_dram_address()),
       rank,
@@ -41,7 +41,8 @@ std::string format_dma_inst_issued_detail(Instruction& inst) {
       tag_hex,
       fmt::join(inst.get_tile_stride(), ","),
       fmt::join(ts, ","),
-      fmt::join(tidx, ","));
+      fmt::join(tidx, ","),
+      inst.get_tile_group());
 }
 
 std::string format_dma_inst_issued_trace_line(Instruction& inst) {
@@ -52,31 +53,35 @@ std::string format_instruction_detail_line(Instruction& inst) {
   const Opcode op = inst.get_opcode();
   const std::string opname = opcode_to_string(op);
   if (op == Opcode::COMP) {
-    return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={})",
+    return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={} sa={} tile={})",
                        opname,
                        inst.get_compute_type(),
                        inst.get_compute_cycle(),
-                       inst.get_overlapping_cycle());
+                       inst.get_overlapping_cycle(),
+                       inst.get_assigned_sa(),
+                       inst.get_tile_group());
   }
   if ((op == Opcode::MOVIN || op == Opcode::MOVOUT) && inst.is_async_dma()) {
-    return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])",
+    return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})",
                        opname,
                        inst.subgraph_id,
                        inst.get_addr_name(),
                        format_tag_key_list_hex(inst.get_tag_id()),
                        fmt::join(inst.get_tag_idx_list(), ","),
-                       fmt::join(inst.get_tag_stride_list(), ","));
+                       fmt::join(inst.get_tag_stride_list(), ","),
+                       inst.get_tile_group());
   }
   if (op == Opcode::MOVIN || op == Opcode::MOVOUT) {
-    return fmt::format("{} (addr_name={})", opname, inst.get_addr_name());
+    return fmt::format("{} (addr_name={} tile={})", opname, inst.get_addr_name(), inst.get_tile_group());
   }
-  if (op == Opcode::BAR) {
-    return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])",
+  if (op == Opcode::MEMORY_BAR) {
+    return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})",
                        opname,
                        inst.get_addr_name(),
                        format_tag_key_list_hex(inst.get_tag_id()),
                        fmt::join(inst.get_tag_idx_list(), ","),
-                       fmt::join(inst.get_tag_stride_list(), ","));
+                       fmt::join(inst.get_tag_stride_list(), ","),
+                       inst.get_tile_group());
   }
   return opname;
 }
diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc
index f236d160..ee184a1a 100644
--- a/TOGSim/src/Instruction.cc
+++ b/TOGSim/src/Instruction.cc
@@ -23,7 +23,7 @@ std::string opcode_to_string(Opcode opcode) {
         case Opcode::MOVIN:        return "MOVIN";
         case Opcode::MOVOUT:       return "MOVOUT";
         case Opcode::COMP:         return "COMP";
-        case Opcode::BAR:          return "BAR";
+        case Opcode::MEMORY_BAR:   return "MEMORY_BAR";
         default:                   return "Unknown";
     }
 }
@@ -50,16 +50,10 @@ Instruction::Instruction(Opcode opcode)
 }
 
 void Instruction::finish_instruction() {
-  for (auto& counter : child_inst)
-    counter->dec_ready_counter();
+  fire(DepEvent::DONE);   // latency consumers
   finished = true;
 }
 
-void Instruction::add_child(std::shared_ptr<Instruction> child) {
-  child->inc_ready_counter();
-  child_inst.insert(child);
-}
-
 void Instruction::inc_waiting_request() {
   _nr_waiting_request++;
 }
diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index d987d787..366528ec 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -184,6 +184,38 @@ void Simulator::icnt_cycle() {
   _icnt->cycle();
 }
 
+// Consecutive frozen cycles tolerated before declaring the sim wedged (spad too
+// small). Generous so transient idle never false-fires; a true freeze is constant.
+static constexpr uint64_t kWedgeThreshold = 100000;
+
+// Frozen-state guard: work remains (running()) but nothing is in flight to
+// advance it -- the SRAM throttle can never satisfy a load because the kernel's
+// working set exceeds the whole per-core spad (core_spad_size_kb too small). The
+// state repeats every cycle, so after a margin error out instead of looping
+// forever. `stuck` is function-local-static (one running sim at a time; it resets
+// on any progress).
+void Simulator::check_frozen() {
+  static uint64_t stuck = 0;
+  // In flight = anything that will produce a future state change: icnt/dram busy,
+  // a core with DMA/compute pending, or a tile still schedulable.
+  bool inflight = _icnt->running() || _dram->running();
+  for (int id = 0; id < _n_cores && !inflight; id++) {
+    if (_cores[id]->has_inflight()) inflight = true;
+    else if (!get_partition_scheduler(id)->empty(id)) inflight = true;
+  }
+  if (running() && !inflight) {
+    if (++stuck > kWedgeThreshold) {
+      spdlog::error("[Simulator] simulation wedged at cycle {}: work remains but "
+                    "nothing is in flight -- the per-core spad (core_spad_size_kb) "
+                    "is too small to hold a kernel's working set. Increase it.",
+                    _core_cycles);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    stuck = 0;
+  }
+}
+
 void Simulator::cycle() {
   while (running() || _core_cycles < 1) {
     set_cycle_mask();
@@ -198,6 +230,8 @@ void Simulator::cycle() {
     // Interconnect cycle
     if (IS_ICNT_CYCLE(_cycle_mask))
       icnt_cycle();
+
+    check_frozen();   // spad-too-small guard (errors out if wedged)
   }
   for (auto &core: _cores) {
     core->check_tag();
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index 5060d336..c252258e 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -543,7 +543,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
              fmt::join(new_tag_stride_list, ", "));
 
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-        Opcode::BAR, 0,
+        Opcode::MEMORY_BAR, 0,
         0, base_addr,
         std::vector<size_t>(), std::vector<int>(), 0,
         tag_list, new_tag_stride_list, accum_tag_list
@@ -584,7 +584,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         for (const auto& child_node: node->get_child()) {
           if (link_map.find(child_node) != link_map.end()) {
             std::shared_ptr<Instruction> child_inst = link_map[child_node];
-            inst->add_child(child_inst);
+            inst->add_dep(child_inst, DepEvent::DONE);
           }
         }
       }
@@ -606,7 +606,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
             for (auto& inner_inst : inner_tile->get_instructions()) {
               tile_vec.back()->append_instuction(inner_inst);
               if (nr_inst) {
-                last_instruction->add_child(inner_inst);
+                last_instruction->add_dep(inner_inst, DepEvent::DONE);
               }
             }
           }
@@ -662,7 +662,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
     for (const auto& child_node: node->get_child()) {
       if (link_map.find(child_node) != link_map.end()) {
         std::shared_ptr<Instruction> child_inst = link_map[child_node];
-        inst->add_child(child_inst);
+        inst->add_dep(child_inst, DepEvent::DONE);
       }
     }
   }
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index 010826ef..274d63da 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -1,3 +1,4 @@
+#include <cstdlib>
 #include <fstream>
 #include <chrono>
 #include <filesystem>
@@ -8,18 +9,72 @@
 #include "Simulator.h"
 #include "TileGraphParser.h"
 #include "helper/CommandLineParser.h"
+#include "togsim_loader.h"        // P3 trace pipeline: run a compiled producer .so
+#include "togsim_trace_bridge.h"  // ... and bridge its trace to a TileGraph
 
 namespace fs = std::filesystem;
 namespace po = boost::program_options;
 
 
+// Run a kernel's compiled trace producer (.so) and bridge it to a TileGraph,
+// targeting `partition_id` (its work-items round-robin only over that partition's
+// cores -- partitions are independent schedulers). The cycle-table TSV gives the
+// per-tile compute latency; a flat stub is used if it is missing. Returns nullptr
+// if the producer run fails. Shared by the standalone --trace_so path and the
+// multi-tenant launchKernel below.
+std::unique_ptr<TileGraph> build_trace_tilegraph(Simulator* simulator,
+                                                 const std::string& trace_so_path,
+                                                 const std::string& cycle_table_path,
+                                                 int partition_id) {
+  const auto& hw_cfg = simulator->get_hardware_config_yaml();
+  const int core_count = hw_cfg["num_cores"] ? hw_cfg["num_cores"].as<int>() : 1;
+  std::vector<int32_t> owned_cores;   // cores of the target partition (core 0 if none match)
+  for (int core = 0; core < core_count; ++core) {
+    if (simulator->get_partition_id(core) == partition_id) owned_cores.push_back(core);
+  }
+  if (owned_cores.empty()) owned_cores.push_back(0);
+  // Placeholder tensor bases: 16 slots, 0x100000 (1 MiB) apart, starting at 0x100000.
+  std::vector<uint64_t> tensor_bases;
+  for (uint64_t slot = 1; slot <= 16; ++slot) tensor_bases.push_back(0x100000ull * slot);
+  // Per-tile_id "cycle overlapping" pairs from the sidecar; flat 128/0 stub if none were read.
+  std::vector<int64_t> cycles, overlaps;
+  std::ifstream table(cycle_table_path);
+  for (int64_t cyc_v, ovl_v; table.is_open() && (table >> cyc_v >> ovl_v);) {
+    cycles.push_back(cyc_v); overlaps.push_back(ovl_v);
+  }
+  if (cycles.empty()) { cycles.assign(256, 128); overlaps.assign(256, 0); }
+  auto run = togsim::run_producer(
+      trace_so_path.c_str(), nullptr, 0, tensor_bases.data(), (int)tensor_bases.size(),
+      cycles.data(), overlaps.data(), (int)cycles.size(),
+      owned_cores.data(), (int32_t)owned_cores.size());
+  if (!run.ok) return nullptr;
+  return trace_to_tilegraph(run, "trace_kernel");
+}
+
 void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partition_id=0, int device_id=0) {
-  auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml);
-  std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
+  std::unique_ptr<TileGraph> tile_graph;
+  std::string tog_path = onnx_path;  // for the log line
+  // The C++ trace path is the supported one: the kernel's trace.so / trace_cycles.tsv
+  // sit next to its tile_graph.onnx (same write_path). The legacy ONNX parser below is
+  // DEPRECATED -- only used via TORCHSIM_LEGACY_TOG=1 or when the .so is absent / fails.
+  const char* legacy = std::getenv("TORCHSIM_LEGACY_TOG");
+  const fs::path dir = fs::path(onnx_path).parent_path();      // empty for a bare filename
+  std::string trace_so = (dir / "trace.so").string();          // path join: an empty dir must
+  std::string cycle_tsv = (dir / "trace_cycles.tsv").string(); // not resolve to "/trace.so"
+  if ((!legacy || std::string(legacy) != "1") && fs::exists(trace_so)) {
+    tile_graph = build_trace_tilegraph(simulator, trace_so, cycle_tsv, partition_id);
+    if (tile_graph) tog_path = trace_so;
+    else spdlog::warn("[TOGSim] trace.so run failed for {}; falling back to ONNX", trace_so);
+  }
+  if (!tile_graph) {
+    spdlog::warn("[TOGSim] using the DEPRECATED legacy ONNX TOG path for {}", onnx_path);
+    auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml);
+    tile_graph = std::move(graph_praser.get_tile_graph());
+  }
   tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
   tile_graph->set_kernel_id(kernel_id);
   spdlog::info("[Scheduler {}] Enqueued kernel_id: {}, tog_path: {}, operation: {}, request_time_cycles: {}",
-               partition_id, kernel_id, onnx_path, tile_graph->get_name(), request_time);
+               partition_id, kernel_id, tog_path, tile_graph->get_name(), request_time);
   simulator->enqueue_graph(partition_id, std::move(tile_graph));
 }
 
@@ -104,6 +159,11 @@ int main(int argc, char** argv) {
       "models_list", "Path for the trace file (.trace)");
   cmd_parser.add_command_line_option<std::string>(
       "log_level", "Set for log level [trace, debug, info], default = info");
+  cmd_parser.add_command_line_option<std::string>(
+      "trace_so", "Path to a compiled trace producer .so (P3 trace pipeline)");
+  cmd_parser.add_command_line_option<std::string>(
+      "cycle_table", "Path to a 'cycle<TAB>overlapping' per-tile_id sidecar (TSV) "
+                     "for --trace_so; falls back to a flat stub if omitted");
   try {
     cmd_parser.parse(argc, argv);
   } catch (const CommandLineParser::ParsingError& e) {
@@ -147,6 +207,27 @@ int main(int argc, char** argv) {
     exit(1);
   }
 
+  // P3 trace pipeline: if a compiled producer .so is given, run it, bridge the
+  // recorded trace to a TileGraph, and run the existing Simulator on it.
+  std::string trace_so_path;
+  cmd_parser.set_if_defined("trace_so", &trace_so_path);
+  if (!trace_so_path.empty()) {
+    // Standalone single-kernel trace run: enqueue to partition 0 (its work-items
+    // round-robin over partition 0's cores only; see build_trace_tilegraph).
+    std::string cycle_table_path;
+    cmd_parser.set_if_defined("cycle_table", &cycle_table_path);
+    auto tg = build_trace_tilegraph(simulator, trace_so_path, cycle_table_path, 0);
+    if (!tg) { spdlog::error("[TOGSim] trace producer run failed"); exit(1); }
+    tg->set_arrival_time(simulator->get_core_cycle());
+    tg->set_kernel_id(0);
+    simulator->enqueue_graph(0, std::move(tg));
+    simulator->run_simulator();
+    spdlog::info("[TOGSim-trace] Total cycles: {}", simulator->get_core_cycle());
+    spdlog::info("Simulation finished");
+    simulator->print_core_stat();
+    return 0;
+  }
+
   // Get trace file path
   cmd_parser.set_if_defined("models_list", &trace_file_path);
 
diff --git a/TOGSim/src/togsim_runtime.cc b/TOGSim/src/togsim_runtime.cc
new file mode 100644
index 00000000..a83b8541
--- /dev/null
+++ b/TOGSim/src/togsim_runtime.cc
@@ -0,0 +1,196 @@
+// togsim_runtime.cc
+// -----------------------------------------------------------------------------
+// C6 runtime + loader for the compiled trace producer (P3 task 5). Implements
+// the producer ABI (togsim_runtime.h) and the TOGSim-side loader
+// (togsim_loader.h). See docs/design/togsim_cpp_trace.md sec 5.3 / 9.6.1 / 9.7.
+//
+// The producer `.so` calls the extern "C" togsim_* functions below; each one
+// records a TraceRec on the EmitCtx. EmitCtx is the opaque type the producer
+// only ever passes back to us. This is the "materializing sink": it resolves
+// addresses and per-tile cycles into a recorded instruction stream. Wiring the
+// stream into the existing timing core (Core/Simulator) is the remaining step.
+// -----------------------------------------------------------------------------
+
+#include "togsim_loader.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <map>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+// Full definition of the opaque handle from togsim_runtime.h. The producer holds
+// only EmitCtx* and never dereferences it.
+struct EmitCtx {
+  // inputs supplied by the loader (raw pointers copied from run_producer's arguments)
+  const uint64_t* tensor_base = nullptr;   // arg_id -> base address; n_tensors entries
+  int32_t         n_tensors = 0;
+  const int64_t*  cyc = nullptr;   // tile_id -> cycle
+  const int64_t*  ovl = nullptr;   // tile_id -> overlapping_cycle
+  int32_t         n_tiles = 0;     // bound used for both cyc and ovl lookups
+  std::vector<int32_t> cores{0};   // the partition's core ids; dispatch round-robins over these
+  // mutable run state
+  int32_t  rr = 0;            // round-robin cursor into `cores`
+  int32_t  cur_core = -1;     // current work-item's core (-1 until the first dispatch)
+  std::vector<togsim::TraceRec> trace;   // records appended by the togsim_* calls, in call order
+};
+
+namespace {
+// Value-initialized TraceRec with only its kind and core filled in.
+inline togsim::TraceRec blank(togsim::TraceRec::Kind k, int32_t core) {
+  togsim::TraceRec rec{};
+  rec.kind = k; rec.core = core;
+  return rec;
+}
+}  // namespace
+
+extern "C" {
+
+int32_t togsim_abi_version(void) { return TOGSIM_ABI_VERSION; }  // the TOGSIM_ABI_VERSION this runtime was compiled with
+
+void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n_iv) {
+  // Work-item wrapper (sec 9.3). Picks the next core round-robin from ctx->cores
+  // -- this partition's cores only: partitions schedule independently, so a
+  // work-item placed on a foreign core would sit in this partition's scheduler
+  // forever. The tile fn's records are bracketed by TILE_BEGIN/TILE_END and
+  // stamped with ctx->cur_core.
+  int32_t core = 0;                        // fallback when no cores are configured
+  if (!ctx->cores.empty()) core = ctx->cores[ctx->rr++ % (int32_t)ctx->cores.size()];
+  ctx->cur_core = core;
+  ctx->trace.push_back(blank(togsim::TraceRec::TILE_BEGIN, core));
+  fn(ctx, iv, n_iv);
+  ctx->trace.push_back(blank(togsim::TraceRec::TILE_END, ctx->cur_core));
+}
+
+void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id,
+                uint64_t offset, int32_t ndim, const int64_t* dims,
+                const int64_t* strides, int32_t elem_bits,
+                int32_t is_async, int32_t tag_id, uint64_t tag_slot,
+                const int64_t* read_bufs, int32_t n_read,
+                const int64_t* write_bufs, int32_t n_write) {
+  // Byte address = tensor base (0 for an out-of-range arg_id) + element offset
+  // scaled by whole bytes per element (elem_bits / 8, integer division).
+  const uint64_t base = (arg_id >= 0 && arg_id < ctx->n_tensors) ? ctx->tensor_base[arg_id] : 0;
+  togsim::TraceRec rec = blank(togsim::TraceRec::DMA, ctx->cur_core);
+  rec.addr = base + offset * (uint64_t)(elem_bits / 8);
+  rec.dir = dir; rec.arg_id = arg_id; rec.elem_bits = elem_bits;
+  rec.is_async = is_async; rec.tag_id = tag_id; rec.tag_slot = tag_slot;
+  // A null dims/strides pointer leaves the corresponding vector empty.
+  for (int32_t d = 0; dims && d < ndim; ++d) rec.dims.push_back(dims[d]);
+  for (int32_t d = 0; strides && d < ndim; ++d) rec.strides.push_back(strides[d]);
+  for (int32_t k = 0; k < n_read; ++k) rec.read_bufs.push_back(read_bufs[k]);
+  for (int32_t k = 0; k < n_write; ++k) rec.write_bufs.push_back(write_bufs[k]);
+  ctx->trace.push_back(rec);
+}
+
+void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type,
+                    int32_t ndim, const int64_t* dims,
+                    const int64_t* read_bufs, int32_t n_read,
+                    const int64_t* write_bufs, int32_t n_write) {
+  (void)ndim; (void)dims;   // shape is accepted by the ABI but not recorded
+  togsim::TraceRec r = blank(togsim::TraceRec::COMPUTE, ctx->cur_core);
+  r.tile_id = tile_id; r.compute_type = compute_type;
+  for (int32_t i = 0; i < n_read; ++i) r.read_bufs.push_back(read_bufs[i]);
+  for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]);
+  // Unsigned compare: narrowing tile_id to int32_t could go negative and pass the bound.
+  const bool in_table = ctx->n_tiles > 0 && tile_id < (uint64_t)ctx->n_tiles;
+  if (in_table && ctx->cyc) r.cycle = ctx->cyc[tile_id];        // else keep blank()'s value
+  if (in_table && ctx->ovl) r.overlapping = ctx->ovl[tile_id];
+  ctx->trace.push_back(r);
+}
+
+void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot,
+                           const int64_t* write_bufs, int32_t n_write) {
+  togsim::TraceRec bar = blank(togsim::TraceRec::MEMORY_BAR, ctx->cur_core);
+  bar.tag_slot = tag_slot; bar.tag_id = tag_id;   // (tag_id, tag_slot) key of this barrier
+  for (int32_t w = 0; w < n_write; ++w) bar.write_bufs.push_back(write_bufs[w]);
+  ctx->trace.push_back(bar);
+}
+
+}  // extern "C"
+
+namespace togsim {
+
+RunResult run_producer(const char* so_path,
+                       const int64_t* shape_args, int32_t n_shape,
+                       const uint64_t* tensor_base, int32_t n_tensors,
+                       const int64_t* cyc, const int64_t* ovl, int32_t n_tiles,
+                       const int32_t* partition_cores, int32_t n_partition_cores) {
+  // Load the producer .so, run its `togsim_kernel` on a fresh EmitCtx, return the trace.
+  RunResult res;   // returned untouched on a load failure; `ok` is only set on success
+  void* lib = dlopen(so_path, RTLD_NOW | RTLD_GLOBAL);
+  if (!lib) { fprintf(stderr, "togsim: dlopen failed: %s\n", dlerror()); return res; }
+  auto emit = (void (*)(EmitCtx*, int64_t*, int32_t))dlsym(lib, "togsim_kernel");
+  if (!emit) fprintf(stderr, "togsim: dlsym togsim_kernel failed: %s\n", dlerror());
+  if (!emit) { dlclose(lib); return res; }   // release the unusable handle instead of leaking it
+  EmitCtx ctx;
+  ctx.tensor_base = tensor_base; ctx.n_tensors = n_tensors;
+  ctx.cyc = cyc; ctx.ovl = ovl; ctx.n_tiles = n_tiles;
+  // Only form the iterator range from a non-null pointer with a positive count.
+  if (partition_cores && n_partition_cores > 0) ctx.cores.assign(partition_cores, partition_cores + n_partition_cores);
+  if (ctx.cores.empty()) ctx.cores.push_back(0);
+  emit(&ctx, (int64_t*)shape_args, n_shape);
+  res.ok = true;
+  res.trace = std::move(ctx.trace);
+  return res;
+}
+
+SimResult simulate(const RunResult& run, const TimingParams& params) {
+  SimResult out;   // n_dma / n_compute / total_cycle are accumulated below (assumed to start at 0 -- TODO confirm)
+  std::unordered_map<int, uint64_t> dma_free;     // DMA-engine free time, per core
+  std::unordered_map<int, uint64_t> comp_free;    // compute free time, per core
+  std::unordered_map<int, uint64_t> prev_comp;    // prev compute finish (overlap), per core
+  std::map<std::pair<int32_t, uint64_t>, uint64_t> tag_finish;  // (tag_id,tag_slot) -> finish; NOT keyed by core
+  std::vector<uint64_t> pending;                    // barrier-resolved deps since last compute (shared by all cores)
+
+  for (const auto& t : run.trace) {
+    const int c = t.core;
+    switch (t.kind) {
+      case TraceRec::DMA: {
+        // DMAs serialize on the core's DMA engine (overlap compute -> separate
+        // timeline). finish = issue + latency, recorded under the runtime tag.
+        uint64_t start = dma_free[c];
+        uint64_t fin = start + params.dma_latency;
+        dma_free[c] = fin;
+        tag_finish[{t.tag_id, t.tag_slot}] = fin;
+        out.n_dma++;
+        break;
+      }
+      case TraceRec::MEMORY_BAR: {
+        // the explicit async-DMA sync: gate the next compute on the paired dma's
+        // data-arrival, found by the runtime tag (tag_id, tag_slot). An unknown tag is ignored.
+        auto it = tag_finish.find({t.tag_id, t.tag_slot});
+        if (it != tag_finish.end()) pending.push_back(it->second);
+        break;
+      }
+      case TraceRec::COMPUTE: {
+        uint64_t deps = 0;
+        for (uint64_t f : pending) deps = std::max(deps, f);
+        pending.clear();
+        uint64_t start = std::max(comp_free[c], deps);
+        uint64_t fin;
+        auto pit = prev_comp.find(c);
+        if (pit != prev_comp.end()) {
+          uint64_t prev = pit->second;
+          uint64_t tail = prev > start ? prev - start : 0;     // NOTE(review): prev == comp_free[c] <= start, so tail is always 0 and no overlap is ever subtracted
+          uint64_t overlapped = std::min<uint64_t>(tail, (uint64_t)t.overlapping);
+          fin = std::max(start, prev) + (uint64_t)t.cycle - overlapped;
+        } else {
+          fin = start + (uint64_t)t.cycle;
+        }
+        comp_free[c] = fin;
+        prev_comp[c] = fin;
+        out.n_compute++;
+        break;
+      }
+      case TraceRec::TILE_BEGIN:
+      case TraceRec::TILE_END:
+        break;  // work-item boundary: no cost in this reference timer
+    }
+  }
+  for (auto& kv : dma_free) out.total_cycle = std::max(out.total_cycle, kv.second);
+  for (auto& kv : comp_free) out.total_cycle = std::max(out.total_cycle, kv.second);
+  return out;
+}
+
+}  // namespace togsim
diff --git a/TOGSim/src/togsim_trace_bridge.cc b/TOGSim/src/togsim_trace_bridge.cc
new file mode 100644
index 00000000..351e313e
--- /dev/null
+++ b/TOGSim/src/togsim_trace_bridge.cc
@@ -0,0 +1,342 @@
+// togsim_trace_bridge.cc -- see togsim_trace_bridge.h
+#include "togsim_trace_bridge.h"
+
+#include <algorithm>
+#include <map>
+#include <utility>
+#include <vector>
+
+#include "Tile.h"
+#include "Instruction.h"
+
+namespace {
+
+// `uniq` is a per-DMA-record unique tag-key id minted by the caller. The Core
+// tag table keys completion on [addr_id, ..., sum(tag_idx*stride)]; using `uniq`
+// as addr_id makes every reduction iteration of one static dma get a DISTINCT
+// key -- so multi-tile-K (and conv, whose reduction is the kh*kw*C nest) do not
+// collide, with no coordinate enumeration. The matching memory_barrier reuses
+// the same `uniq` (current-load map per (tag_id, tag_slot), see
+// trace_to_tilegraph), so the table still pairs them. This works because the
+// recorded stream is already per-iteration (the producer ran the loops) --
+// unlike a compile-time event_id. `tag_idx` (the subtile slot) is retained for
+// the SRAM double-buffer model.
+//
+// FIXME(semantics): the per-iteration tag is still reconstructed HERE from the
+// record order. The producer IR now DOES carry a per-iteration tag -- dma_fine_-
+// grained emits a fresh tag memref.alloc just before each coarse load (rewiring
+// its dma_wait), so successive reduction iterations allocate distinct tags -- but
+// build_skeleton collapses that to one static tag_id (it DCEs the alloc and keys
+// togsim.dma by the alloc's static identity), so this bridge still needs `uniq`
+// to tell iterations apart at runtime. The faithful finish is to thread the
+// per-iteration alloc identity through build_skeleton as an SSA tag handle on the
+// togsim.dma / togsim.memory_barrier (then `uniq` here is unnecessary).
+std::shared_ptr<Instruction> make_dma(const togsim::TraceRec& t, int64_t uniq) {
+  const Opcode opcode = (t.dir != 1) ? Opcode::MOVIN : Opcode::MOVOUT;  // only dir 1 maps to MOVOUT
+  std::vector<size_t> shape(t.dims.begin(), t.dims.end());
+  std::vector<int> stride(t.strides.begin(), t.strides.end());
+  auto dma = std::make_shared<Instruction>(
+      opcode, /*compute_cycle=*/0, /*num_parents=*/0, /*dram_addr=*/t.addr,
+      shape, stride, (size_t)t.elem_bits,
+      std::vector<int64_t>{(int64_t)t.tag_slot},   // tag index: the subtile slot
+      std::vector<int64_t>{1},                     // tag stride
+      /*accum_tag_idx_list=*/std::vector<int64_t>{});
+  dma->set_is_async(t.is_async != 0);
+  dma->set_addr_name("tag" + std::to_string(uniq), uniq);  // `uniq` doubles as the addr id
+  dma->prepare_tag_key();
+  return dma;
+}
+
+// A MEMORY_BAR carrying the SAME `uniq` tag key as the async dma it gates -- the
+// Core's tag table signals it at the dma's DATA-ready (resp-complete), unlike a
+// raw DONE edge that the async dma releases at issue-complete.
+std::shared_ptr<Instruction> make_mem_bar(const togsim::TraceRec& t, int64_t uniq) {
+  std::vector<int64_t> slot_idx{(int64_t)t.tag_slot};   // same tag index/stride shape as make_dma
+  std::vector<int64_t> unit_stride{1};
+  auto barrier = std::make_shared<Instruction>(
+      Opcode::MEMORY_BAR, 0, 0, 0, std::vector<size_t>{}, std::vector<int>{}, 0,
+      slot_idx, unit_stride, std::vector<int64_t>{});
+  barrier->set_addr_name("tag" + std::to_string(uniq), uniq);
+  barrier->prepare_tag_key();
+  return barrier;
+}
+
+std::shared_ptr<Instruction> make_compute(const togsim::TraceRec& t) {
+  std::vector<int64_t> no_tags;   // a compute carries empty tag / stride / accum-tag lists
+  auto op = std::make_shared<Instruction>(
+      Opcode::COMP, /*compute_cycle=*/(cycle_type)t.cycle, /*num_parents=*/0, /*dram_addr=*/0,
+      std::vector<size_t>{}, std::vector<int>{}, /*elem_bits=*/0, no_tags, no_tags, no_tags);
+  op->set_overlapping_cycle((cycle_type)t.overlapping);
+  op->set_compute_type(t.compute_type);  // route to VPU vs systolic array
+  return op;
+}
+
+}  // namespace
+
+std::unique_ptr<TileGraph> trace_to_tilegraph(const togsim::RunResult& run,
+                                              const std::string& name) {
+  using togsim::TraceRec;
+  auto tg = std::make_unique<TileGraph>(name, name);
+  // Empty cache plan (no L2/CMEM persistence) -- append_subgraph propagates it
+  // to each subgraph, and DMA::is_cacheable dereferences it, so it must be a
+  // valid (if empty) IntervalTree rather than null.
+  tg->init_cache_plan({});
+
+  std::shared_ptr<TileSubGraph> sg;
+  std::shared_ptr<Tile> tile;
+  // Explicit dependency DAG (sec 10), one clean dataflow rule (see `link`).
+  // Per SRAM buffer we keep writers(b) -- a SET of the current producers'
+  // DONE-handles. Scoped per work-item (cleared in flush(), i.e. at each
+  // TILE_BEGIN / TILE_END) -- buffers are work-item-local, so distinct
+  // work-items are independent (-> parallel).
+  std::map<int64_t, std::vector<std::shared_ptr<Instruction>>> writers;       // buffer id -> current producers (DONE-handles)
+  // An async dma is paired with its explicit memory_barrier(s) by the runtime tag
+  // (tag_id, tag_slot). It is 1 load : N barriers (the load happens once per
+  // reduction iteration; each consumer in that iteration is preceded by a wait on
+  // the same tag), so we track the CURRENT (most recent) load per (tag_id,
+  // tag_slot) -- not a FIFO. Each load gets a fresh `uniq` Core key, so successive
+  // reduction iterations (multi-tile-K, conv) never collide in the tag table; the
+  // iteration's barriers reuse that load's uniq. Correct because the load nest and
+  // its consumer nest run in order within the reduction body (no cross-iteration
+  // prefetch). Scoped per work-item.
+  std::map<std::pair<int32_t, uint64_t>,
+           std::pair<int64_t, std::shared_ptr<Instruction>>> current_dma;
+  // Dedup identical dma_waits: the barrier already built for the CURRENT load of a
+  // (tag_id, tag_slot). A later memory_barrier on the SAME load instance reuses it
+  // (its consumers gate on the existing bar) instead of re-emitting -- a conv reads one
+  // loaded subtile from many matmuls, so the fine-grained per-consumer waits collapse to
+  // one per load. A new load (next reduction iter) bumps uniq, so a genuine new wait
+  // still gets its own bar; the first wait stays at its consumer, so overlap is kept.
+  std::map<std::pair<int32_t, uint64_t>,
+           std::pair<int64_t, std::shared_ptr<Instruction>>> bar_for_load;
+  int64_t next_tag = 0;   // mints a Core tag key per dma record; unique within one work-item (flush() resets it)
+  int cur_tile_group = -1;   // work-item index, bumped per TILE_BEGIN (trace grouping)
+
+  auto flush = [&]() {
+    if (sg && tile) {
+      sg->add_tile(tile);
+      tile->set_owner(sg);
+      tg->append_subgraph(sg);
+    }
+    sg.reset();
+    tile.reset();
+    writers.clear();
+    current_dma.clear();
+    bar_for_load.clear();
+    next_tag = 0;   // tag keys restart at 0 for the next work-item
+  };
+
+  // Single dataflow rule (sec 10). Per buffer b, writers(b) is a SET of the
+  // current producers' DONE-handles.
+  //  - READ b: depend on ALL writers(b) -- occupancy (ISSUE) when both are SA ops
+  //    (preload/matmul overlap on the pipeline), else latency (DONE).
+  //  - WRITE b: REPLACE -- reset writers(b)={inst}.
+  //  - Exception is_mm_accum (a MATMUL reading AND writing b = a commutative
+  //    accumulator, Y += X@W): skip the read edge and UNION the write -- wait only the
+  //    non-matmul seed (init/bias) and join writers(b) without resetting or ordering
+  //    against co-matmuls, so the K matmuls do not chain through the accumulator and a
+  //    later reader joins all of them. TOGSim is timing-only (values come from trace).
+  // Buffer-reuse (WAR) ordering is modeled by the resource models, not edges: the SRAM
+  // version/capacity machinery for spad buffers, the weight-slot machinery for weights.
+  const int MATMUL_CT = 1, PRELOAD_CT = 2;
+  auto is_mm_accum = [&](const std::shared_ptr<Instruction>& inst, int64_t b,
+                         const std::vector<int64_t>& writes) {
+    if (inst->get_compute_type() != MATMUL_CT) return false;
+    for (int64_t w : writes) if (w == b) return true;
+    return false;
+  };
+  auto link = [&](std::shared_ptr<Instruction> inst,
+                  const std::vector<int64_t>& reads,
+                  const std::vector<int64_t>& writes) {
+    for (int64_t b : reads) {
+      if (is_mm_accum(inst, b, writes)) continue;   // accumulator read -> handled in WRITE (UNION)
+      auto it = writers.find(b);
+      if (it != writers.end())
+        for (auto& w : it->second) {
+          int pct = w->get_compute_type();
+          // both SA ops -> occupancy (overlap on the SA pipeline); else latency.
+          DepEvent on = (inst->get_compute_type() == MATMUL_CT &&
+                         (pct == MATMUL_CT || pct == PRELOAD_CT))
+                            ? DepEvent::ISSUE : DepEvent::DONE;
+          w->add_dep(inst, on);
+        }
+    }
+    for (int64_t b : writes) {
+      if (is_mm_accum(inst, b, writes)) {            // UNION (commutative accumulate)
+        auto it = writers.find(b);
+        if (it != writers.end())
+          for (auto& s : it->second)
+            if (s->get_compute_type() != MATMUL_CT)
+              s->add_dep(inst, DepEvent::DONE);   // wait the init/bias seed only
+        writers[b].push_back(inst);        // join; no reset, no co-matmul edge
+      } else {                             // REPLACE (normal output; resets the producer set)
+        writers[b] = { inst };
+      }
+    }
+    tile->append_instuction(inst);
+  };
+
+  // --- SRAM-capacity tracking (buffer-version allocations, sec 10.x) ---
+  // A coarse tile = one version of its buffer; the fine DMAs that fill it share
+  // one allocation, freed once all the version's consumers have issued (refcount
+  // -> 0). NOT reset in flush(): the spad is one physical per-core resource, so a
+  // buffer reused by the next reduction iter / work-item is a NEW version that
+  // must wait for the old one to free (WAR / double-buffer). Both DMA-loaded
+  // buffers AND compute outputs (the accumulator, vector epilogue results) are
+  // tracked; the virtual SA-weights are not (weight slots model them). (v1:
+  // single-core; multi-core would key cur_alloc/vers by (core, buf).)
+  int64_t next_alloc = 0;
+  std::map<int64_t, int64_t> cur_alloc;   // buf -> current version id
+  std::map<int64_t, bool> open_ver;       // buf -> version still accepting writes
+  struct Ver { std::vector<std::shared_ptr<Instruction>> loads, readers; };
+  std::map<int64_t, Ver> vers;
+  // Spad bytes per buffer id, taken from the DMA records that touch it (load fills
+  // its dst, store drains its src) -- the authoritative tile size. A compute output
+  // (never DMA-loaded but stored) gets its footprint from its store record. Built
+  // in a pre-pass so it is known before the producing compute is processed.
+  auto rec_bytes = [](const TraceRec& t) {        // single source of the tile footprint
+    size_t numel = 1;
+    for (auto d : t.dims) numel *= (size_t)d;
+    return numel * (t.elem_bits / 8);              // integer division: whole bytes per element
+  };
+  std::map<int64_t, size_t> buf_bytes;
+  for (const auto& t : run.trace) {
+    if (t.kind != TraceRec::DMA) continue;
+    const auto& bs = (t.dir == 1) ? t.read_bufs : t.write_bufs;  // store reads spad, load writes spad
+    for (int64_t b : bs) buf_bytes[b] = rec_bytes(t);            // last DMA record touching b wins
+  }
+  auto sram_on_load = [&](int64_t b, const std::shared_ptr<Instruction>& ld) {
+    if (!cur_alloc.count(b) || !open_ver[b]) {   // a read closed it -> new version
+      cur_alloc[b] = next_alloc++;
+      open_ver[b] = true;
+      vers[cur_alloc[b]] = {};
+    }
+    ld->set_sram_alloc(cur_alloc[b]);
+    vers[cur_alloc[b]].loads.push_back(ld);
+  };
+  // A compute that freshly produces buffer b (b not read-and-written in place) opens
+  // a version like a load; the opener carries b's footprint (from buf_bytes). A
+  // version continues across the producing writes until a consuming read closes it,
+  // and its last reader frees it (sram_finalize) -- identical lifecycle to a load.
+  auto sram_on_write = [&](int64_t b, const std::shared_ptr<Instruction>& w) {
+    auto bb = buf_bytes.find(b);
+    if (bb == buf_bytes.end()) return;           // size unknown (never DMA'd) -> untracked
+    if (!cur_alloc.count(b) || !open_ver[b]) {   // a consuming read closed it -> new version
+      cur_alloc[b] = next_alloc++;
+      open_ver[b] = true;
+      vers[cur_alloc[b]] = {};
+      w->set_sram_alloc(cur_alloc[b]);
+      w->set_sram_footprint(bb->second);
+      vers[cur_alloc[b]].loads.push_back(w);
+    }
+    // already-open version (further producing writes): same physical bytes, no re-add.
+  };
+  auto sram_on_read = [&](int64_t b, const std::shared_ptr<Instruction>& rd) {
+    auto it = cur_alloc.find(b);
+    if (it == cur_alloc.end()) return;           // no version opened by a load or sized write -> untracked
+    vers[it->second].readers.push_back(rd);
+    open_ver[b] = false;                          // next write starts a new version
+  };
+  auto sram_finalize = [&]() {                    // tag only each version's LAST reader
+    for (auto& kv : vers) {
+      auto& v = kv.second;
+      if (v.readers.empty()) {                    // no consumer -> never freed: untrack
+        for (auto& ld : v.loads) ld->set_sram_alloc(-1);
+        continue;
+      }
+      v.readers.back()->add_sram_release(kv.first);  // it frees the whole version on issue
+    }
+  };
+
+  for (const auto& t : run.trace) {
+    if (t.kind == TraceRec::TILE_BEGIN) {
+      // togsim_dispatch opened a work-item -> new subgraph (bound to its core) +
+      // tile. The scope runs until the matching TILE_END (the dispatch wrapper
+      // brackets the tile fn call), not until the next begin.
+      flush();
+      sg = std::make_shared<TileSubGraph>();
+      sg->set_core_id(t.core);
+      tile = std::make_shared<Tile>(Tile::Status::INITIALIZED);
+      cur_tile_group++;
+      continue;
+    }
+    if (t.kind == TraceRec::TILE_END) {
+      flush();   // close the work-item explicitly (scope = the tile fn call)
+      continue;
+    }
+    if (!tile) continue;  // defensive: ops outside a TILE_BEGIN..TILE_END bracket are dropped
+
+    if (t.kind == TraceRec::DMA) {
+      int64_t uniq = next_tag++;                         // fresh Core tag key per dma record
+      auto inst = make_dma(t, uniq);
+      inst->set_tile_group(cur_tile_group);
+      tile->inc_required_sram_size(rec_bytes(t));         // SRAM footprint (ready-tile ordering)
+      if (t.dir == 1) {                                  // STORE
+        // store reads the result buffer(s) -> link() JOINs all their writers.
+        link(inst, t.read_bufs, t.write_bufs);
+        for (int64_t b : t.read_bufs) sram_on_read(b, inst);  // store frees what it drains
+      } else {                                           // LOAD
+        tile->append_instuction(inst);
+        // async load: record it as the CURRENT load for this (tag_id, tag_slot)
+        // with its fresh uniq; the barriers in this reduction iteration reuse that
+        // uniq (1 load : N barriers). A new iteration's load overwrites it with a
+        // new uniq -> distinct tag key, no collision. writers = the dma for now;
+        // the barrier overwrites it so consumers gate on data arrival. A sync load
+        // has no barrier and blocks to arrival itself.
+        if (t.is_async) current_dma[{t.tag_id, t.tag_slot}] = {uniq, inst};
+        for (int64_t b : t.write_bufs) {
+          // No hard WAR edge here: load-buffer reuse (double-buffering, X_spad/
+          // W_spad reloaded each reduction iter) is modeled by the SRAM
+          // version/capacity machinery (sram_on_load), which sizes how many
+          // versions physically coexist. A latency WAR edge would force
+          // single-buffering and kill the overlap the spad permits. (The
+          // accumulator Y is NOT a load buffer -> its cross-tile WAR is handled by
+          // the REPLACE branch of link() when the next tile's init overwrites it.)
+          writers[b] = { inst };
+          sram_on_load(b, inst);                         // occupy spad
+        }
+      }
+    } else if (t.kind == TraceRec::MEMORY_BAR) {
+      // the explicit async-DMA sync (the original dma_wait). Pair with the CURRENT
+      // load for this (tag_id, tag_slot), reusing its uniq Core key so the dma and
+      // bar pair in the tag table; the dma releases the bar at issue-complete
+      // (a DONE edge), then the bar parks on the tag until data-ready (resp-complete,
+      // set_tag_finish). Consumers of the loaded buffer then gate on the bar, so
+      // the bar (not the load) is the load's DONE-handle in writers(b).
+      auto it = current_dma.find({t.tag_id, t.tag_slot});
+      int64_t uniq = next_tag++;                         // fallback if unpaired (an id is consumed even when paired)
+      std::shared_ptr<Instruction> dma_inst;
+      if (it != current_dma.end()) { uniq = it->second.first; dma_inst = it->second.second; }
+      // Identical wait (same slot, same load instance) already has a barrier -> reuse it
+      // so the buffer's consumers gate on it, instead of emitting a redundant barrier.
+      auto bf = bar_for_load.find({t.tag_id, t.tag_slot});
+      if (bf != bar_for_load.end() && bf->second.first == uniq) {
+        for (int64_t b : t.write_bufs) writers[b] = { bf->second.second };
+        continue;
+      }
+      auto bar = make_mem_bar(t, uniq);
+      bar->set_tile_group(cur_tile_group);
+      if (dma_inst) dma_inst->add_dep(bar, DepEvent::DONE);
+      tile->append_instuction(bar);
+      // the bar is the load's DONE-handle: REPLACE writers(b) with it (no WAR -- the
+      // load already WAR'd the prior readers when it wrote).
+      for (int64_t b : t.write_bufs) writers[b] = { bar };
+      bar_for_load[{t.tag_id, t.tag_slot}] = {uniq, bar};
+    } else if (t.kind == TraceRec::COMPUTE) {
+      auto inst = make_compute(t);
+      inst->set_tile_group(cur_tile_group);
+      link(inst, t.read_bufs, t.write_bufs);
+      // in-place buffers (read AND written) are version-transparent (accumulator,
+      // in-place vector): skip the self-read and the self-write so footprint is not
+      // double-counted. read_bufs/write_bufs are tiny, so a linear scan beats a set.
+      auto in = [](const std::vector<int64_t>& v, int64_t b) {
+        return std::find(v.begin(), v.end(), b) != v.end();
+      };
+      for (int64_t b : t.read_bufs)  if (!in(t.write_bufs, b)) sram_on_read(b, inst);   // consuming reads
+      for (int64_t b : t.write_bufs) if (!in(t.read_bufs, b))  sram_on_write(b, inst);  // fresh outputs
+    }
+  }
+  flush();
+  sram_finalize();   // readers per version are now final -> tag each version's last reader (or untrack it)
+  return tg;
+}
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
index 6d2537d9..7fea374b 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
@@ -22,3 +22,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
index f830419b..3a96b588 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
index 1a8c60f6..41e267b6 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
@@ -25,3 +25,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
index ff976784..397f0fb7 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
index 2ed1bb12..f080fc69 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
index 1bcc9bb3..f89661b8 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 8
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
index 39d195b0..ca69d930 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
@@ -28,3 +28,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
index bf01913b..b7b03e7a 100644
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
index 8c71c528..903ffcbc 100644
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
@@ -34,3 +34,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
index d058f188..6a234017 100644
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
@@ -28,3 +28,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
index 019a0f0f..f0546e56 100644
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
@@ -27,3 +27,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
index 348babae..08ec26ac 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
@@ -25,3 +25,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
index a0985aec..a6e073e9 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
index 166e2e25..5436b3e8 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
@@ -29,3 +29,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
index 6119e83d..d928f9d3 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
@@ -30,3 +30,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
index 9100c22a..dd9dfac7 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
@@ -28,3 +28,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml
index f46d380e..1593e148 100644
--- a/configs/systolic_ws_8x8_c1_booksim.yml
+++ b/configs/systolic_ws_8x8_c1_booksim.yml
@@ -23,3 +23,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB.
+core_spad_size_kb: 1024
diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml
index 1be24b85..b2d16c6a 100644
--- a/configs/systolic_ws_8x8_c1_simple_noc.yml
+++ b/configs/systolic_ws_8x8_c1_simple_noc.yml
@@ -24,3 +24,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB.
+core_spad_size_kb: 1024
diff --git a/docs/design/togsim_cpp_trace.md b/docs/design/togsim_cpp_trace.md
new file mode 100644
index 00000000..9565bdfb
--- /dev/null
+++ b/docs/design/togsim_cpp_trace.md
@@ -0,0 +1,1006 @@
+# TOGSim C++ Trace Generation — Design Proposal
+
+**Status:** Implemented end-to-end through the real timing Core (256^3 GEMM); see
+§11 for remaining work.
+**Branch:** `feature/togsim-cpp-trace`
+**Scope:** Replace the timing-path TOG producer (MLIR → Python-dict → ONNX → C++
+parser) with a compiled, shape-parametric trace producer (MLIR → C++ → `.so`).
+TOGSim's timing core is preserved.
+
+**Note on the sync mechanism (read before §3, §5, §9).** An earlier version of
+this design synchronized an asynchronous DMA with the consumer that waits on its
+data using a compile-time integer `event_id` — one id per static `togsim.dma`/
+`togsim.wait` op, paired through a heap "event buffer" of opaque handles. That
+mechanism was *removed*: a single static `togsim.dma` op executes once per loop
+iteration, each iteration writing a different runtime tag slot, so one
+compile-time id per op cannot represent the per-iteration pairing. The current
+design (ABI v11) pairs an async DMA with its sync point by the **runtime tag
+slot** instead. Sections below have been rewritten to the runtime-tag model;
+where a section still mentions `event_id` / event handles / `togsim_wait` /
+`togsim_signal`, it is flagged as the superseded design, not current behavior.
+
+---
+
+## 1. Motivation
+
+The current Tile-Operation Graph (TOG) pipeline has accumulated structural debt
+that blocks where we want to go (notably dynamic shape for LLM decode / MoE):
+
+1. **"ONNX in name only."** TOG is serialized as ONNX, but every op is a custom
+   `torchsim_*` attribute. We pay ONNX's costs (rigid schema, protobuf,
+   stringly-typed attribute encoding) and use none of its interop value
+   (onnxruntime, standard ops, netron). The schema lives in three places —
+   Python dict (`extension_op.py`), ONNX (`AsmParser/onnx_utility.py`), C++
+   (`TOGSim/.../TileGraphParser`) — and drifts.
+
+2. **Synchronization is ad-hoc and DMA-specific.** Completion tracking is a
+   counting-semaphore in disguise, but unnamed and tangled:
+   - `DMA.h`: `tag_table[subgraph][tag_key] -> uint32` with overloaded magic
+     values (`0` pending, `1` signaled, `>1` consumed-count, `-1` sparse) plus a
+     parallel `waiters` wait-queue. The `tag_key` is a hand-rolled
+     content-addressed vector computed from loop indices/strides (`calc_tag`),
+     with implicit fallbacks (push `0` when an index is missing, dedup by
+     silently `continue`-ing).
+   - A *second*, separate dependency mechanism — `Instruction::ready_counter` +
+     `child_inst` graph edges — handles structural ordering.
+   - Net: one concept ("an async op completed; a consumer may proceed") is
+     expressed two different ways, and the event-like one only works for DMA.
+
+3. **Static shape is baked in.** `build_tog._affine_for_bounds` resolves loop
+   bounds to constants (`_const_index_value`). The graph is fully materialized
+   per static shape, so dynamic shape forces recompile-per-shape — pathological
+   for decode (a new `seq_len` every step) and MoE (variable expert load).
+
+4. **Loop-flattening hackery.** Much of the roughness (`loop_end` tricks,
+   `calc_tag`, dedup-by-skip, magic offsets) exists only to flatten loop nests
+   into a static graph.
+
+See [Appendix A](#appendix-a-current-state-references) for file:line references.
+
+## 2. Key idea: trace-driven → execution-driven
+
+Instead of materializing a flattened graph, **TOG becomes a stream emitted by
+*running* a shape-parametric producer.** The producer is C++ compiled from the
+kernel's MLIR; it keeps loops as loops (with symbolic bounds) and calls a small
+**event-based API**. Each API call emits one trace record = one modeled
+instruction. TOGSim `dlopen`s the producer `.so`, injects a callback context
+that records and times the stream.
+
+This directly resolves the four problems:
+
+| Problem | Resolution |
+|---|---|
+| ONNX-in-name-only / 3-place schema | The API signature is the single contract. No ONNX. |
+| DMA-only, ad-hoc sync | An async DMA and the consumer that waits on its data are paired at runtime by the tile's tag slot, through the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). The DMA signals the tag when its data arrives; an explicit `togsim.memory_barrier` waits on it and becomes the last-writer of the loaded buffer, so consumers gate on data arrival. No content-hashed `calc_tag`, no magic values. |
+| Static shape | Loop bounds flow from MLIR as-is; symbolic bounds become native loop bounds in C++, so trip count is dynamic. |
+| Loop-flatten hacks | Loops stay loops; the trace is generated by executing them. `calc_tag`/dedup disappear. |
+
+It is *not* a dynamic hardware scheduler: control flow is still statically
+emitted by the compiler. The `.so` is a deterministic **trace generator**, not a
+timing model — it keeps the trace-as-data boundary, so TOGSim's timing core is
+untouched.
+
+## 3. Core algebra
+
+Small, orthogonal primitives. Everything else is composition (Layer-1 helpers
+like `double_buffered_loop`, not IR primitives).
+
+- `dma(dir, arg_id, offset, shape, is_async, tag_id, tag_slot, …)` —
+  `dir ∈ {LOAD, STORE}`. Returns void. A **synchronous** (non-async) DMA is
+  blocking: it finishes when its data arrives, and consumers depend on it
+  directly. An **async** DMA returns control immediately and signals its tag at
+  data arrival (DMA response-complete); a later `memory_barrier` is the explicit
+  point that waits on it.
+- `compute(tile_id, dims…)` — references a fixed-size tile kernel; cost is looked
+  up (§6), not computed here.
+- `memory_barrier(tag_id, tag_slot, write_bufs)` — the explicit async-DMA sync.
+  It waits until the async DMA carrying the same `(tag_id, tag_slot)` has
+  delivered its data, then becomes the last-writer of the loaded buffer so
+  consumers gate on data arrival. It is the original `memref.dma_wait` mapped
+  through from the source IR, not a synthesized barrier.
+- `compute_barrier()` — a compute fence inserted before a store, so the store
+  sees the drained accumulator. This is the **one** remaining auto-inserted
+  barrier; it is marked FIXME in the code as something that should also become
+  explicit in the source IR later (§10.7.3).
+- **Control flow lives in the producer** — ordinary `for`/`if`/`while` with
+  runtime bounds. Loop types (normal/parallel/accumulation/inner) and dynamic
+  shape are just producer loops; the emitted trace is already specialized.
+
+Two distinct things share the word "tag", and the design uses **both together**
+as the dma↔barrier pairing key:
+
+- **`tag_id`** — the identity of a DMA's *tag memref*. Together with the runtime
+  `tag_slot` index, it identifies which async DMA a `memory_barrier` waits on.
+- **`tag_slot`** — the SRAM tile slot the loaded tile occupies (the
+  double-buffer / SRAM-capacity index). It is *also* part of the pairing key
+  because each load's tile maps to its own slot. The slot is **subtile-only**:
+  `lower_to_vcix` writes the dma_wait tag index with a `-acc_iv` term for each
+  accumulation (reduction) loop var — a sentinel marking the reduction axis, not
+  an arithmetic offset — and `build_skeleton` strips those terms so a
+  `memory_barrier` waits on the same slot its async load wrote. (Mirrors legacy
+  `TileGraphParser`, which skips stride -1; reduction iterations are told apart
+  by the per-iteration tag alloc + a fresh per-record Core key in the bridge, not
+  by the slot.) Without the strip, the producer evaluates `-acc_iv` to a negative
+  slot at reduction iteration > 0 and the pairing fails on subtile + multi-tile-K.
+
+Pairing is done at runtime by the existing TOGSim Core tag table: the async DMA
+calls `prepare_tag_key` and `set_tag_finish` (signal at data arrival), the
+`memory_barrier` calls `register_tag_waiter` (wait on `(tag_id, tag_slot)`).
+A synchronous DMA needs no barrier — it blocks until data arrival itself.
+
+> **Superseded.** An earlier version used a neutral `event` completion token
+> (freely allocated, not tied to memory) with `signal`/`wait`/`wait_all`
+> primitives. That has been removed in favor of the runtime-tag mechanism above.
+
+## 4. Decisions (locked)
+
+| Axis | Decision |
+|---|---|
+| Input MLIR | Use the **given MLIR as-is**. Do not touch inductor / MLIR templates / shape plumbing. Whatever bounds the MLIR carries (const or symbolic) pass through verbatim. |
+| MLIR → C++ | **EmitC dialect + `mlir-translate --mlir-to-cpp`** (upstream). |
+| `.so` ↔ TOGSim | **`dlopen` + `EmitCtx` callback** (execution-driven). The ABI boundary is the main design surface. |
+| `.so` role | **Timing trace only.** Functional correctness stays on the existing Spike/LLVM path. Strip every op without a timing dependency; keep loop skeleton + API ops + ops feeding bounds/addresses. |
+| Compute cycle | A **separate annotation pass** reuses the existing **sample-mode** to produce a **precomputed `tile_id → cycle` table**, looked up at runtime. |
+| Dynamic shape | Falls out of symbolic loop bounds in the MLIR. Per-tile cost is static (tiles are fixed-size); only trip count is dynamic. |
+
+## 5. Architecture
+
+### 5.1 Artifacts (per kernel)
+
+- **Trace `.so`** — compiled from the skeleton+API MLIR. Shape-parametric:
+  symbolic bounds become C++ function parameters. Calls the runtime API
+  (`togsim_dma`, `togsim_compute`, `togsim_memory_barrier`, …).
+- **Cycle table** — `tile_id → cycle`, produced by the annotation pass.
+
+### 5.2 Pipeline (input = given MLIR)
+
+```
+given MLIR (affine/scf.for + memref.dma_start/dma_wait + vcix/vector compute)
+│
+├── Branch A (trace):
+│     C2 build_skeleton pass  (reuse build_tog traversal)
+│        • affine/scf.for kept, bounds as-is (symbolic preserved)
+│        • dma_start → togsim.dma(... tag_id, %tag[%idx], is_async)
+│        • dma_wait  → togsim.memory_barrier(tag_id, %tag[%idx], write_bufs)
+│        • compute block       → togsim.compute(tile_id, dims)
+│        • DCE: drop ops with no dependency to loop/address/API operands
+│     → C4 togsim→emitc lowering  (togsim.* → emitc.call_opaque;
+│        convert-scf/arith-to-emitc; func args incl. symbolic shapes)
+│     → mlir-translate --mlir-to-cpp
+│     → C5 compile → trace .so   (cached by kernel key)
+│
+└── Branch B (cost):
+      C3 annotation pass over the same MLIR
+        • extract per-tile compute bodies, assign tile_id
+        • run through existing sample-mode → tile_id → cycle table
+
+TOGSim (C6):
+  dlopen(trace.so) → resolve togsim_kernel
+  inject EmitCtx { tag table; record sink; cost = cycle_table[tile_id] }
+  togsim_kernel(ctx, runtime_shape_args...)   // producer runs, emits stream
+  → existing timing core consumes the recorded Instruction stream
+```
+
+### 5.3 Components
+
+- **C1 — `togsim` API op vocabulary.** `togsim.dma(...)` (void result, carrying
+  `tag_id`, the runtime tag-index operand, `is_async`),
+  `togsim.memory_barrier(tag_id, tag_slot, write_bufs)`,
+  `togsim.compute(tile_id, dims)`, `togsim.compute_barrier`. Kept *unregistered*
+  (like the existing `togsim.transfer`), so no C++ dialect registration; the
+  togsim→emitc step is a custom Python rewrite, not a registered ConversionPass.
+- **C2 — `build_skeleton` pass.** Sibling to `build_tog.py`, reusing its
+  traversal (matmul FSM, `_dma_start_fields`, loop typing). Emits the
+  skeleton+API MLIR instead of TOG nodes; preserves `is_async`. The original
+  `memref.dma_wait` is mapped through to an explicit `togsim.memory_barrier`
+  carrying the DMA's `tag_id` and the runtime tag-index operand.
+- **C3 — annotation pass + cycle table.** Reuses sample-mode to sample the
+  deterministic per-tile cycle; emits the `tile_id → cycle` table artifact.
+- **C4 — togsim→emitc lowering.** Maps each `togsim.*` op to an
+  `emitc.call_opaque "togsim_*"`; lowers control flow via `convert-scf-to-emitc`
+  / `convert-arith-to-emitc`; func arguments (including symbolic shapes) become
+  C++ parameters. Then `mlir-translate --mlir-to-cpp`.
+- **C5 — `.so` build.** Compile emitted `.cpp` + `togsim_runtime.h` to `.so`
+  via the existing toolchain; cache by kernel key.
+- **C6 — TOGSim runtime + loader.** `togsim_runtime.h/.cc`: `EmitCtx` and the
+  `togsim_dma/compute/memory_barrier/compute_barrier/core_alloc`
+  implementations (compute looks up the cycle table). Loader `dlopen`s the
+  `.so`, calls `togsim_kernel` with runtime shape args, records the stream, feeds
+  the existing timing core. An async DMA and its `memory_barrier` are paired at
+  runtime by `(tag_id, tag_slot)` through the existing Core tag table.
+
+### 5.4 ABI sketch (current: v11)
+
+```c
+// togsim_runtime.h — shared contract between emitted .cpp and TOGSim
+typedef struct EmitCtx EmitCtx;
+
+void togsim_dma(EmitCtx*, int32_t dir, int32_t arg_id, uint64_t offset,
+                int32_t ndim, const int64_t* dims, const int64_t* strides,
+                int32_t elem_bits, int32_t is_async,
+                int32_t tag_id, uint64_t tag_slot,
+                const int64_t* read_bufs, int32_t n_read,
+                const int64_t* write_bufs, int32_t n_write);
+
+void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t tag_slot,
+                           const int64_t* write_bufs, int32_t n_write);
+
+void togsim_compute(EmitCtx*, uint64_t tile_id, int32_t compute_type, /* dims */ ...);
+void togsim_compute_barrier(EmitCtx*);
+int32_t togsim_core_alloc(EmitCtx*);
+
+// entry point the loader resolves:
+void togsim_kernel(EmitCtx*, int64_t* shape_args, int32_t n_shape_args);
+```
+
+`togsim_dma` returns void (no handle). An async DMA carries `(tag_id, tag_slot)`;
+the matching `togsim_memory_barrier` waits on the same pair through the Core tag
+table. The symbols are resolved as free `extern "C"` functions: the loaded `.so`
+links back into the Simulator binary (built with `ENABLE_EXPORTS`).
+
+> **Superseded.** v2–v10 evolved through a `togsim_event` handle type with
+> `togsim_dma` returning a handle and `togsim_wait`/`togsim_signal`/
+> `togsim_wait_all` plus `togsim_event_alloc`/`togsim_event_free`. v11 removed
+> all of those; see the note at the top of this doc and §9.6.1.
+
+## 6. Compute cost model
+
+The annotation pass (C3) reuses **sample-mode** to measure each tile's
+deterministic cycle once and stores a **precomputed `tile_id → cycle` table**.
+`togsim_compute` looks it up at runtime.
+
+This is consistent with dynamic shape because **tiles are fixed-size**
+(`TILE_M/N/K`): the per-tile cycle is invariant; only the *number* of tiles
+(loop trip count) varies, and that is handled by the symbolic loop in the `.so`.
+
+**Open edge case — remainder tiles.** When a dimension is not divisible by the
+tile size, edge tiles are partial and have a different cycle than the table
+entry. Options: pad to full-tile cost (simple, small error) vs. sample a
+separate `tile_id` for the remainder. To be decided at P4.
+
+## 7. Milestones
+
+- **P0** — DONE. New branch; runtime API header (C6 surface) + `togsim` op
+  vocabulary (C1).
+- **P1** — DONE. `build_skeleton` pass (C2) on a matmul kernel; verified against
+  the legacy `build_tog` TOG. The async DMA's `memref.dma_wait` is mapped through
+  to an explicit `togsim.memory_barrier` carrying the DMA's `tag_id` and the
+  runtime tag-index operand; the IR verifies across sibling prefetch/compute loop
+  nests because the pairing is by runtime tag slot, not a cross-region SSA edge.
+- **P2** — DONE. togsim→emitc (C4) + `mlir-translate` + compile (C5) → `.so` for
+  that kernel (static shape). C4 rewrites the unregistered `togsim.*`/signature
+  then drives the upstream `lower-affine`/`convert-*-to-emitc` passes, with a
+  small fold for residual `emitc.for` bound casts (see §8). Base addresses
+  stubbed to 0 (wired in P3).
+- **P3** — DONE. TOGSim loader + runtime (C6) + cycle table (C3); runs end-to-end
+  through the real Simulator/Core (256^3 GEMM via `--trace_so`). Parallelism /
+  reduction / core dispatch design is locked in **§9** (core-transparent work
+  function + `togsim_core_alloc` hook). Async DMA↔consumer sync is the runtime
+  tag-slot mechanism (`togsim.memory_barrier`), not an event-id.
+- **P4** — Symbolic bounds end-to-end on a decode-style kernel; verify trace
+  length scales with runtime shape; decide remainder-tile handling.
+- **P5** — Migrate remaining op families (conv, SDPA, vector).
+
+## 8. Risks / open questions
+
+- **Remainder tiles vs. precomputed table** (§6) — P4.
+- **ABI versioning** — RESOLVED. Free `extern "C"` symbols (the `.so` links back
+  into the Simulator binary via `ENABLE_EXPORTS`); `TOGSIM_ABI_VERSION` is v11.
+- **togsim→emitc for unregistered ops** — must be a custom rewrite to
+  `emitc.call_opaque`, since unregistered ops have no registered conversion
+  patterns.
+- **EmitC coverage** — RESOLVED (P2). C4 uses the upstream conversion passes
+  (`lower-affine`, `convert-scf-to-emitc`, `convert-arith-to-emitc`,
+  `convert-func-to-emitc`). One gap in this LLVM 20 build:
+  `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so
+  `convert-arith-to-emitc` leaves `builtin.unrealized_conversion_cast` on the
+  bounds (`emitc.size_t`↔`index`) that `--reconcile-unrealized-casts` cannot
+  fold and `mlir-to-cpp` cannot print. C4 adds a small post-pass
+  (`_retype_for_to_size_t`) that retypes each `emitc.for` to `!emitc.size_t`
+  bounds + IV (`emitc.for` accepts size_t with the explicit type) and folds the
+  residual index<->size_t casts. A size_t IV also makes the lowered *address*
+  arithmetic cast-free, which is what lets P3 wire real addresses (approach A):
+  `togsim_dma` passes `(arg_id, element offset)` where the offset is computed
+  from the loop IVs and lowered by `convert-arith-to-emitc`.
+- **async/fire-and-forget** — `is_async` preserved on `togsim.dma`. An async DMA
+  signals its tag at data arrival; a sync DMA is blocking. A DMA with no matching
+  `memory_barrier` is fire-and-forget (nothing waits on its tag).
+
+## 9. P3 design: parallelism, reduction, and core dispatch (locked)
+
+How the trace producer expresses *which core runs what*, *what is parallel*, and
+*what is a reduction* (cross-iteration dependency). This is the design for P3.
+
+### 9.1 Where the semantics come from
+
+Nothing new has to be inferred — the post-vcix `affine.for` already carries the
+mapping decision the frontend made, and `build_skeleton` preserves it:
+
+| attribute | meaning | role |
+|---|---|---|
+| `outer_loop` | PARALLEL axis (e.g. GEMM m, n) | independent output tiles -> distributable across cores |
+| `accumulation_loop` | REDUCTION axis (e.g. GEMM k) | partial sums into one output tile -> ordered dependency |
+| `inner_loop` | tile micro-loop | within one tile |
+
+This matches what legacy TOGSim already does with `torchsim_loop_type`
+(`TileGraphParser`: PARALLEL -> `outer_loop_idx` selects a core; ACCUMULATION ->
+`accum_tag` groups dependent partials). The current gap is only that
+`lower_to_emitc` (P2) *drops* these attributes when it lowers `affine.for` to
+`emitc.for`, producing a flat single-stream producer.
+
+### 9.2 Principle: bake intrinsic, parameterize extrinsic
+
+Two different kinds of hardware dependence must be treated differently:
+
+- **Intrinsic** (vlane / vector width, `TILE_M/N/K`, systolic size) — defines the
+  *content and cost of each instruction*. Already baked into the IR; correct.
+- **Extrinsic** (`num_cores`) — defines only the *distribution* of an otherwise
+  fixed set of work-items. The tile set, the per-tile cost table
+  (`tile_id -> cycle`), and the DMA tile shapes are all `num_cores`-invariant.
+
+Therefore `num_cores` is **not** baked into the producer. The producer is
+**core-count transparent**: it knows nothing about how many cores exist.
+
+### 9.3 Model: core-transparent work function + dispatch hook
+
+The producer is two functions, split at the PARALLEL/ACCUMULATION boundary:
+
+```c
+// WORK: trace for ONE independent output tile. Core-transparent: takes the
+// PARALLEL indices directly, names no core. Reduction (k) is program order ->
+// the dependency is implicit (the accumulator is core-local). An async load is
+// synced to its consumer by an explicit memory_barrier on the same tag slot.
+void togsim_kernel_tile(EmitCtx* ctx, int64_t mi, int64_t ni, int64_t* shape) {
+  togsim_core_alloc(ctx);                // first line: new work-item + pick core
+  togsim_compute(ctx, /*tile_id=*/0, ...);            // acc init
+  for (size_t ki = 0; ki < KT; ++ki) {                // REDUCTION = program order
+    togsim_dma(ctx, LOAD, A, offA(mi,ki), ..., /*is_async=*/1, /*tag_id=*/0, ki%D, ...);
+    togsim_dma(ctx, LOAD, B, offB(ki,ni), ..., /*is_async=*/1, /*tag_id=*/1, ki%D, ...);
+    togsim_memory_barrier(ctx, /*tag_id=*/1, ki%D, ...); togsim_compute(ctx, 1, ...);
+    togsim_memory_barrier(ctx, /*tag_id=*/0, ki%D, ...); togsim_compute(ctx, 2, ...);
+  }
+  togsim_dma(ctx, STORE, C, offC(mi,ni), ...);
+}
+
+// DISPATCH: enumerate the PARALLEL domain, one call per work-item.
+extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape, int32_t n) {
+  size_t MT = shape[0]/256, NT = shape[1]/256;
+  for (size_t mi = 0; mi < MT; ++mi)
+    for (size_t ni = 0; ni < NT; ++ni)
+      togsim_kernel_tile(ctx, mi, ni, shape);
+}
+```
+
+Reduced to three orthogonal concepts:
+
+- **Parallel** = each `togsim_kernel_tile` call is an independent work-item (no
+  tags shared across calls). TOGSim is free to place it on any core.
+- **Reduction** = ordering *inside* one work-item: program order on its core
+  (no explicit barrier). The `memory_barrier`/tag-slot mechanism is only the
+  async-DMA → consumer data sync.
+- **Core assignment** = `togsim_core_alloc(ctx)` (a runtime callback, body in
+  TOGSim) marks the work-item boundary and binds the following ops to a chosen
+  core. The producer never sees `core_id`/`num_cores`; those live only in
+  TOGSim's dispatch policy (round-robin / blocked / cost-aware via the cycle
+  table).
+
+The boundary callback lives at the start of each work-item; it cannot be folded
+away because TOGSim cannot intercept the producer-internal work-function call --
+only `togsim_*` callbacks are visible across the `dlopen` boundary.
+
+> FINAL API (supersedes the `togsim_dispatch` naming used below): the boundary +
+> core binding is **`int32_t togsim_core_alloc(EmitCtx*)`** (header v6). The
+> producer calls it at each work-item start; the **runtime owns the core pool**
+> and round-robins -- `num_cores` is NEVER baked into the producer (it is purely
+> a runtime quantity). There is **no free**: a core is an assignment, not a held
+> resource; the next `togsim_core_alloc` starts the next work-item. The returned
+> id is discarded by the producer. This keeps the producer core-count transparent
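+> while making the core mapping an explicit runtime allocation. Wherever the text
+> below says `togsim_dispatch`, read `togsim_core_alloc` -- except in the ABI v12
+> implementation-status notes (§9.4, §9.8), where `togsim_dispatch(ctx, fn, iv,
+> n)` is the later higher-order wrapper that replaced the bare marker.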
+
+### 9.4 Codegen (lower_to_emitc) and ABI deltas
+
+- `lower_to_emitc` splits the loop nest at the PARALLEL/ACCUMULATION boundary
+  into two `emitc.func`: the PARALLEL loops become `togsim_kernel` (dispatcher,
+  passing the loop indices as args); the ACCUMULATION+INNER body becomes
+  `togsim_kernel_tile`, with `togsim_core_alloc(ctx)` inserted at its entry.
+- ABI additions in `togsim_runtime.h`: `int32_t togsim_core_alloc(EmitCtx*)`
+  (runtime owns the core pool; no `num_cores` in the producer; no free).
+  `togsim_kernel_tile` may stay internal (`static`) for now; export it only if a
+  future loader wants to own the parallel enumeration (which would also need a
+  `num_tiles`-style count — not required now).
+- `tile_id -> cycle` table unchanged (num_cores-invariant).
+
+> Implementation status (P3, ABI v12): `lower_to_emitc` OUTLINES the innermost
+> PARALLEL-loop body into a uniform `togsim_kernel_tile(ctx, iv, n)` func and the
+> dispatcher loop hands it to `togsim_dispatch(ctx, fn, iv, n)` -- a higher-order
+> runtime wrapper that round-robins a core and brackets the call with
+> TILE_BEGIN/TILE_END. The work-item SCOPE is now the function call itself (not an
+> implicit "ops until the next core_alloc" range), and one general dispatcher
+> serves every kernel (uniform iv-array ABI). Earlier this was a single
+> `togsim_kernel` with a bare `togsim_core_alloc` marker; the emitted *trace* is
+> identical (one work-item bracket, then the work ops), so cycles are unchanged --
+> the outline was done to make the boundary explicit, not for timing. Address
+> arithmetic is wired (approach A): each `togsim_dma` passes `(arg_id, element
+> offset)` with the offset computed from the loop IVs (lowered by
+> `convert-arith-to-emitc`, cast-free thanks to the size_t IV retype); the runtime
+> adds the tensor base. The parallel IVs reach the tile fn through the iv array.
+
+### 9.5 Stance and the split-K exception
+
+This refines the design's "not a dynamic scheduler / static control flow":
+**per-work-item trace is static and deterministic; only the work-item -> core
+binding is dynamic** (decided by `togsim_core_alloc`). That is independent-task
+distribution, not data-dependent control flow, and it matches a real tile
+scheduler more closely.
+
+The transparent model holds while work-items are independent (data-parallel over
+output tiles). **Split-K** (a reduction split *across* cores) breaks
+independence: the producer must emit `c` partials + a combine, so the
+instruction stream then depends on `num_cores`, and the cross-core dependency
+must be a real dataflow edge (not program order). Treat split-K as a deliberate,
+scoped exception — start P3 with data-parallel only.
+
+### 9.6 Work-items form a DAG (barriers, cross-parallel reduction)
+
+Work-items are not always a flat independent set. When there is a computation
+*between* parallel loops (e.g. an op at the m-level after the inner n parallel
+loop), it can only run once the inner parallel region completes — a join /
+barrier:
+
+```
+parallel for m:
+  parallel for n: A(m,n)     # leaf work-items
+  B(m)                       # join: needs all n of this m
+```
+
+This needs **no new primitive**: it is the same dataflow-edge mechanism the trace
+already uses (§10), just at work-item granularity. The join op declares the
+leaves' output buffers as its inputs, so the bridge makes it depend on every leaf
+through the last-writer-per-buffer analysis:
+
+```
+parallel for m:
+  parallel for n: A(m,n)   // each writes a tile of m's intermediate buffer
+  B(m)                     // reads that buffer -> depends on all n of this m
+```
+
+So the general picture: **work-items form a DAG; edges are buffer producer →
+consumer dependencies.** The independent data-parallel case is the degenerate
+edge-less DAG; barriers, reduction-across-a-parallel-axis, and split-K are the
+same DAG with real dataflow edges. (Async-DMA data arrival is the one edge that
+needs an explicit `memory_barrier` on the tag slot, because the buffer write
+completes only at DMA response-complete, later than the producing op's
+issue — see §10.7.4.)
+
+> **Superseded.** An earlier version expressed these joins with a per-leaf
+> completion `event` plus `togsim_wait_all`. Those primitives were removed; joins
+> are now ordinary buffer dependencies in the dataflow DAG (§10).
+
+### 9.6.1 How a barrier finds its DMA: runtime tag-slot pairing (locked)
+
+How the explicit `togsim.memory_barrier` (lowered from `memref.dma_wait`) finds
+*which* `togsim.dma` instance's data it must wait for. The hard case is a
+reduction loop: one static `togsim.dma` op executes once per iteration, each
+iteration loading a different tile into a different runtime tag slot. The pairing
+must therefore key on a *runtime* value, not a compile-time one.
+
+The locked model: pair by the **runtime tag slot**, using the existing TOGSim
+Core tag table.
+
+- **A DMA carries `(tag_id, tag_slot)`.** `tag_id` is the compile-time identity
+  of the DMA's tag memref (which logical channel — e.g. A-load vs B-load).
+  `tag_slot` is the *runtime* tag index `%tag[%idx]`, i.e. the SRAM tile slot
+  the loaded tile occupies this iteration. Together they uniquely name this
+  iteration's load.
+- **An async DMA signals; the barrier waits.** At DMA response-complete (the
+  moment data has actually arrived in SRAM), the runtime calls
+  `set_tag_finish(tag_id, tag_slot)`. The matching `togsim.memory_barrier`
+  carries the same `(tag_id, tag_slot)`; it calls `register_tag_waiter` and is
+  woken at that signal. The barrier then becomes the **last-writer** of the
+  loaded SRAM buffer (`write_bufs`), so every consumer that reads the buffer
+  gates on data arrival through the ordinary dataflow-edge analysis (§10).
+- **A synchronous DMA needs no barrier.** It is blocking — it finishes at data
+  arrival itself, and consumers depend on it directly.
+- **Reduction iterations do not collide.** Because `tag_slot` is the runtime
+  index, iteration `i`'s DMA and iteration `i`'s barrier share a slot that is
+  distinct from (or correctly reused after) other iterations — exactly the
+  per-iteration pairing a compile-time id could not express. The
+  double-buffer/pipeline depth is the slot's lifetime, owned by the Core's tag
+  table.
+
+**What this drops vs legacy `tag_table`:** no `calc_tag` content-hash, no magic
+values (`0`/`1`/`-1`/`>1`), no FIFO, no in-order assumption. The pairing key is
+`(tag_id, tag_slot)`, both carried explicitly on the trace ops.
+
+> Status: IMPLEMENTED (ABI v11). `build_skeleton` maps `memref.dma_wait` to
+> `togsim.memory_barrier` and tags `togsim.dma` with `tag_id` + the runtime
+> tag-index operand; `lower_to_emitc` lowers both; the runtime pairs them via
+> `prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`. Verified bad=0 on the
+> 256^3 GEMM. (All current fixtures have tag memref size 1, i.e. single-buffer;
+> deeper double-buffer pipelines exercise more slots but use the same key.)
+>
+> **Superseded.** ABI v5–v10 used a dynamically minted `togsim_event` handle
+> parked in a heap "event buffer" (`togsim_event_alloc`/`togsim_event_free`),
+> with `togsim_dma` returning the handle and `togsim_wait(handle)` consuming it.
+> That mechanism — and the earlier static `event_id` it replaced — could not
+> represent per-iteration reduction pairing and was removed in v11 in favor of
+> the runtime tag slot above.
+
+### 9.7 Execution / simulation model: trace generation (not co-execution)
+
+The producer is a **pure trace (DAG) generator**: running its loops *emits* the
+ordered op stream + dependency edges. It never computes cycles, models hardware,
+or schedules. Two consequences pin the model:
+
+- **What is an edge vs. what blocks.** Data dependencies (buffer producer →
+  consumer edges, plus the async-DMA `memory_barrier` on its tag slot) are
+  recorded *edges* — the producer does not block on them. The only thing that
+  ever blocks the producer is *resource backpressure* (finite cores,
+  double-buffer / SRAM slots, DMA-queue depth), and that is pure flow control,
+  not timing semantics.
+- **Cores, double-buffering, DRAM/NoC are the timing core's job — reused, not
+  reimplemented.** TOGSim's timing core already models all of this when it
+  consumes the legacy TOG (Appendix A: `tag_table` double-buffer sync,
+  `num_cores`). The producer stays oblivious; depths/counts are consumer-side
+  config.
+
+Consumption is staged via a swappable **sink** behind the callbacks, so the
+choice does not touch the producer or the ABI:
+
+| | sink | threads | when |
+|---|---|---|---|
+| **P3** | *materializing* — callbacks append to the timing core's input; reuse its existing scheduler/timing | none | static shape; like-for-like cycle-equivalence vs `build_tog` |
+| **P4+** | *streaming* — callbacks push to a bounded queue; the producer runs as a fiber/coroutine and blocks on backpressure; the DES loop advances time, frees resources, resumes it | producer fiber | only when dynamic-shape trace size makes full materialization impractical |
+
+This is **not** timing co-execution: even the streaming sink only blocks the
+producer on resource flow-control, never on timing-resolved data events. It is
+the lazy/streamed realization of the same trace model. Decision: **do P3 with
+the materializing sink (no threads); defer streaming to P4 as a sink swap.** The
+single forward-compat requirement is that the callback sink is an interface.
+
+### 9.8 P3 task list
+
+1. DONE. `togsim_runtime.h` + `togsim_runtime.cc`/`togsim_loader.h`: C6 runtime
+   (`EmitCtx`) + `dlopen` loader (`run_producer`), materializing sink. Callees:
+   `togsim_core_alloc` (runtime core pool), `togsim_dma` (records a tile load/
+   store, signals its tag at data arrival), `togsim_compute` (cycle-table lookup),
+   `togsim_memory_barrier` (waits the matching `(tag_id, tag_slot)`),
+   `togsim_compute_barrier`.
+2. DONE (single-buffer). `lower_to_emitc`: OUTLINES the work-item body into
+   `togsim_kernel_tile(ctx, iv, n)` + a `togsim_dispatch` call at the work-item
+   boundary (ABI v12; was a bare `togsim_core_alloc` marker), lowers
+   `togsim.memory_barrier`, and reads `loop_type`. (Two-function outline DONE;
+   trace identical.)
+3. DONE. Real tile addresses wired (approach A): build_skeleton keeps the DRAM
+   index operand on `togsim.dma`; lower_to_emitc passes `(arg_id, offset)` and
+   `convert-arith-to-emitc` lowers the offset (size_t IV retype makes it
+   cast-free). Verified on 1024^3 GEMM: per-tile offsets are correct
+   (A[m,k]=m*1024+k, B[k,n]=k*1024+n).
+4. PARTIAL. C3 cycle table: `cycle_table.py` builds `tile_id -> (cycle,
+   overlapping_cycle)` from a per-tile `cycle_list`, with `overlapping_cycle =
+   max(cycle - offset[type], 0)` (the legacy formula) and a JSON sidecar dump.
+   Remaining (folds into task 5): feed it the gem5 sample-mode `cycle_list`
+   already computed in `extension_codecache` (reused -> both paths stay
+   cycle-consistent), and have `togsim_compute` set BOTH cycle and
+   overlapping_cycle on the Instruction.
+5. PARTIAL. C6 runtime + loader: `TOGSim/src/togsim_runtime.cc` +
+   `togsim_loader.h` implement the producer ABI and `run_producer` -- dlopen the
+   `.so`, run `togsim_kernel` against an `EmitCtx`, and record a `TraceRec` stream
+   (the materializing sink): each dma resolves `base[arg_id] + offset*elem_bytes`
+   and signals its tag at data arrival, each compute looks up the cycle table,
+   core_alloc round-robins the core. Verified standalone on the 256^3 GEMM:
+   addresses/cycles resolved correctly. DONE (sec 10, 10.7.4): the recorded
+   stream is fed into the existing timing core (Core/Simulator) -- TraceRec maps
+   to `Instruction` (compute_cycle + overlapping_cycle, dataflow-buffer deps +
+   runtime-tag barriers).
+
+Legacy path: the ONNX-TOG producer (`run_tog` -> `tog_generator` -> ONNX ->
+C++ `TileGraphParser`) is marked DEPRECATED in place (comments in
+`extension_codecache.py` and `tog_generator.py`) but kept live -- it must not
+break during the transition. It is retired only once this trace pipeline is
+stable. The cycle measurement (`cycle_list`, `x_offset`/`w_offset`) is shared,
+so the two paths stay cycle-consistent meanwhile.
+
+### 9.9 Task-5 completion roadmap: TraceRec -> Core (DONE; see §10)
+
+> **Status: implemented.** This roadmap is retained for context. The dependency
+> model it sketches (a per-`togsim_wait`-handle RAW edge) was *superseded* during
+> implementation by the explicit dataflow-DAG model in §10: edges come from SRAM
+> last-writer-per-buffer plus the vcix preload/matmul FSM, and async-DMA data
+> arrival is gated by an explicit `togsim.memory_barrier` paired on the runtime
+> `(tag_id, tag_slot)` (§10.7.4) — not by a returned event handle. Read the
+> bullets below as the original target shape, with that one substitution.
+
+Grounded by reading `Instruction.h`, `Core.cc`, `TileGraphParser.h/.cc`,
+`Simulator.cc`.
+
+**Target architecture (legacy, reused):** `ONNX -> TileGraphParser -> TileGraph
+(TileLoopNode / TileMemoryNode / TileMemoryWaitNode / TileComputeNode) ->
+Simulator distributes Tiles to Cores -> Core runs Instructions`. We replace only
+the front: build the same `TileGraph` / `Instruction`s from the recorded
+`TraceRec` stream, then hand it to the existing `Simulator`.
+
+**Mapping (TraceRec -> Instruction):** `Instruction(opcode, compute_cycle,
+num_parents, dram_addr, tile_size, tile_stride, elem_bits, tag_idx_list,
+tag_stride_list, accum_tag_idx_list)`; `ready_counter = num_parents`.
+- DMA load/store -> `MOVIN`/`MOVOUT`: `dram_addr = TraceRec.addr`, `tile_size`/
+  `tile_stride`/`elem_bits` from the dma, `tag_idx_list = {tag_slot}` (the
+  SRAM-slot key), `is_async` set. compute_cycle 0.
+- COMPUTE -> `COMP`: `compute_cycle = TraceRec.cycle`,
+  `set_overlapping_cycle(TraceRec.overlapping)`, `set_compute_type(...)`.
+- Dependency (RAW): a compute depends on its loads through the SRAM
+  last-writer-per-buffer analysis (§10); for an async load the last-writer is the
+  `togsim.memory_barrier` paired on the load's runtime `(tag_id, tag_slot)`, so
+  the compute's `ready_counter` only clears once the data has arrived (§10.7.4).
+- SRAM double-buffer / capacity (WAR): the existing Core enforces it through the
+  tag mechanism (`register_tag`/`set_tag_finish`/`mark_tag_used`, DMA.h) keyed by
+  `tag_idx_list`; our `(tag_id, tag_slot)` is that key. Reduction grouping ->
+  `accum_tag_idx_list` (the accumulation-loop index).
+
+**Build/wiring:** compile the bridge into TOGSim (it needs the conan deps;
+include flags are in `TOGSim/build/compile_commands.json`, notably
+`-D_GLIBCXX_USE_CXX11_ABI=0` and the `/root/.conan/data/{robin-hood,spdlog,fmt,
+yaml-cpp,boost}` include dirs). Add `togsim_runtime.cc` + the bridge to
+`TOGSim/CMakeLists.txt`. Either (a) build `TileGraph`/`Tile` nodes from TraceRec
+(maximal reuse of `Simulator`'s tile distribution + Core), or (b) build the
+`Instruction` DAG directly and drive a single Core. (a) is closer to legacy and
+gives multi-core for free.
+
+**Cycle-table feed:** reuse the gem5 `cycle_list` already computed in
+`extension_codecache` (so both paths stay cycle-consistent); pass it +
+`x_offset`/`w_offset` to `cycle_table.build_cycle_table`, dump the sidecar, and
+have the loader populate `EmitCtx.cyc/ovl`.
+
+**Validation:** same post-vcix fixture through both paths; compare the
+`Simulator`'s total cycles / DRAM traffic. Start with the 256^3 GEMM (static
+shape, single-buffer), then multi-tile / double-buffer kernels.
+
+This is a focused C++ integration (TOGSim build + TileGraph construction), not a
+small increment -- best executed as its own push; all the producer-side inputs
+(addresses, cycles, handles, core, tag_slot) are already in the trace.
+
+## Appendix A: current-state references
+
+- `TOGSim/include/DMA.h:27-115` — `tag_table` (overloaded `0/1/-1/>1`) +
+  `waiters`; `register_tag` / `set_tag_finish` / `register_tag_waiter` /
+  `mark_tag_used` (= init / signal / wait / consume).
+- `TOGSim/src/Core.cc:118-140, 214-324` — async-DMA signal path and the `BAR`
+  wait/consume path over the tag table.
+- `TOGSim/include/Instruction.h:40-48, 104-117` — `ready_counter` / `child_inst`
+  (the second, separate dependency mechanism) and the tag fields.
+- `PyTorchSimFrontend/mlir/passes/build_tog.py` — `TogBuilder.print_operation`
+  dispatch (`affine.for` / `memref.dma_start` / `memref.dma_wait` / `vcix.*`);
+  `_affine_for_bounds` (constant-bound resolution → static shape).
+- `PyTorchSimFrontend/mlir/passes/__init__.py`,
+  `PyTorchSimFrontend/mlir/passes/lower_to_llvm.py` — in-process Python MLIR pass
+  orchestration via the bindings; the functional Spike/LLVM path (unchanged).
+- `PyTorchSimFrontend/mlir/mlir_gemm_template.py` — kernel template emitting the
+  `affine.for` nest + `linalg.matmul` + `togsim.transfer` DMA ops.
+
+## 10. Explicit dependency-edge trace (revised dependency model)
+
+Supersedes the in-order / runtime-tag approach for expressing dependencies. The
+trace is an explicit dataflow DAG: every op declares the producers of the data it
+consumes; the consumer (Core) does all resource scheduling. Reached after finding
+that (a) flat in-order over-serializes parallel tiles, (b) the current TOG pass
+does NO dependency analysis (it emits a lexical loop tree + tags resolved at
+runtime by the C++ tag_table), and (c) compute I/O is collapsed away by
+build_skeleton, so dependencies must be recovered before the collapse.
+
+### 10.1 Representation
+
+The dependency edge is "consumer reads the buffer that producer wrote". As
+landed (ABI v9 onward; see STATUS "sec 10 explicit-edge bridge"), each op
+declares the **SRAM buffer ids** it reads and writes (`read_bufs` / `write_bufs`);
+the bridge builds the Instruction DAG by **last-writer per buffer**, scoped per
+work-item. There is no SSA event token threaded by the producer and no event
+handle returned by an op.
+
+- The edge source is data, not order: an op that reads buffer `b` gets an edge
+  from whatever op most recently wrote `b`.
+- No in-order chain, no runtime tag content-hash, no op-pattern heuristics.
+- Resource scheduling -- SA round-robin, double-buffer (<=N in flight), SRAM --
+  stays entirely in the Core. The trace never reasons about SRAM occupancy or
+  timing; it only states producer->consumer order.
+- One exception: an **async** DMA's write completes only at data arrival (DMA
+  response-complete), later than its issue, so its last-writer edge is routed
+  through an explicit `togsim.memory_barrier` that waits the load's runtime
+  `(tag_id, tag_slot)` (§10.7.4). A synchronous DMA is blocking and needs no
+  barrier.
+
+> The sketch below uses an `out_ev = op(ctx, in_events[])` SSA notation to
+> *illustrate* the edges; it predates the landed `read_bufs`/`write_bufs` form
+> and is no longer the literal ABI. Read `in={…}` as "reads these buffers".
+
+Producer C++ form (events threaded like SSA; loop-carried = a reassigned var):
+
+    for mi, ni:                                  // PARALLEL: independent tiles
+      ev acc = compute(ctx, INIT, in={});
+      for ki:                                    // REDUCTION: loop-carried acc
+        ev a = dma_load(ctx, A[mi,ki], in={});
+        ev b = dma_load(ctx, B[ki,ni], in={});
+        ev w = compute(ctx, PRELOAD, in={b});
+        acc  = compute(ctx, MATMUL,  in={a,w,acc});  // new acc event each iter
+      dma_store(ctx, C[mi,ni], in={acc});
+
+The INIT dependency reaches every accumulate transitively through the acc chain
+(INIT -> mm_k0 -> mm_k1 -> store); each node only needs edges to its immediate
+producers. Different (mi,ni) -> separate acc chains -> independent -> parallel.
+
+### 10.2 Two dependency sources (both available pre-collapse in the TOG pass)
+
+A single "SRAM access" analysis is necessary but NOT sufficient -- verified on the
+GEMM post-vcix:
+
+| dependency | source | visible in SRAM? |
+|---|---|---|
+| load -> compute (DMA writes X_spad/W_spad, preload/matmul read) | SRAM last-writer per (buffer, slot) | yes |
+| accumulator chain (INIT writes Y_spad; the drain/epilogue read-modify-writes Y_spad; store reads it) | SRAM last-writer on Y_spad | yes |
+| **preload -> matmul** (preload loads weights into the systolic-array registers; matmul consumes them) | **vcix opcode FSM** (op1=preload pairs with the following op0=matmul; build_tog already tracks this via `current_preload_node`) | **no -- SA-internal, not a memref access** |
+
+So the analysis derives edges from (1) SRAM (buffer, slot) last-writer for loads
+and the accumulator, and (2) the vcix preload/matmul pairing for the SA-weight
+dependency. The slot is a concrete value at run time (the producer runs the
+loops), so matching is by value -- no static affine-overlap math.
+
+Key facts (256^3 GEMM, post-vcix): SRAM buffers are %0=X_spad(A), %1=W_spad(B),
+%2=Y_spad(acc/out). matmul (vcix op0) reads %0 only; preload (vcix op1) reads %1;
+the matmul does NOT read %1 (weights come from the SA), which is exactly why a
+memref-only analysis lets it run before the weight load -- the preload->matmul
+edge must come from the FSM. The accumulation is the epilogue's `transfer_read
+%2 + addf + transfer_write %2`, which IS SRAM-visible.
+
+### 10.3 Components changed (as landed)
+
+- TOG pass (`build_skeleton` + `dep_analysis`, on post-vcix before collapse): per
+  op, the read/write SRAM buffer ids + the preload->matmul pairing (folded as a
+  virtual `SA_WEIGHTS` buffer) -> the read/write buffer sets.
+- ABI (`togsim_runtime.h`): `togsim_dma`/`togsim_compute` carry
+  `read_bufs`/`write_bufs`; an async DMA also carries `(tag_id, tag_slot)` for the
+  `togsim.memory_barrier` pairing. No `in_events[]`, no returned event, no
+  `event_id`/handle-buffer mechanism.
+- `lower_to_emitc`: emits the buffer-id arrays on each op (and lowers
+  `togsim.memory_barrier`).
+- bridge: builds the Instruction DAG by last-writer per buffer (`add_child`);
+  no in-order chain, no runtime tag content-hash.
+- Core: unchanged (ready_counter DAG + SA pipeline + double-buffer already exist).
+
+### 10.4 Open decisions
+
+- Reduction timing: model the acc chain as completion-serial (conservative,
+  simple) first, or as SA-pipelined (matches legacy's overlap)? RESOLVED:
+  SA-pipelined, via the occupancy/latency split (§10.7).
+- Buffer-id lifetime: the last-writer map is scoped per work-item (reset at each
+  `togsim_core_alloc`).
+
+### 10.5 Known issue: preload concurrency not bounded by #systolic-arrays
+
+Observed in the --trace_so run (256^3 GEMM): 4 PRELOADs execute concurrently
+(issue ~1028, finish ~1119-1122), but with num_systolic_array_per_core = 2 at
+most 2 should overlap, and two preloads on the same SA should serialize (one
+weight register file per array). Cause: a preload's overlapping_cycle equals its
+compute_cycle (91 == 91), so its occupancy (compute - overlapping) is ~0 and the
+Core's SA compute pipeline accepts unbounded back-to-back preloads.
+
+This is a PRE-EXISTING Core SA-model property, NOT introduced by the trace
+pipeline: the legacy build_tog path shows the same -- its 4 preloads issue at
+1215-1218 and finish 1306-1309 (4 concurrent). So it is not a trace-vs-legacy
+regression, but it is a real hardware-fidelity gap: the model should cap
+concurrent preloads at the systolic-array count and serialize same-SA preloads on
+the single weight buffer. Track separately from the trace work (affects both
+paths equally).
+
+### 10.6 Known issue: accumulator dependency over-serializes the reduction
+
+Observed in the --trace_so run: consecutive matmuls run 396 cycles apart (fully
+serial: issue 1120, 1516, 1912, ...), but physically matmuls that accumulate into
+the same output should PIPELINE on the systolic array (the partial sums stream
+through; consecutive matmuls overlap by overlapping_cycle, ~128 effective). They
+should NOT wait for the previous matmul to complete.
+
+Cause: the explicit-edge bridge builds a hard completion edge (add_child) for the
+Y_spad accumulator read-modify-write, so matmul_k1 waits matmul_k0's
+finish_instruction -> when it issues, k0 is already done -> the overlapping_cycle
+window is empty -> no pipeline. This is the mechanism behind the 4888 vs legacy
+2095 gap (legacy has NO inter-matmul edges, so its matmuls pipeline on 2 SAs:
+finishes 1704,1707 | 1832,1835 = +128 within an SA, +3 across SAs).
+
+So the accumulator (Y_spad) dependency is a PIPELINED/ordering dependency, not a
+completion barrier. add_child cannot express that. Fix direction: do not create a
+matmul->matmul completion edge through the accumulator -- the accumulation order
+is preserved implicitly by same-SA issue order + the SA pipeline
+(overlapping_cycle), exactly as legacy does. Keep the real barriers:
+load->compute, and last-matmul->store (the store needs the final accumulator).
+The asymmetry (a matmul consuming Y pipelines; the store consuming Y waits) is
+the crux to model -- likely "do not barrier when the consumer is a same-unit
+pipelined compute".
+
+Related to the same root as 10.5 (the SA/compute-pipeline occupancy model): both
+are about modeling the systolic array's streaming/pipelined execution rather than
+treating each compute as an atomic completion.
+
+### 10.7 Occupancy/latency split for pipelined computes (design + prototype)
+
+Idea (keeps add_child uniform): give each compute two completion points instead of
+one. A systolic-array op occupies its unit for occupancy = compute_cycle -
+overlapping_cycle (the initiation interval, ~128 for the matmul) and its result is
+ready at latency = compute_cycle (~395). Then add_child releases:
+  - a same-unit pipelined successor (next matmul, accumulator RMW) at OCCUPANCY
+    -> it starts ~128 later -> pipeline;
+  - a result consumer (the store reads the drained accumulator) at LATENCY
+    -> it waits the full drain (tail).
+So a single add_child mechanism stays, but the release point depends on whether
+the edge is an occupancy-dependency (same-unit pipeline) or a latency-dependency
+(reads the result). This was also expected to fix 10.5: a preload then occupies
+its SA for its occupancy, so concurrent preloads would be capped at the SA count
+(in practice a preload's occupancy is 0, so it does not -- see 10.7.2).
+
+Prototype (bridge stopgap, committed): skip the matmul->matmul accumulator edge
+(treat it as pipelined, not a barrier); keep every other edge. Result on 256^3
+GEMM: matmuls now issue back-to-back (1120-1127) and finish pipelined on 2 SAs
+(1515,1516 | 1643,1644 | 1771,1772 | 1899,1900 = +128 within an SA, +1 across),
+exactly like legacy. Total 4888 -> 2501 (vs legacy 2095 / 2608-incl-store; our
+matmuls finish at 1900 vs legacy 2091 -- our load chain is shorter). This
+confirms the accumulator dependency is pipelined. The clean replacement is the
+occupancy/latency split above in the Core so add_child stays uniform and the
+bridge needs no matmul-specific skip.
+
+#### 10.7.1 preload->matmul is also an occupancy dependency (preload fully overlaps)
+
+The preload->matmul edge is the SAME kind as matmul->matmul: a same-SA pipeline
+(occupancy) dependency, not a latency barrier. A preload's overlapping_cycle
+equals its compute_cycle (91 == 91), so its occupancy = compute - overlapping = 0
+-- it fully overlaps. With the occupancy/latency split, the matmul (successor)
+released at the preload's OCCUPANCY (= preload issue + 0) starts immediately, so
+the preload's 91-cycle latency is entirely hidden under the matmul.
+
+In the current prototype the preload->matmul edge is still an add_child barrier
+(only matmul->matmul was skipped), so the matmul issues at 1120 -- right after the
+preload finishes at ~1119 -- paying the full 91. The bridge cannot cleanly skip
+preload->matmul (skipping it outright loses the ordering: the matmul could be
+ready before the preload and reach the SA without weights). So preload-overlap is
+another reason the proper fix is the Core occupancy/latency split (10.7), which
+releases the matmul at the preload's occupancy (0) while keeping the issue order.
+
+Net: the Core occupancy/latency split addresses three notes at once -- 10.5
+(concurrent preloads capped at SA count via preload occupancy), 10.6 (matmuls
+pipeline), 10.7.1 (preload fully overlaps) -- all instances of "model the SA as a
+pipeline (occupancy + latency) instead of atomic completion". (As implemented,
+10.5 additionally needs a non-zero preload occupancy; see 10.7.2.)
+
+#### 10.7.2 Occupancy/latency split: implemented + POC result
+
+Implemented uniformly: Instruction gains add_pipeline_child /
+release_pipeline_children; the Core releases an op's pipeline children when it
+ISSUES (enters the SA pipeline), and its normal children at finish. The bridge
+classifies edges: a preload/matmul -> matmul edge is occupancy
+(add_pipeline_child), everything else is latency (add_child). No
+matmul-specific skip heuristic.
+
+256^3 GEMM result: preloads issue 1028-1031, matmuls issue 1032-1039 (right after
+the preloads ISSUE, not after they finish at ~1119 -> preload fully overlaps), and
+matmuls finish pipelined on 2 SAs (1427,1428 | 1555,1556 | 1683,1684 | 1811,1812
+= +128 within an SA, +1 across). Total 4888 -> 2501 (matmul-skip) -> 2413
+(occupancy/latency). Legacy is 2095 (matmul completion; our matmuls finish at 1812
+vs legacy 2091 -- shorter load chain -- and our 2413 includes the store).
+
+Note on 10.5 (preload concurrency): NOT fixed by this alone. A preload's
+overlapping_cycle == compute_cycle, so its occupancy is 0 -> it does not hold the
+SA -> 4 preloads still issue concurrently (1028-1031). Capping concurrent preloads
+at the SA count needs the preload to have a non-zero occupancy reflecting the
+weight-load time (a cycle-model input), separate from this edge-release change.
+
+#### 10.7.3 Explicit compute fence: implemented (COMPUTE_BAR), BAR -> MEMORY_BAR
+
+The compute fence is now a first-class trace entity, not a bridge-internal edge:
+  - togsim_ops: `togsim.compute_barrier`; ABI v10 adds `togsim_compute_barrier(ctx)`.
+  - build_skeleton emits a `togsim.compute_barrier` before each store DMA; lower_to_emitc
+    lowers it; the runtime records a COMPUTE_BAR TraceRec.
+  - The two barrier kinds are now named distinctly: Opcode::BAR -> Opcode::MEMORY_BAR
+    (the DMA/tag memory barrier, unchanged) and a new Opcode::COMPUTE_BAR.
+  - Core: COMPUTE_BAR finishes only once ALL compute pipelines drain (every systolic
+    array + the VPU empty); until then it stays in the ready queue (re-checked each
+    cycle). Its ready_counter is gated (pipeline-child of the outstanding async
+    computes) so it is only evaluated after they have ISSUED into the pipeline.
+  - bridge: a COMPUTE_BAR record -> a COMPUTE_BAR Instruction (pipeline-child of the
+    outstanding async matmuls); the following store add_child's the fence.
+
+256^3 GEMM: trace shows `... matmul x N -> COMPUTE_BAR -> STORE`; the COMPUTE_BAR
+instruction finishes at 1813 (after the SAs drain, last matmul ~1812), the store
+issues at 1814. Total 2414 (matches the implicit-flush 2413 + the 1-cycle fence).
+Multiple SAs handled (drains all _sa_compute_pipeline[*]). 7 python tests pass.
+
+#### 10.7.4 load->compute uses MEMORY_BAR (async DMA data wait); fixes a real bug
+
+Bug found: a consumer reading an async-loaded buffer ran BEFORE the data arrived
+(preload issued @1028 but its weight load W finished @1131). Cause: a raw
+add_child on an async DMA fires at the load's ISSUE-complete (program flow), not
+its DATA-ready (resp-complete) -- the async DMA signals data only via the tag
+table (set_tag_finish at resp-complete). So the buffer-edge model alone cannot
+gate compute on async-loaded data.
+
+Fix (symmetric with COMPUTE_BAR): route async load -> compute through a MEMORY_BAR
+that carries the load's tag. The load registers the tag at issue; the MEMORY_BAR
+(made ready after the load issues, via add_child) parks on the tag and is woken at
+resp-complete; consumers depend on the MEMORY_BAR (last_writer[buf] = bar). So the
+memory-arrival notification (set_tag_finish) connects to compute via the existing
+tag mechanism -- now explicit in the trace as a MEMORY_BAR instruction.
+
+256^3 GEMM: preload now issues @1132 (after W resp-done @1131), correct. Total
+2414 (buggy/optimistic) -> 2518 (correct: compute waits for the slow weight load).
+Both barriers are explicit and symmetric: MEMORY_BAR (DMA tag, resp-complete) for
+load->compute, COMPUTE_BAR (SA pipeline drain) for compute->store.
+
+## 11. Remaining work + next-session handoff
+
+### 11.1 Status
+
+PR #267 (feature/togsim-cpp-trace -> develop). The trace pipeline runs end-to-end
+through the REAL Simulator/Core on a 256^3 GEMM via `--trace_so`, with an explicit
+dataflow dependency model (SRAM last-writer + vcix FSM) and two explicit barriers:
+MEMORY_BAR for async load->compute data (paired to its DMA by the runtime
+`(tag_id, tag_slot)` tag slot) and COMPUTE_BAR for the SA drain before a store.
+The async-DMA sync is the runtime tag slot, NOT a compile-time event-id (ABI
+bumped to v11; the event-id / event-handle / wait/signal design was removed).
+Legacy ONNX-TOG path kept + DEPRECATED. All togsim python tests pass; TOGSim
+builds.
+
+**Validation (256^3 GEMM, real gem5 cycle table):** through the real Core the
+trace path totals **2518 cycles** vs the legacy path's **2698** on the same
+table. The earlier 10.x notes (with a stub table) report different absolute
+numbers; 2518-vs-2698 is the current real-table figure.
+
+### 11.2 Remaining work (priority order)
+
+1. **Cycle-equivalence closure.** Characterize/close the trace-vs-legacy gap on the
+   256^3 GEMM with the SAME gem5 cycle_list. Sub-items 2-3 are the main drivers.
+2. **Preload concurrency cap (sec 10.5).** 4 preloads run concurrently though there
+   are 2 SAs, because a preload's occupancy is 0 (overlapping_cycle == compute).
+   Give the preload a non-zero occupancy (the weight-load time) so concurrent
+   preloads are capped at the SA count. Pre-existing in BOTH paths.
+3. **Robust gem5 cycle_list wiring.** The extension_codecache `TORCHSIM_DUMP_TRACE_SO=1`
+   hook dumps trace.so + trace_cycles.tsv from the real cycle_list, but is flaky
+   under concurrent compiles (saw cycle_list==[] once). Make it robust (or force a
+   single-thread compile), so `--trace_so --cycle_table` uses real per-tile cycles.
+4. **Parallel output tiles / multi-core.** One dispatch per work-item today; for
+   distributing independent output tiles across cores, emit a dispatch per parallel
+   (m_sub, n_sub) tile. The inner sub-tile loops are currently unlabeled (only the
+   macro loops carry subtile/accumulation), so the axis role must be recovered.
+5. **Cleanup.** The obsolete WAIT/SIGNAL trace records and the event-handle
+   buffer are dropped (v11). COMPUTE_BAR logs finish twice (cosmetic). The
+   preload node mis-attributes an X_spad read (build_tog `_steal_leading_transfer_read`)
+   -> a harmless extra edge.
+6. **P5 op coverage.** Only GEMM is exercised. Extend to conv / SDPA / vector / pool.
+7. **P4.** Symbolic/dynamic shape; streaming sink (coroutine, alloc-blocks).
+8. **Two-function outline** (togsim_kernel_tile) -- DONE (ABI v12). The work-item
+   body is outlined into a uniform `togsim_kernel_tile(ctx, iv, n)` and run via the
+   higher-order `togsim_dispatch` wrapper (round-robin core + TILE_BEGIN/TILE_END);
+   the work-item scope is now the function call. Trace/cycles identical to the old
+   single-function `togsim_core_alloc` form. One general dispatcher serves every
+   kernel.
+9. **Retire the legacy ONNX-TOG path** once the trace path is stable.
+
+### 11.3 Next-session context
+
+- Worktree `/workspace/PyTorchSim-cpptrace`, branch `feature/togsim-cpp-trace`,
+  PR #267 -> develop. The branch is rebased ONTO develop (the retire-floormod base
+  was dropped -- develop already has it). `source .envrc` in the worktree.
+- Build TOGSim: submodules are init'd; `cd TOGSim/build && cmake .. -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`.
+  The Simulator target has ENABLE_EXPORTS (so a dlopen'd .so resolves the togsim_*
+  callbacks); togsim_runtime.cc + togsim_trace_bridge.cc are picked up by the src glob.
+- Run the trace path:
+  `python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc <postvcix.mlir> --so trace.so [--emit-cpp x.cpp]`
+  then `bin/Simulator --config <yml> --trace_so trace.so [--cycle_table cyc.tsv] [--log_level trace]`.
+- Get a post-vcix fixture: a real torch.compile GEMM with `TORCHSIM_DUMP_MLIR_IR=1
+  pytorchsim_functional_mode=False` writes `outputs/<hash>/..._sample_postvcix.mlir`.
+  Real cycle data + legacy reference: add `TORCHSIM_DUMP_TRACE_SO=1` to also dump
+  trace.so + trace_cycles.tsv in `outputs/<hash>/` (see 11.2 #3). (Prior /tmp
+  fixtures are ephemeral -- regenerate.)
+- Env (.envrc): gem5 `/gem5/release/gem5.opt`, spike `/release/bin/spike`,
+  LLVM `/riscv-llvm/bin`.
+- Tests: `TOGSIM_SKELETON_FIXTURE=<postvcix.mlir> pytest tests/test_togsim_{skeleton,emitc,runtime}.py`.
+  These are NOT in the CI allowlist (`.github/workflows/pytorchsim_test.yml`) -- register them to gate CI.
+- Key files: passes `build_skeleton.py`, `lower_to_emitc.py`, `dep_analysis.py`,
+  `cycle_table.py`, `togsim_ops.py`; `TOGSim/include/{togsim_runtime.h, togsim_loader.h, togsim_trace_bridge.h}`,
+  `TOGSim/src/{togsim_runtime.cc, togsim_trace_bridge.cc}`; `Core.cc`/`Instruction.{h,cc}`
+  (COMPUTE_BAR + MEMORY_BAR rename); `main.cc` (--trace_so); `extension_codecache.py`
+  (TORCHSIM_DUMP_TRACE_SO hook).
+- Local-only backups of the pre-squash/pre-rebase 28-commit history: tag
+  `pr-backup-ccfea43e`, branch `backup-presquash-3cfd4a3f` (NOT pushed).
diff --git a/docs/design/togsim_cpp_trace_HANDOFF.md b/docs/design/togsim_cpp_trace_HANDOFF.md
new file mode 100644
index 00000000..23f642bb
--- /dev/null
+++ b/docs/design/togsim_cpp_trace_HANDOFF.md
@@ -0,0 +1,191 @@
+# Handoff — TOGSim C++ Trace Generation
+
+Continuation notes for picking this work up in a fresh session. Read alongside
+the full design: [`togsim_cpp_trace.md`](./togsim_cpp_trace.md) and the snapshot
+[`togsim_cpp_trace_STATUS.md`](./togsim_cpp_trace_STATUS.md).
+
+## Goal (one line)
+
+Replace the timing-path TOG producer (MLIR -> Python-dict -> ONNX -> C++ parser)
+with a compiled, shape-parametric trace producer (MLIR -> EmitC -> C++ -> `.so`);
+TOGSim's timing core is preserved.
+
+## Current state (one paragraph)
+
+The trace pipeline is implemented end-to-end and runs through the REAL
+Simulator/Core on a 256^3 GEMM (`--trace_so`). Dependencies are an explicit
+dataflow DAG (SRAM last-writer per buffer + the vcix preload/matmul FSM). An
+asynchronous DMA is synced to the consumer of its data by the **runtime tag
+slot** `(tag_id, tag_slot)` through an explicit `togsim.memory_barrier` (lowered
+from the source `memref.dma_wait`); a sync DMA is blocking. ABI is **v11**. An
+earlier design used a compile-time `event_id` / heap event handle with
+`wait`/`signal`; it was removed because one static DMA op runs once per loop
+iteration into a different tag slot, which a compile-time id cannot pair per
+iteration. **Validation:** on the 256^3 GEMM with the real gem5 cycle table, the
+trace path totals **2518 cycles** vs the legacy path's **2698** through the real
+Core; all togsim python tests pass; TOGSim builds.
+
+## Branch
+
+- Work branch: `feature/togsim-cpp-trace` (PR #267 -> develop)
+
+## Status
+
+| Milestone | State |
+|---|---|
+| P0 — ABI header + op vocabulary | DONE (ABI evolved to v11) |
+| P1 — `build_skeleton` pass | DONE, verified — runs on a real GEMM fixture, module verifies, compute grouping + dma/barrier counts match the legacy `build_tog` TOG. |
+| P2 — togsim -> emitc -> cpp -> .so | DONE — `lower_to_emitc.py` builds EmitC, `mlir-translate` -> C++, `g++ -shared` -> `.so`; validated by build/symbol checks and a dlopen run harness. |
+| P3 — TOGSim loader + runtime + cycle table; real-Core run | DONE — runs end-to-end through the real Simulator/Core (256^3 GEMM, `--trace_so`). Runtime tag-slot pairing (ABI v11, `togsim.memory_barrier`), explicit dataflow DAG (read/write_bufs last-writer + vcix FSM), real tile addresses, cycle_table. `togsim_runtime.cc`/`togsim_loader.h`/`togsim_trace_bridge.cc` feed TraceRec into the real Core. Cycle comparison vs legacy on the real gem5 table: trace 2518 vs legacy 2698. Legacy ONNX-TOG path DEPRECATED in place, kept live. |
+| P4 — symbolic-bound dynamic shape, streaming sink | not started |
+| P5 — op-family migration (conv/SDPA/vector) | not started |
+
+### Async-DMA sync: runtime tag slot (current), event-id (removed)
+
+The original P1 threaded the dma->wait dependency as an SSA `!togsim.event`
+value, which fails `module.verify()` on a software-pipelined kernel (the
+`togsim.dma` sits in the prefetch loop nest, its consumer in a sibling compute
+nest, so the value does not dominate its use). An intermediate fix used a
+compile-time `event_id` attribute (later a heap-allocated event handle). Both
+were **removed**: one static `togsim.dma` op executes once per loop iteration
+into a *different* runtime tag slot, so a compile-time id (one per static op)
+cannot pair iteration i's DMA with iteration i's wait.
+
+Current mechanism (ABI v11): `togsim.dma` carries `tag_id` (its tag-memref
+identity) plus the runtime tag-index operand `%tag[%idx]` and returns void. The
+source `memref.dma_wait` is mapped through to an explicit
+`togsim.memory_barrier {tag_id, write_bufs}` carrying the runtime tag index. At
+runtime an async DMA and its barrier are paired by `(tag_id, tag_slot)` through
+the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/
+`register_tag_waiter`): the DMA signals at data arrival, the barrier waits, and
+the barrier becomes the loaded buffer's last-writer so consumers gate on
+arrival. (The one remaining auto-inserted barrier is `togsim.compute_barrier`,
+the compute fence before a store — marked FIXME to become explicit later.)
+
+### P2 decisions
+
+* **ABI v11 (runtime tag slot).** `togsim_dma` returns void and carries
+  `(is_async, tag_id, tag_slot, read_bufs, write_bufs)`. The
+  `togsim_memory_barrier(tag_id, tag_slot, write_bufs)` is the explicit
+  async-DMA sync. No `event_id`, no event handle, no `wait`/`signal`.
+* **C4 drives the upstream EmitC conversion passes** (it does not hand-build
+  EmitC). It only does the parts upstream cannot: rewrite the *unregistered*
+  `togsim.*` ops to `emitc.call_opaque` and rewrite the kernel signature to the
+  ABI form. Then it runs, in-process (`mlir.passmanager`),
+  `func.func(lower-affine), convert-scf-to-emitc, convert-arith-to-emitc,
+  convert-func-to-emitc`. One local fixup: in this LLVM 20 build
+  `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so
+  `convert-arith-to-emitc` (constants -> `!emitc.size_t`) leaves
+  `unrealized_conversion_cast` on the bounds that nothing folds and
+  `mlir-to-cpp` can't print (design sec 8 risk). `_fold_for_bound_casts`
+  rewrites those bound constants to `index`-typed `emitc.constant`, clearing
+  the casts. (`emitc.for` *does* accept `size_t` bounds with an explicit
+  `: !emitc.size_t`, but keeping the bounds `index` avoids retyping the IV.)
+* **Addresses (wired in P3, approach A):** `togsim_dma` passes `(arg_id, element
+  offset)` with the offset computed from the loop IVs; the runtime adds the
+  tensor base. `togsim.compute` is keyed by `tile_id` for cost.
+
+## Files (key)
+
+- `TOGSim/include/togsim_runtime.h` — extern "C" ABI v11 (`togsim_dma`,
+  `togsim_memory_barrier`, `togsim_compute`, `togsim_compute_barrier`,
+  `togsim_core_alloc`, `togsim_kernel` entry, `TOGSIM_ABI_VERSION`, opaque
+  `EmitCtx`).
+- `PyTorchSimFrontend/mlir/passes/togsim_ops.py` — single source of truth for the
+  skeleton+API MLIR vocabulary (op names, attr keys, op->callee map).
+- `PyTorchSimFrontend/mlir/passes/build_skeleton.py` + `dep_analysis.py` — the P1
+  pass + dependency analysis (reuse build_tog's `TogBuilder`/`_build`; map
+  dma_start->togsim.dma, dma_wait->togsim.memory_barrier, attach read/write_bufs;
+  use-based DCE).
+- `TOGSim/src/togsim_runtime.cc`, `TOGSim/include/togsim_loader.h`,
+  `TOGSim/src/togsim_trace_bridge.cc` — C6 runtime, dlopen loader, and the bridge
+  that feeds the recorded TraceRec stream into the real Core.
+- `tests/test_togsim_skeleton.py` — `test_togsim_ops_contract` (runs anywhere) +
+  `test_build_skeleton_on_fixture` (gated on bindings + a fixture).
+- `PyTorchSimFrontend/mlir/passes/lower_to_emitc.py` — the P2/C4 pass: skeleton
+  module -> EmitC `togsim_kernel` -> C++ (`mlir-translate`) -> `.so` (`g++`).
+  Entry points: `lower_to_emitc(module)`, `build_trace_so(postvcix_path, so)`,
+  and a `__main__` CLI (`--so`, `--emit-cpp`, `--include-dir`).
+- `tests/test_togsim_emitc.py` — `test_build_trace_so` (EmitC + symbol checks) +
+  `test_trace_so_runs` (dlopen the `.so` against a stub runtime, run it). Gated
+  on bindings + `mlir-translate` + a C++ compiler + the fixture.
+
+## Reproduce P1 + P2 (one GEMM kernel)
+
+```bash
+# 1. post-vcix fixture: compile a GEMM (needs the built PyTorchSimDevice .so).
+export pytorchsim_functional_mode=False
+python tests/ops/gemm/test_matmul.py
+FIX=$(find "${TORCHSIM_DUMP_PATH:-.}" -name '*_postvcix.mlir' | head -1)
+# build_skeleton/lower_to_emitc only need the .mlir + bindings, not torch, so a
+# fixture compiled in any worktree is fine.
+
+# 2. P1: skeleton+API MLIR.
+python -m PyTorchSimFrontend.mlir.passes.build_skeleton "$FIX" --out /tmp/skel.mlir
+#   stderr: "skeleton: compute=.. dma=.. memory_barrier=.."
+
+# 3. P2: skeleton -> EmitC -> C++ -> .so (reads skel from $FIX via build_skeleton).
+python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc "$FIX" \
+    --so /tmp/trace.so --emit-cpp /tmp/trace.cpp
+nm -D /tmp/trace.so | grep togsim     # togsim_kernel = T; togsim_dma/memory_barrier/compute = U
+
+# 4. tests
+TOGSIM_SKELETON_FIXTURE="$FIX" python -m pytest \
+    tests/test_togsim_skeleton.py tests/test_togsim_emitc.py -q
+```
+
+Note: `mlir-opt`/`mlir-translate` live in `$TORCHSIM_LLVM_PATH` but are not on
+`$PATH`; `lower_to_emitc` resolves `mlir-translate` from `TORCHSIM_LLVM_PATH`.
+
+## Next steps (P3 is done; remaining work)
+
+The producer is wired into TOGSim and runs through the real Core (trace 2518 vs
+legacy 2698 on the 256^3 GEMM). The parallelism / reduction / core-dispatch
+design is in `togsim_cpp_trace.md` §9. Summary: the producer is core-transparent
+(knows nothing about `num_cores`); it enumerates parallel output-tile work-items
+and calls `togsim_core_alloc` at each work-item boundary. Parallel = independent
+work-items; reduction = program order inside one work-item; core binding = the
+`togsim_core_alloc` runtime callback (policy lives in TOGSim). Async-DMA data
+sync = the runtime `(tag_id, tag_slot)` via `togsim.memory_barrier`. `num_cores`
+is extrinsic so it is never baked; vlane/tile sizes are intrinsic and stay baked.
+Split-K is a deferred exception.
+
+Remaining (priority order; full list in STATUS §7 and design §11.2):
+
+- **SRAM tile lifecycle (double-buffer throttle).** `togsim.dma` carries
+  `tag_slot` (the SRAM slot key); the consumer must use it to throttle in-flight
+  loads to the buffer depth on multi-tile / double-buffered kernels.
+- **Preload concurrency cap (design §10.5).** Give a preload a non-zero occupancy
+  (its weight-load time) so concurrent preloads are capped at the SA count.
+  Pre-existing in BOTH paths.
+- **Per-output-tile dispatch / multi-core.** One `togsim_core_alloc` per
+  work-item today; distribute independent output tiles across cores.
+- **Robust gem5 cycle_list wiring.** The extension_codecache
+  `TORCHSIM_DUMP_TRACE_SO=1` hook is flaky under concurrent compiles.
+- **P5 op coverage** (conv/SDPA/vector) and **P4** (symbolic shape, streaming
+  sink), then **retire the legacy ONNX-TOG path**.
+
+Full design: `togsim_cpp_trace.md` §5-11.
+
+## Environment requirements (for the new session)
+
+- MLIR Python bindings importable (`import mlir.ir`). They ship with the LLVM
+  build at `${TORCHSIM_LLVM_PATH%/bin}/python_packages/mlir_core`; the CI docker
+  image `ghcr.io/psal-postech/torchsim-ci` has them. `passes/__init__` also
+  derives the path from `TORCHSIM_LLVM_PATH`.
+- `pytest` to run the test files directly (`pip install pytest` if absent).
+- `mlir-translate` (in `$TORCHSIM_LLVM_PATH`) and a host C++ compiler (`g++`/
+  `$CXX`) for the P2 `.so` path.
+- TOGSim build (for `--trace_so`): `cd TOGSim/build && cmake ..
+  -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`. The Simulator target has
+  ENABLE_EXPORTS so a dlopen'd `.so` resolves the `togsim_*` callbacks.
+- When iterating on passes, clear the codegen caches (`$TORCHSIM_DUMP_PATH`,
+  default `outputs/`) between runs — see CLAUDE.md "Codegen changes are sticky".
+
+## Verification that already passes anywhere (sanity)
+
+```bash
+python -m py_compile PyTorchSimFrontend/mlir/passes/build_skeleton.py \
+    PyTorchSimFrontend/mlir/passes/togsim_ops.py tests/test_togsim_skeleton.py
+# contract test (no bindings needed): see test_togsim_ops_contract
+```
diff --git a/docs/design/togsim_cpp_trace_STATUS.md b/docs/design/togsim_cpp_trace_STATUS.md
new file mode 100644
index 00000000..ebf05701
--- /dev/null
+++ b/docs/design/togsim_cpp_trace_STATUS.md
@@ -0,0 +1,226 @@
+# TOGSim C++ Trace Generation — Status Report
+
+Branch: `feature/togsim-cpp-trace`. Design of record: `togsim_cpp_trace.md` (esp.
+§9); continuation notes: `togsim_cpp_trace_HANDOFF.md`. This file is a snapshot of
+progress.
+
+## 1. Goal
+
+Replace the timing-path TOG producer (`MLIR -> Python dict -> ONNX -> C++
+TileGraphParser`) with a compiled, shape-parametric trace producer
+(`MLIR -> skeleton -> EmitC -> C++ -> .so`). TOGSim's timing core is preserved;
+only the producer of its input changes. The key idea: do not flatten the TOG;
+instead **run** a compiled C++ producer that emits the trace as a stream of API
+calls.
+
+Each API call emits one trace record = one modeled instruction, fed to the
+existing timing Core. Dependencies are an explicit dataflow DAG (SRAM
+last-writer per buffer + the vcix preload/matmul FSM). An asynchronous DMA is
+synced to the consumer of its data by the **runtime tag slot** `(tag_id,
+tag_slot)` through an explicit `togsim.memory_barrier` (ABI v11). An earlier
+design used a compile-time `event_id` / event handle with `wait`/`signal`; that
+was removed because one static DMA op runs once per loop iteration into a
+different tag slot, which a single compile-time id cannot pair per iteration.
+
+## 2. Pipeline
+
+```
+post-vcix .mlir (torch.compile output)
+  | build_skeleton.py + dep_analysis.py (P1)  keep loops;
+  |   memref.dma_start -> togsim.dma(tag_id, %tag[%idx], is_async, read/write_bufs);
+  |   memref.dma_wait  -> togsim.memory_barrier(tag_id, tag_slot, write_bufs);
+  |   compute block    -> togsim.compute; DCE the rest
+  v
+skeleton+API MLIR
+  | lower_to_emitc.py (P2/C4)  togsim.* -> emitc.call_opaque; ABI signature; drive upstream
+  |                            lower-affine/convert-*-to-emitc; _retype_for_to_size_t fixups
+  v
+EmitC --mlir-translate--> C++ --g++ -shared--> trace.so
+                                                 | TOGSim loader (C6): dlopen + EmitCtx callbacks
+                                                 v
+                                       TraceRec stream (materializing sink)
+                                                 | togsim_trace_bridge.cc -> existing Core timing
+                                                 v
+                                       cycles / DRAM traffic (real Core)
+```
+
+Side artifact: cycle table `tile_id -> (cycle, overlapping_cycle)` (cycle_table.py).
+
+## 3. Milestones
+
+| | State |
+|---|---|
+| P0 ABI header + togsim vocabulary | DONE (ABI evolved to v11) |
+| P1 build_skeleton | DONE, verified (compute/dma/barrier match legacy TOG) |
+| P2 lower_to_emitc -> .so | DONE (real GEMM .so built and run) |
+| P3 loader/runtime + cycle table + real-Core run | DONE (runs end-to-end through the real Simulator/Core; below) |
+| P4 symbolic/dynamic shape, streaming sink | TODO |
+| P5 op-family migration (conv/SDPA/vector) | TODO |
+
+P3 detail:
+
+| | State |
+|---|---|
+| ABI (core_alloc, runtime tag pairing, dma address) | DONE (v11) |
+| work-item boundary (togsim_core_alloc) | DONE |
+| real tile DRAM addresses (approach A) | DONE, verified on 1024^3 |
+| cycle_table builder (cycle + overlapping) | DONE |
+| async DMA <-> consumer sync (runtime tag slot, memory_barrier) | DONE |
+| explicit dataflow DAG (read/write_bufs last-writer) | DONE |
+| C6 runtime + dlopen loader (materializing) | DONE |
+| TraceRec -> existing Core timing feed | DONE (runs end-to-end through real Core) |
+| cycle comparison vs build_tog (real gem5 table) | DONE: trace 2518 vs legacy 2698 |
+| SRAM tile lifecycle / preload-occupancy refinements | partial (see §7) |
+
+### TraceRec -> Core: now running end-to-end
+
+`TOGSim/src/togsim_trace_bridge.cc` (`trace_to_tilegraph`) + a `--trace_so` mode
+in `main.cc` feed the recorded trace into the REAL Simulator/Core. The producer
+`.so` is `dlopen`'d (the Simulator is built with ENABLE_EXPORTS so the `.so`
+resolves the `togsim_*` callbacks back into the binary), its trace recorded, then
+bridged to a `TileGraph`: one `TileSubGraph` per work-item (core_alloc marker)
+bound to its core, one `Tile` of MOVIN/MOVOUT/COMP/MEMORY_BAR/COMPUTE_BAR
+`Instruction`s. Dependency edges are built by **last-writer per SRAM buffer**
+(`read_bufs`/`write_bufs`); an async load's last-writer is the MEMORY_BAR paired
+to it by the runtime `(tag_id, tag_slot)` (so a consumer waits for actual data
+arrival), and a COMPUTE_BAR drains the systolic-array pipeline before a store.
+Build it (`cd TOGSim/build && cmake .. && make`) and run:
+`bin/Simulator --config <yml> --trace_so gemm_trace.so`.
+
+### Cycle comparison vs legacy build_tog (256^3 GEMM, real gem5 table)
+
+Ran the same kernel through the legacy path (torch.compile -> gem5 -> build_tog
+-> Simulator) and the trace path (the same post-vcix IR -> trace .so + the SAME
+gem5 cycle_list -> --trace_so), both through the REAL Core. extension_codecache
+has an opt-in TORCHSIM_DUMP_TRACE_SO=1 hook that dumps trace.so + trace_cycles.tsv
+from the same cycle_list/offsets (best-effort, never breaks the legacy path);
+compute-unit routing uses compute_type and the tag key uses a per-tensor addr_id
+(set_addr_name(arg_id)+prepare_tag_key) so A and B don't collide on tag_slot 0.
+
+**Result: the trace path totals 2518 cycles vs the legacy path's 2698 on the
+same gem5 cycle table.** All togsim python tests pass; TOGSim builds. Compute
+work and DRAM traffic match; the remaining difference is scheduling (the
+explicit dataflow DAG plus the occupancy/latency SA-pipeline model overlap
+differently than legacy's per-iteration BARs).
+
+**Subtile + multi-tile-K now runs** (256x512x256 forced to 128x128 subtiles, 2
+K-tiles: 5774 cycles, no crash). This needed `build_skeleton` to strip the
+`-acc_iv` accumulation marker from the dma_wait tag index so the memory_barrier
+slot stays subtile-only and pairs with its load (see §3, `tag_slot`); before the
+strip the producer evaluated `-acc_iv` to a negative slot at the 2nd K-tile and
+TOGSim aborted with "Key does not exist in ... tag table".
+
+## 4. Components
+
+- `build_skeleton.py` + `dep_analysis.py` — in-place reduction of post-vcix to
+  "loop skeleton + togsim.* API"; `memref.dma_wait` mapped through to an explicit
+  `togsim.memory_barrier`; read/write SRAM buffer ids attached; reuses legacy
+  `TogBuilder` traversal.
+- `lower_to_emitc.py` — skeleton -> EmitC by driving the upstream conversion
+  passes plus `_retype_for_to_size_t` (clears residual index<->size_t casts).
+  `togsim_dma` carries `(tag_id, runtime tag-index, is_async, read/write_bufs)`
+  and returns void; `togsim_memory_barrier` carries `(tag_id, tag_slot,
+  write_bufs)`; `togsim_core_alloc` inserted at the work-item boundary.
+- `cycle_table.py` — `tile_id -> (cycle, overlapping)`, overlapping
+  `= max(cycle - offset[type], 0)` (legacy formula); JSON sidecar.
+- `TOGSim/src/togsim_runtime.cc` + `TOGSim/include/togsim_loader.h` — C6 runtime
+  and `run_producer` (dlopen -> togsim_kernel -> records TraceRec). dma resolves
+  `base[arg] + offset*elem_bytes` and signals its tag at data arrival; the
+  matching memory_barrier waits on the `(tag_id, tag_slot)`; compute looks up the
+  cycle table; core_alloc round-robins a runtime core pool.
+- `TOGSim/src/togsim_trace_bridge.cc` — bridges the recorded TraceRec stream into
+  the existing `TileGraph`/`Instruction` form for the real Core.
+- `TOGSim/include/togsim_runtime.h` — producer ABI v11.
+
+## 5. Locked design decisions
+
+1. **Trace is a DAG, not a time order.** The consumer (existing Core) schedules
+   per-core timelines from: op kind -> hardware unit, SRAM-buffer last-writer ->
+   data dependency, same-core -> serial (reduction accumulate), SRAM slot ->
+   capacity. Emission order != execution order.
+2. **Async-DMA sync = runtime tag slot.** A `togsim.dma` carries `(tag_id,
+   tag_slot)`; the matching `togsim.memory_barrier` (lowered from the source
+   `memref.dma_wait`) waits on the same pair through the existing Core tag table
+   (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). The DMA signals at
+   data arrival; the barrier becomes the loaded buffer's last-writer so consumers
+   gate on arrival. A sync DMA is blocking (no barrier). This replaced an earlier
+   `event_id` / heap event-handle design, which could not pair a DMA op with its
+   wait per loop iteration (one static op, a different tag slot each iteration).
+   No `calc_tag` content-hash, no magic values, no FIFO.
+3. **Core = runtime allocation.** `togsim_core_alloc` returns a core id (no free).
+   `num_cores` is never baked into the producer -- it is the runtime pool size.
+   A work-item's reduction stays on one core (sticky); different work-items get
+   different cores -> multi-core.
+4. **Intrinsic baked / extrinsic parametric.** vlane, tile sizes, and the systolic-array
+   configuration define the instructions (baked); num_cores only distributes them (runtime).
+5. **Execution model:** P3 materializing (run producer to completion -> record ->
+   feed existing Core); P4 streaming (coroutine, alloc-blocks on resources).
+6. **Double-buffer = resource constraint.** Producer emits everything (no skew);
+   capacity is the consumer's throttle. Requires SRAM tile lifecycle
+   (alloc/free) in the trace -- the currently missing piece.
+
+## 6. Verification (reproducible)
+
+- togsim python tests pass: skeleton (contract + fixture), emitc (build + dlopen
+  run), cycle_table, runtime. TOGSim builds.
+- 256^3 GEMM: core_alloc -> dma(tag_id, tag_slot) -> memory_barrier(tag_id,
+  tag_slot) -> compute; addresses A/B/C resolved (offset 0, single tile).
+- 1024^3 GEMM: per-tile addresses correct (A[m,k]=m*1024+k -> 0,256,512;
+  B[k,n]=k*1024+n -> 0,262144,524288).
+- End-to-end through the real Core (256^3 GEMM, real gem5 table): trace 2518
+  cycles vs legacy 2698.
+- Legacy ONNX-TOG path untouched (comment-only diff), marked DEPRECATED, kept as
+  the comparison reference.
+
+## 6b. Reference timer (early sanity check; superseded by the real Core feed)
+
+`togsim::simulate(RunResult, TimingParams)` (togsim_runtime.cc) was an early
+standalone scheduler that timed the recorded TraceRec to prove the stream is
+sufficient to be timed: per core a DMA-engine timeline (DMAs serialize, overlap
+compute), a compute timeline (serial = reduction accumulate, with the `finish =
+prev.finish + cycle - overlapped` pipeline overlap of Core.cc), and data deps.
+It is NOT the production Core (no DRAM/NoC/L2 contention). It has since been
+superseded: the recorded stream is now bridged into the real Tile/TileGraph ->
+Core (see §3, and the 2518-vs-2698 result above). Retained here as context.
+
+## 7. Remaining work (priority order)
+
+1. DONE. Map TraceRec -> existing TOGSim Core Instructions (Tile/TileGraph,
+   compute_cycle+overlapping, dataflow-buffer deps + runtime-tag barriers) and
+   run through the real Core. Result: trace 2518 vs legacy 2698 on the same gem5
+   table.
+2. SRAM tile lifecycle in the trace (double-buffer throttle). togsim_dma carries
+   `tag_slot` (the lowered SRAM tag index = the slot key the existing Core's
+   Instruction.tag_idx needs); 0 for single-buffer kernels. Remaining: the
+   consumer must use it to throttle in-flight loads to the buffer depth. The
+   SRAM-buffer key is effectively (arg_id, tag_slot) since each load's DRAM
+   tensor maps to its spad.
+3. Preload concurrency cap / preload occupancy (design doc §10.5): give a preload
+   a non-zero occupancy so concurrent preloads are capped at the SA count.
+   Pre-existing in BOTH paths.
+4. (later) deeper double-buffer pipelines (more tag slots), two-function outline,
+   P4 streaming, symbolic shape, P5 op coverage (conv/SDPA/vector).
+
+## 8. Risks / open
+
+- SRAM lifecycle (double-buffer throttle) not yet implemented -- central to
+  double-buffer/capacity accuracy on multi-tile kernels.
+- LLVM 20 emitc constraints absorbed: emitc.for index bounds; old
+  subscript-returns-element model; arith.divui/remui not lowerable -> core id is
+  a runtime allocation (which became a design improvement).
+
+### Explicit dataflow-edge dependency model: implemented
+
+The dependency model is an explicit dataflow DAG, not in-order or runtime-tag
+content-hashing. `togsim_dma`/`togsim_compute` carry read_bufs/write_bufs (SRAM
+buffer ids; a virtual SA_WEIGHTS buffer folds the preload->matmul edge).
+dep_analysis + build_skeleton attach them; lower_to_emitc emits them; the runtime
+records them; the bridge builds the Instruction DAG by last-writer per buffer,
+scoped per work-item. The one runtime-paired edge is the async-DMA data wait,
+routed through an explicit `togsim.memory_barrier` keyed on `(tag_id, tag_slot)`
+(see design doc §10.7.4). The systolic-array pipeline uses the occupancy/latency
+split (§10.7), so accumulating matmuls pipeline rather than serialize.
+
+Net (256^3 GEMM, real gem5 table, real Core): trace 2518 vs legacy 2698.
+Per-output-tile dispatch for multi-core distribution is the next refinement
+(today one dispatch per work-item).
diff --git a/scripts/trace_timeline.py b/scripts/trace_timeline.py
new file mode 100644
index 00000000..5cf9608b
--- /dev/null
+++ b/scripts/trace_timeline.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""Convert a TOGSim `--log_level trace` log into a Chrome Trace Event JSON that
+opens in Perfetto (https://ui.perfetto.dev) or chrome://tracing as an interactive
+timeline (Gantt).
+
+Each instruction becomes one duration slice, grouped per core (pid). Lanes:
+  dram-rd -- loads crossing the DRAM bus (read bandwidth)
+  dram-wr -- stores crossing the DRAM bus (write bandwidth)
+  sa / sa0.. -- COMP compute_type 1 (matmul) / 2 (preload)
+  vector  -- COMP compute_type 0 (vector)
+Time unit = core cycles. MEMORY_BAR/COMPUTE_BAR/TILE_BEGIN/TILE_END are not drawn. A DMA bar
+runs from the op's first DRAM response (DRAM_RESP_FIRST, logged by the Core -- so it
+captures data moving even while still injecting) to its completion (load: data-ready;
+store: finished), serialized per direction so each is one visible bar (packed row =
+saturated bus). A compute slice's width is its occupancy (compute_cycle - overlapping).
+
+Usage:
+  bin/Simulator --config <yml> --trace_so <so> --cycle_table <tsv> --log_level trace \
+      2>&1 | python scripts/trace_timeline.py -o timeline.json
+  # or
+  python scripts/trace_timeline.py trace.log -o timeline.json
+Then drag timeline.json into https://ui.perfetto.dev .
+"""
+import argparse
+import json
+import re
+import sys
+
+# One trace line: [cycle][Core C][TAG ][INST_ID=N] OPCODE (detail...)
+# Groups: 1=cycle, 2=core, 3=tag, 4=inst id (optional), 5=opcode (optional),
+# 6=the rest of the line (free-form detail text).
+_LINE = re.compile(
+    r"\[(\d+)\]\[Core (\d+)\]\[([A-Z_]+)\s*\](?:\[INST_ID=(-?\d+)\])?\s*(\w+)?(.*)")
+
+# Ops are bucketed per core into dma / vector / sa; to_chrome then draws them as
+# the dram-rd, dram-wr, sa (or sa0..) and vector lanes. Opcodes listed in _HIDE
+# (the barriers and TILE_BEGIN/TILE_END) are never drawn.
+_LANE = {"MOVIN": "dma", "MOVOUT": "dma"}
+_HIDE = {"MEMORY_BAR", "COMPUTE_BAR", "TILE_BEGIN", "TILE_END"}
+# COMP compute_type value -> name shown in slice labels.
+_CT_NAME = {0: "vector", 1: "matmul", 2: "preload"}
+
+# Perfetto/catapult reserved color names; slices are tinted by tile (= the
+# togsim_dispatch work-item / output tile) so one tile's ops share a color across
+# lanes/cores. 16 names so a core's tiles (which stride by num_cores) stay
+# distinct -- an 8-name palette collapsed to 4 colors per core under 2-core
+# even/odd assignment.
+_TILE_PALETTE = ["good", "bad", "terrible", "yellow", "olive", "rail_response",
+                 "rail_load", "rail_animation", "rail_idle", "thread_state_running",
+                 "thread_state_runnable", "thread_state_iowait",
+                 "thread_state_uninterruptible", "generic_work", "startup",
+                 "vsync_highlight_color"]
+
+
+def _tile_color(detail):
+    """Return the palette color name for the `tile=N` field in `detail`.
+
+    The tile id indexes _TILE_PALETTE modulo its length. Returns None when
+    `detail` is empty/None or has no non-negative `tile=` field.
+    """
+    found = re.search(r"\btile=(\d+)", detail or "")
+    if found is None:
+        return None
+    return _TILE_PALETTE[int(found.group(1)) % len(_TILE_PALETTE)]
+
+
+# Abbreviated DMA opcode names used in slice labels (see _label).
+_DMA_SHORT = {"MOVIN": "MVIN", "MOVOUT": "MVOUT"}
+
+
+def _tile_of(detail):
+    """Return the `tile=` id in `detail` as a string, or "?" when there is none."""
+    ids = re.findall(r"\btile=(-?\d+)", detail or "")
+    return ids[0] if ids else "?"
+
+
+def _label(opcode, detail):
+    """Build the slice name shown on the timeline.
+
+    COMP ops are named "T<tile> <kind>", where <kind> comes from _CT_NAME
+    ("comp" for a missing or unknown compute_type). Every other opcode is
+    treated as DMA and named "<addr_name> (T<tile> <short opcode>)", with "?"
+    standing in for a missing addr_name or tile.
+
+    Args:
+        opcode: opcode string from the trace line.
+        detail: free-form detail text of the line; None is treated as empty.
+
+    Returns:
+        The label string.
+    """
+    # Normalize once so both branches accept None; previously only the DMA
+    # branch did and the COMP branch raised TypeError.
+    detail = detail or ""
+    tile = _tile_of(detail)
+    if opcode == "COMP":
+        m = re.search(r"compute_type=(\d+)", detail)
+        ct = int(m.group(1)) if m else -1
+        return f"T{tile} {_CT_NAME.get(ct, 'comp')}"
+    # DMA: keep each load's OWN identity (addr_name) so the input/weight/K-panel
+    # loads stay distinct; tile is conveyed by color (and args), not the name.
+    m = re.search(r"addr_name=(\w+)", detail)
+    who = m.group(1) if m else "?"
+    return f"{who} (T{tile} {_DMA_SHORT.get(opcode, opcode)})"
+
+
+def _lane(opcode, detail):
+    """Return the unit bucket name for an op.
+
+    COMP ops map to "vector" when compute_type is 0 and to "sa" otherwise
+    (including a missing compute_type). Any other opcode is looked up in _LANE
+    and defaults to "dma". Not referenced elsewhere in this script.
+    """
+    if opcode != "COMP":
+        return _LANE.get(opcode, "dma")
+    found = re.search(r"compute_type=(\d+)", detail)
+    is_vector = found is not None and int(found.group(1)) == 0
+    return "vector" if is_vector else "sa"
+
+
+def parse(lines):
+    """Collect per-instruction event cycles from trace log lines.
+
+    Lines that do not match _LINE, or that lack an INST_ID or an opcode, are
+    ignored. Every other line updates the record keyed by (core, inst_id).
+
+    Args:
+        lines: iterable of log lines (an open file, stdin, or a list of str).
+
+    Returns:
+        dict mapping (core, inst_id) -> record with keys core, iid, opcode,
+        detail, issued, finished, resp, dma_issue and first_resp. The cycle
+        fields stay None until the matching tag is seen.
+    """
+    # Tags that keep only their first cycle vs. tags where the latest one wins.
+    keep_first = {"INST_ISSUED": "issued", "DRAM_RESP_FIRST": "first_resp"}
+    keep_last = {"INST_FINISHED": "finished", "DRAM_RESP_DONE": "resp",
+                 "ASYNC_DMA_ISSUE": "dma_issue"}
+    records = {}
+    for text in lines:
+        hit = _LINE.search(text)
+        if hit is None:
+            continue
+        cycle, core_id, tag, inst_id, opcode, detail = hit.groups()
+        if inst_id is None or opcode is None:
+            continue
+        cycle, core_id, inst_id = int(cycle), int(core_id), int(inst_id)
+        rec = records.get((core_id, inst_id))
+        if rec is None:
+            rec = records[(core_id, inst_id)] = {
+                "core": core_id, "iid": inst_id, "opcode": opcode, "detail": detail,
+                "issued": None, "finished": None, "resp": None, "dma_issue": None,
+                "first_resp": None}
+        # Only lines repeating the record's opcode may refresh it, and an empty
+        # detail never overwrites a previously captured one.
+        if not rec["opcode"] or rec["opcode"] == opcode:
+            rec["opcode"] = opcode
+            if detail.strip():
+                rec["detail"] = detail
+        if tag in keep_first:
+            if rec[keep_first[tag]] is None:
+                rec[keep_first[tag]] = cycle
+        elif tag in keep_last:
+            rec[keep_last[tag]] = cycle
+    return records
+
+
+def _occ(detail):
+    """Return (compute_cycle, overlapping_cycle) parsed from a COMP detail string.
+
+    A field that is absent from `detail` is reported as 0.
+    """
+    hits = [re.search(key + r"=(\d+)", detail)
+            for key in ("compute_cycle", "overlapping_cycle")]
+    return tuple(int(h.group(1)) if h else 0 for h in hits)
+
+
+def to_chrome(insts, num_sa=1):
+    """Build a Chrome Trace Event dict from parsed instruction records.
+
+    Each hardware unit is modeled as a server and its ops are replayed in
+    order, so real idle gaps (bubbles) show and slices don't nest:
+      dram-rd : MOVIN  -- bar = [first DRAM response, data-ready].
+      dram-wr : MOVOUT -- bar = [issued, finished].
+                Each direction is serialized in completion order; ops missing
+                either endpoint (or with end <= start) are not drawn.
+      vector  : COMP type 0 -- 1 VPU.
+      sa      : every other COMP (type 1/2) -- each op on the SA the Core
+                reports (`sa=` field); round-robin over `num_sa` by issue order
+                when the field is absent or negative. Lanes are named sa0..
+                when a core has more than one SA, otherwise just "sa".
+    A compute slice's width is compute_cycle - overlapping_cycle (at least 1),
+    starting when the unit actually picks it up: start = max(issue, unit_free).
+    Compute ops that were never issued are skipped.
+
+    Args:
+        insts: mapping of (core, inst_id) -> record, as returned by parse().
+        num_sa: systolic arrays per core; values below 1 are treated as 1.
+
+    Returns:
+        {"traceEvents": [...], "displayTimeUnit": "ns"}: the "X" duration
+        slices followed by process/thread naming and sort-order metadata.
+    """
+    # Bucket the drawable ops of every core by the unit that executes them.
+    by_core = {}
+    for r in insts.values():
+        op, detail, core = r["opcode"], r["detail"], r["core"]
+        if op in _HIDE:
+            continue
+        u = by_core.setdefault(core, {"dma": [], "vector": [], "sa": []})
+        if op == "COMP":
+            m = re.search(r"compute_type=(\d+)", detail)
+            ct = int(m.group(1)) if m else -1
+            u["vector" if ct == 0 else "sa"].append(r)
+        else:
+            u["dma"].append(r)
+
+    events, lanes, cores = [], set(), set()
+
+    def add(core, lane, ts, dur, name, r):
+        # Emit one complete ("X") slice and remember which lanes/cores are used.
+        lanes.add((core, lane))
+        cores.add(core)
+        args = {"inst_id": r["iid"], "tile": _tile_of(r["detail"]),
+                "issued": r["issued"], "first_data": r["first_resp"],
+                "finished": r["finished"], "data_ready": r["resp"]}
+        am = re.search(r"addr_name=(\w+)", r["detail"] or "")
+        if am:
+            args["addr"] = am.group(1)
+        ev = {"name": name, "cat": lane, "ph": "X", "ts": ts,
+              "dur": max(dur, 1), "pid": core, "tid": lane, "args": args}
+        cname = _tile_color(r["detail"])
+        if cname:
+            ev["cname"] = cname
+        events.append(ev)
+
+    def issue_key(r):
+        # Ops without an issue cycle sort first; the compute loops skip them.
+        return r["issued"] if r["issued"] is not None else 0
+
+    nsa = max(num_sa, 1)
+    for core, u in sorted(by_core.items()):
+        # DMA data crossing the DRAM bus, split by direction (reads and writes are
+        # asymmetric). A LOAD's data comes back on the response, so its bar runs
+        # [first DRAM response, data-ready]. A STORE's data goes out with the
+        # request (fire-and-forget; its acks arrive after it has finished), so its
+        # bar runs [issued, finished]. Serialized per direction so each op is one
+        # visible bar: a packed row = the bus is saturated, gaps = it is idle.
+        for lane, op, sk, ek in (("dram-rd", "MOVIN", "first_resp", "resp"),
+                                 ("dram-wr", "MOVOUT", "issued", "finished")):
+            free = 0
+            rows = [r for r in u["dma"] if r["opcode"] == op
+                    and r[sk] is not None and r[ek] is not None and r[ek] > r[sk]]
+            for r in sorted(rows, key=lambda r: r[ek]):
+                start = max(r[sk], free)
+                free = max(r[ek], start + 1)
+                add(core, lane, start, free - start, _label(r["opcode"], r["detail"]), r)
+        # VPU: one server; slice = occupancy (compute_cycle - overlapping_cycle).
+        free = 0
+        for r in sorted(u["vector"], key=issue_key):
+            if r["issued"] is None:
+                continue
+            cc, ov = _occ(r["detail"])
+            dur = max(cc - ov, 1)
+            start = max(r["issued"], free)
+            free = start + dur
+            add(core, "vector", start, dur, "vector", r)
+        # SA: each op runs on the systolic array the Core reports (the `sa=` field
+        # = its weight-pinned / round-robin assignment); fall back to round-robin
+        # by issue order for older logs without the field. Each SA is one server.
+        rows = sorted(u["sa"], key=issue_key)
+
+        def _sa_of(r, i):
+            m = re.search(r"\bsa=(-?\d+)", r["detail"])
+            return int(m.group(1)) if (m and int(m.group(1)) >= 0) else (i % nsa)
+
+        max_sa = max([nsa] + [_sa_of(r, i) + 1 for i, r in enumerate(rows)])
+        sa_free = [0] * max_sa
+        for i, r in enumerate(rows):
+            if r["issued"] is None:
+                continue
+            s = _sa_of(r, i)
+            cc, ov = _occ(r["detail"])
+            dur = max(cc - ov, 1)
+            start = max(r["issued"], sa_free[s])
+            sa_free[s] = start + dur
+            lane = "sa" if max_sa == 1 else f"sa{s}"
+            add(core, lane, start, dur, _label(r["opcode"], r["detail"]), r)
+
+    for c in sorted(cores):
+        events.append({"name": "process_name", "ph": "M", "pid": c, "tid": 0,
+                       "args": {"name": f"Core {c}"}})
+    # Row order within a core: dram-rd, dram-wr, the SA lanes, then vector. An
+    # saN lane ranks 2+N for any N; the fixed table used to stop at sa3, so
+    # sa4 and above all fell back to rank 5 and tied with sa3.
+    sa_ids = sorted({int(lane[2:]) for _, lane in lanes
+                     if lane.startswith("sa") and lane[2:].isdigit()})
+    order = {"dram-rd": 0, "dram-wr": 1, "sa": 2}
+    order.update((f"sa{n}", 2 + n) for n in sa_ids)
+    # vector keeps its historical rank 7 unless the SA lanes reach that far.
+    order["vector"] = max(7, sa_ids[-1] + 3) if sa_ids else 7
+    for c, lane in sorted(lanes, key=lambda x: (x[0], order.get(x[1], 5))):
+        events.append({"name": "thread_name", "ph": "M", "pid": c, "tid": lane,
+                       "args": {"name": lane}})
+        events.append({"name": "thread_sort_index", "ph": "M", "pid": c, "tid": lane,
+                       "args": {"sort_index": order.get(lane, 5)}})
+    return {"traceEvents": events, "displayTimeUnit": "ns"}
+
+
+def main(argv):
+    """Command-line entry point.
+
+    Reads the trace log named by the positional argument (stdin when omitted),
+    converts it with parse() and to_chrome(), writes the JSON to --out and
+    reports the slice count on stderr.
+
+    Args:
+        argv: full argument vector; argv[0] (the program name) is ignored.
+
+    Returns:
+        0, for use as the process exit status.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("input", nargs="?", help="trace log file (default: stdin)")
+    ap.add_argument("-o", "--out", default="timeline.json")
+    ap.add_argument("-s", "--num-sa", type=int, default=1,
+                    help="systolic arrays per core (num_systolic_array_per_core); "
+                         ">1 splits into sa0..saN-1 lanes")
+    a = ap.parse_args(argv[1:])
+    if a.input:
+        # Close the log file deterministically instead of leaking the handle.
+        with open(a.input) as src:
+            insts = parse(src)
+    else:
+        insts = parse(sys.stdin)  # stdin belongs to the caller; leave it open
+    trace = to_chrome(insts, a.num_sa)
+    with open(a.out, "w") as fh:
+        json.dump(trace, fh)
+    n = sum(1 for e in trace["traceEvents"] if e["ph"] == "X")
+    sys.stderr.write(f"wrote {a.out}: {n} slices -> open in https://ui.perfetto.dev\n")
+    return 0
+
+
+# Run as a script: exit with main()'s return value.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/tests/fixtures/gemm256_postvcix.mlir b/tests/fixtures/gemm256_postvcix.mlir
new file mode 100644
index 00000000..740e8ab2
--- /dev/null
+++ b/tests/fixtures/gemm256_postvcix.mlir
@@ -0,0 +1,419 @@
+#map = affine_map<(d0, d1) -> (d0 * 256 + d1)>
+#map1 = affine_map<(d0, d1) -> (d0 * 65536 + d1 * 256)>
+#map2 = affine_map<(d0, d1) -> (d0 + d1)>
+#map3 = affine_map<(d0, d1) -> (d0 * 256 + d1 * 512)>
+#map4 = affine_map<(d0, d1, d2) -> (-d0 + d1 + d2 floordiv 2)>
+#map5 = affine_map<(d0, d1, d2)[s0, s1] -> (d0 * s0 + d1 * s1 + d2)>
+#map6 = affine_map<(d0)[s0] -> (d0 floordiv s0)>
+#map7 = affine_map<(d0)[s0] -> (d0 mod s0)>
+#map8 = affine_map<(d0, d1, d2) -> (-d0 + d1 * 2 + d2)>
+module {
+  memref.global @X_spad : memref<256x256xf32, 1>
+  memref.global @W_spad : memref<256x256xf32, 1>
+  memref.global @Y_spad : memref<256x256xf32, 1>
+  func.func @kernel(%arg0: memref<65536xf32>, %arg1: memref<65536xf32>, %arg2: memref<65536xf32>) {
+    %0 = memref.get_global @X_spad : memref<256x256xf32, 1>
+    %1 = memref.get_global @W_spad : memref<256x256xf32, 1>
+    %2 = memref.get_global @Y_spad : memref<256x256xf32, 1>
+    %cst = arith.constant dense<0.000000e+00> : vector<512xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %alloc = memref.alloc() : memref<1xi32>
+    affine.for %arg3 = 0 to 256 step 256 {
+      affine.for %arg4 = 0 to 256 step 256 {
+        affine.vector_store %cst, %2[0, 0] : memref<256x256xf32, 1>, vector<512xf32>
+        affine.for %arg5 = 0 to 256 step 256 {
+          %4 = affine.apply #map(%arg3, %arg5)
+          %c1_1 = arith.constant 1 : index
+          %alloc_2 = memref.alloc() : memref<1xi32>
+          %5 = affine.apply #map(%arg5, %arg4)
+          %c1_3 = arith.constant 1 : index
+          %alloc_4 = memref.alloc() : memref<1xi32>
+          %c0_5 = arith.constant 0 : index
+          %c0_6 = arith.constant 0 : index
+          %c0_7 = arith.constant 0 : index
+          %6 = affine.apply #map1(%c0_5, %c0_6)
+          %7 = affine.apply #map2(%6, %4)
+          %8 = affine.apply #map3(%c0_5, %c0_6)
+          %9 = affine.apply #map2(%c0_5, %c0_6)
+          memref.dma_start %arg0[%7], %0[%c0_7, %8], %c2, %alloc_2[%9], %c1_1, %c1 : memref<65536xf32>, memref<256x256xf32, 1>, memref<1xi32> {async = true, dram_stride = [256, 1], fine_grained = true, sram_stride = [1, 256], subtile_size = [256, 256]}
+          %c0_8 = arith.constant 0 : index
+          %c0_9 = arith.constant 0 : index
+          %c0_10 = arith.constant 0 : index
+          %10 = affine.apply #map1(%c0_8, %c0_9)
+          %11 = affine.apply #map2(%10, %5)
+          %12 = affine.apply #map3(%c0_8, %c0_9)
+          %13 = affine.apply #map2(%c0_8, %c0_9)
+          memref.dma_start %arg1[%11], %1[%c0_10, %12], %c2, %alloc_4[%13], %c1_3, %c1 : memref<65536xf32>, memref<256x256xf32, 1>, memref<1xi32> {async = true, dram_stride = [256, 1], fine_grained = true, sram_stride = [1, 256], subtile_size = [256, 256]}
+          %c0_11 = arith.constant 0 : index
+          %c8_i64 = arith.constant 8 : i64
+          %c256 = arith.constant 256 : index
+          %c256_12 = arith.constant 256 : index
+          %c256_13 = arith.constant 256 : index
+          %c128 = arith.constant 128 : index
+          %c1_14 = arith.constant 1 : index
+          %cst_15 = arith.constant 0.000000e+00 : f32
+          affine.for %arg6 = 0 to 2 {
+            affine.for %arg7 = 0 to 2 {
+              %14 = affine.apply #map4(%arg5, %c0_11, %c0_11)
+              memref.dma_wait %alloc_4[%14], %c1_14 : memref<1xi32>
+              %c0_16 = arith.constant 0 : index
+              %c128_17 = arith.constant 128 : index
+              %15 = affine.apply #map5(%arg6, %arg7, %c0_16)[%c256, %c128_17]
+              %16 = affine.apply #map6(%15)[%c256_12]
+              %17 = affine.apply #map7(%15)[%c256_12]
+              %18 = vector.transfer_read %1[%16, %17], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%18, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c8 = arith.constant 8 : index
+              %c128_18 = arith.constant 128 : index
+              %19 = affine.apply #map5(%arg6, %arg7, %c8)[%c256, %c128_18]
+              %20 = affine.apply #map6(%19)[%c256_12]
+              %21 = affine.apply #map7(%19)[%c256_12]
+              %22 = vector.transfer_read %1[%20, %21], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%22, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c16 = arith.constant 16 : index
+              %c128_19 = arith.constant 128 : index
+              %23 = affine.apply #map5(%arg6, %arg7, %c16)[%c256, %c128_19]
+              %24 = affine.apply #map6(%23)[%c256_12]
+              %25 = affine.apply #map7(%23)[%c256_12]
+              %26 = vector.transfer_read %1[%24, %25], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%26, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c24 = arith.constant 24 : index
+              %c128_20 = arith.constant 128 : index
+              %27 = affine.apply #map5(%arg6, %arg7, %c24)[%c256, %c128_20]
+              %28 = affine.apply #map6(%27)[%c256_12]
+              %29 = affine.apply #map7(%27)[%c256_12]
+              %30 = vector.transfer_read %1[%28, %29], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%30, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c32 = arith.constant 32 : index
+              %c128_21 = arith.constant 128 : index
+              %31 = affine.apply #map5(%arg6, %arg7, %c32)[%c256, %c128_21]
+              %32 = affine.apply #map6(%31)[%c256_12]
+              %33 = affine.apply #map7(%31)[%c256_12]
+              %34 = vector.transfer_read %1[%32, %33], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%34, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c40 = arith.constant 40 : index
+              %c128_22 = arith.constant 128 : index
+              %35 = affine.apply #map5(%arg6, %arg7, %c40)[%c256, %c128_22]
+              %36 = affine.apply #map6(%35)[%c256_12]
+              %37 = affine.apply #map7(%35)[%c256_12]
+              %38 = vector.transfer_read %1[%36, %37], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%38, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c48 = arith.constant 48 : index
+              %c128_23 = arith.constant 128 : index
+              %39 = affine.apply #map5(%arg6, %arg7, %c48)[%c256, %c128_23]
+              %40 = affine.apply #map6(%39)[%c256_12]
+              %41 = affine.apply #map7(%39)[%c256_12]
+              %42 = vector.transfer_read %1[%40, %41], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%42, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c56 = arith.constant 56 : index
+              %c128_24 = arith.constant 128 : index
+              %43 = affine.apply #map5(%arg6, %arg7, %c56)[%c256, %c128_24]
+              %44 = affine.apply #map6(%43)[%c256_12]
+              %45 = affine.apply #map7(%43)[%c256_12]
+              %46 = vector.transfer_read %1[%44, %45], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%46, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c64 = arith.constant 64 : index
+              %c128_25 = arith.constant 128 : index
+              %47 = affine.apply #map5(%arg6, %arg7, %c64)[%c256, %c128_25]
+              %48 = affine.apply #map6(%47)[%c256_12]
+              %49 = affine.apply #map7(%47)[%c256_12]
+              %50 = vector.transfer_read %1[%48, %49], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%50, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c72 = arith.constant 72 : index
+              %c128_26 = arith.constant 128 : index
+              %51 = affine.apply #map5(%arg6, %arg7, %c72)[%c256, %c128_26]
+              %52 = affine.apply #map6(%51)[%c256_12]
+              %53 = affine.apply #map7(%51)[%c256_12]
+              %54 = vector.transfer_read %1[%52, %53], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%54, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c80 = arith.constant 80 : index
+              %c128_27 = arith.constant 128 : index
+              %55 = affine.apply #map5(%arg6, %arg7, %c80)[%c256, %c128_27]
+              %56 = affine.apply #map6(%55)[%c256_12]
+              %57 = affine.apply #map7(%55)[%c256_12]
+              %58 = vector.transfer_read %1[%56, %57], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%58, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c88 = arith.constant 88 : index
+              %c128_28 = arith.constant 128 : index
+              %59 = affine.apply #map5(%arg6, %arg7, %c88)[%c256, %c128_28]
+              %60 = affine.apply #map6(%59)[%c256_12]
+              %61 = affine.apply #map7(%59)[%c256_12]
+              %62 = vector.transfer_read %1[%60, %61], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%62, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c96 = arith.constant 96 : index
+              %c128_29 = arith.constant 128 : index
+              %63 = affine.apply #map5(%arg6, %arg7, %c96)[%c256, %c128_29]
+              %64 = affine.apply #map6(%63)[%c256_12]
+              %65 = affine.apply #map7(%63)[%c256_12]
+              %66 = vector.transfer_read %1[%64, %65], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%66, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c104 = arith.constant 104 : index
+              %c128_30 = arith.constant 128 : index
+              %67 = affine.apply #map5(%arg6, %arg7, %c104)[%c256, %c128_30]
+              %68 = affine.apply #map6(%67)[%c256_12]
+              %69 = affine.apply #map7(%67)[%c256_12]
+              %70 = vector.transfer_read %1[%68, %69], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%70, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c112 = arith.constant 112 : index
+              %c128_31 = arith.constant 128 : index
+              %71 = affine.apply #map5(%arg6, %arg7, %c112)[%c256, %c128_31]
+              %72 = affine.apply #map6(%71)[%c256_12]
+              %73 = affine.apply #map7(%71)[%c256_12]
+              %74 = vector.transfer_read %1[%72, %73], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%74, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              %c120 = arith.constant 120 : index
+              %c128_32 = arith.constant 128 : index
+              %75 = affine.apply #map5(%arg6, %arg7, %c120)[%c256, %c128_32]
+              %76 = affine.apply #map6(%75)[%c256_12]
+              %77 = affine.apply #map7(%75)[%c256_12]
+              %78 = vector.transfer_read %1[%76, %77], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+              "vcix.iv"(%78, %c8_i64) {imm = 0 : i64, opcode = 1 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+              affine.for %arg8 = 0 to 2 {
+                %79 = affine.apply #map8(%arg5, %c0_11, %c0_11)
+                memref.dma_wait %alloc_2[%79], %c1_14 : memref<1xi32>
+                %c0_33 = arith.constant 0 : index
+                %80 = affine.apply #map5(%arg7, %arg8, %c0_33)[%c256_13, %c128]
+                %81 = affine.apply #map6(%80)[%c256]
+                %82 = affine.apply #map7(%80)[%c256]
+                %83 = vector.transfer_read %0[%81, %82], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%83, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c8_34 = arith.constant 8 : index
+                %84 = affine.apply #map5(%arg7, %arg8, %c8_34)[%c256_13, %c128]
+                %85 = affine.apply #map6(%84)[%c256]
+                %86 = affine.apply #map7(%84)[%c256]
+                %87 = vector.transfer_read %0[%85, %86], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%87, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c16_35 = arith.constant 16 : index
+                %88 = affine.apply #map5(%arg7, %arg8, %c16_35)[%c256_13, %c128]
+                %89 = affine.apply #map6(%88)[%c256]
+                %90 = affine.apply #map7(%88)[%c256]
+                %91 = vector.transfer_read %0[%89, %90], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%91, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c24_36 = arith.constant 24 : index
+                %92 = affine.apply #map5(%arg7, %arg8, %c24_36)[%c256_13, %c128]
+                %93 = affine.apply #map6(%92)[%c256]
+                %94 = affine.apply #map7(%92)[%c256]
+                %95 = vector.transfer_read %0[%93, %94], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%95, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c32_37 = arith.constant 32 : index
+                %96 = affine.apply #map5(%arg7, %arg8, %c32_37)[%c256_13, %c128]
+                %97 = affine.apply #map6(%96)[%c256]
+                %98 = affine.apply #map7(%96)[%c256]
+                %99 = vector.transfer_read %0[%97, %98], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%99, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c40_38 = arith.constant 40 : index
+                %100 = affine.apply #map5(%arg7, %arg8, %c40_38)[%c256_13, %c128]
+                %101 = affine.apply #map6(%100)[%c256]
+                %102 = affine.apply #map7(%100)[%c256]
+                %103 = vector.transfer_read %0[%101, %102], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%103, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c48_39 = arith.constant 48 : index
+                %104 = affine.apply #map5(%arg7, %arg8, %c48_39)[%c256_13, %c128]
+                %105 = affine.apply #map6(%104)[%c256]
+                %106 = affine.apply #map7(%104)[%c256]
+                %107 = vector.transfer_read %0[%105, %106], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%107, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c56_40 = arith.constant 56 : index
+                %108 = affine.apply #map5(%arg7, %arg8, %c56_40)[%c256_13, %c128]
+                %109 = affine.apply #map6(%108)[%c256]
+                %110 = affine.apply #map7(%108)[%c256]
+                %111 = vector.transfer_read %0[%109, %110], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%111, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c64_41 = arith.constant 64 : index
+                %112 = affine.apply #map5(%arg7, %arg8, %c64_41)[%c256_13, %c128]
+                %113 = affine.apply #map6(%112)[%c256]
+                %114 = affine.apply #map7(%112)[%c256]
+                %115 = vector.transfer_read %0[%113, %114], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%115, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c72_42 = arith.constant 72 : index
+                %116 = affine.apply #map5(%arg7, %arg8, %c72_42)[%c256_13, %c128]
+                %117 = affine.apply #map6(%116)[%c256]
+                %118 = affine.apply #map7(%116)[%c256]
+                %119 = vector.transfer_read %0[%117, %118], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%119, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c80_43 = arith.constant 80 : index
+                %120 = affine.apply #map5(%arg7, %arg8, %c80_43)[%c256_13, %c128]
+                %121 = affine.apply #map6(%120)[%c256]
+                %122 = affine.apply #map7(%120)[%c256]
+                %123 = vector.transfer_read %0[%121, %122], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%123, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c88_44 = arith.constant 88 : index
+                %124 = affine.apply #map5(%arg7, %arg8, %c88_44)[%c256_13, %c128]
+                %125 = affine.apply #map6(%124)[%c256]
+                %126 = affine.apply #map7(%124)[%c256]
+                %127 = vector.transfer_read %0[%125, %126], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%127, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c96_45 = arith.constant 96 : index
+                %128 = affine.apply #map5(%arg7, %arg8, %c96_45)[%c256_13, %c128]
+                %129 = affine.apply #map6(%128)[%c256]
+                %130 = affine.apply #map7(%128)[%c256]
+                %131 = vector.transfer_read %0[%129, %130], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%131, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c104_46 = arith.constant 104 : index
+                %132 = affine.apply #map5(%arg7, %arg8, %c104_46)[%c256_13, %c128]
+                %133 = affine.apply #map6(%132)[%c256]
+                %134 = affine.apply #map7(%132)[%c256]
+                %135 = vector.transfer_read %0[%133, %134], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%135, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c112_47 = arith.constant 112 : index
+                %136 = affine.apply #map5(%arg7, %arg8, %c112_47)[%c256_13, %c128]
+                %137 = affine.apply #map6(%136)[%c256]
+                %138 = affine.apply #map7(%136)[%c256]
+                %139 = vector.transfer_read %0[%137, %138], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%139, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                %c120_48 = arith.constant 120 : index
+                %140 = affine.apply #map5(%arg7, %arg8, %c120_48)[%c256_13, %c128]
+                %141 = affine.apply #map6(%140)[%c256]
+                %142 = affine.apply #map7(%140)[%c256]
+                %143 = vector.transfer_read %0[%141, %142], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                "vcix.iv"(%143, %c8_i64) {imm = 0 : i64, opcode = 0 : i64, rd = 0 : i64} : (vector<8xf32>, i64) -> ()
+                "vcix.i"(%c8_i64) {imm = 4 : i64, lmul = 0 : i64, opcode = 1 : i64, rd = 0 : i64, rs2 = 0 : i64, sew = 32 : i64} : (i64) -> ()
+                %c0_49 = arith.constant 0 : index
+                %144 = affine.apply #map5(%arg6, %arg8, %c0_49)[%c256_13, %c128]
+                %145 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %146 = affine.apply #map6(%144)[%c256_12]
+                %147 = affine.apply #map7(%144)[%c256_12]
+                %148 = vector.transfer_read %2[%146, %147], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %149 = arith.addf %148, %145 : vector<8xf32>
+                vector.transfer_write %149, %2[%146, %147] : vector<8xf32>, memref<256x256xf32, 1>
+                %c8_50 = arith.constant 8 : index
+                %150 = affine.apply #map5(%arg6, %arg8, %c8_50)[%c256_13, %c128]
+                %151 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %152 = affine.apply #map6(%150)[%c256_12]
+                %153 = affine.apply #map7(%150)[%c256_12]
+                %154 = vector.transfer_read %2[%152, %153], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %155 = arith.addf %154, %151 : vector<8xf32>
+                vector.transfer_write %155, %2[%152, %153] : vector<8xf32>, memref<256x256xf32, 1>
+                %c16_51 = arith.constant 16 : index
+                %156 = affine.apply #map5(%arg6, %arg8, %c16_51)[%c256_13, %c128]
+                %157 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %158 = affine.apply #map6(%156)[%c256_12]
+                %159 = affine.apply #map7(%156)[%c256_12]
+                %160 = vector.transfer_read %2[%158, %159], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %161 = arith.addf %160, %157 : vector<8xf32>
+                vector.transfer_write %161, %2[%158, %159] : vector<8xf32>, memref<256x256xf32, 1>
+                %c24_52 = arith.constant 24 : index
+                %162 = affine.apply #map5(%arg6, %arg8, %c24_52)[%c256_13, %c128]
+                %163 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %164 = affine.apply #map6(%162)[%c256_12]
+                %165 = affine.apply #map7(%162)[%c256_12]
+                %166 = vector.transfer_read %2[%164, %165], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %167 = arith.addf %166, %163 : vector<8xf32>
+                vector.transfer_write %167, %2[%164, %165] : vector<8xf32>, memref<256x256xf32, 1>
+                %c32_53 = arith.constant 32 : index
+                %168 = affine.apply #map5(%arg6, %arg8, %c32_53)[%c256_13, %c128]
+                %169 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %170 = affine.apply #map6(%168)[%c256_12]
+                %171 = affine.apply #map7(%168)[%c256_12]
+                %172 = vector.transfer_read %2[%170, %171], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %173 = arith.addf %172, %169 : vector<8xf32>
+                vector.transfer_write %173, %2[%170, %171] : vector<8xf32>, memref<256x256xf32, 1>
+                %c40_54 = arith.constant 40 : index
+                %174 = affine.apply #map5(%arg6, %arg8, %c40_54)[%c256_13, %c128]
+                %175 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %176 = affine.apply #map6(%174)[%c256_12]
+                %177 = affine.apply #map7(%174)[%c256_12]
+                %178 = vector.transfer_read %2[%176, %177], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %179 = arith.addf %178, %175 : vector<8xf32>
+                vector.transfer_write %179, %2[%176, %177] : vector<8xf32>, memref<256x256xf32, 1>
+                %c48_55 = arith.constant 48 : index
+                %180 = affine.apply #map5(%arg6, %arg8, %c48_55)[%c256_13, %c128]
+                %181 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %182 = affine.apply #map6(%180)[%c256_12]
+                %183 = affine.apply #map7(%180)[%c256_12]
+                %184 = vector.transfer_read %2[%182, %183], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %185 = arith.addf %184, %181 : vector<8xf32>
+                vector.transfer_write %185, %2[%182, %183] : vector<8xf32>, memref<256x256xf32, 1>
+                %c56_56 = arith.constant 56 : index
+                %186 = affine.apply #map5(%arg6, %arg8, %c56_56)[%c256_13, %c128]
+                %187 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %188 = affine.apply #map6(%186)[%c256_12]
+                %189 = affine.apply #map7(%186)[%c256_12]
+                %190 = vector.transfer_read %2[%188, %189], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %191 = arith.addf %190, %187 : vector<8xf32>
+                vector.transfer_write %191, %2[%188, %189] : vector<8xf32>, memref<256x256xf32, 1>
+                %c64_57 = arith.constant 64 : index
+                %192 = affine.apply #map5(%arg6, %arg8, %c64_57)[%c256_13, %c128]
+                %193 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %194 = affine.apply #map6(%192)[%c256_12]
+                %195 = affine.apply #map7(%192)[%c256_12]
+                %196 = vector.transfer_read %2[%194, %195], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %197 = arith.addf %196, %193 : vector<8xf32>
+                vector.transfer_write %197, %2[%194, %195] : vector<8xf32>, memref<256x256xf32, 1>
+                %c72_58 = arith.constant 72 : index
+                %198 = affine.apply #map5(%arg6, %arg8, %c72_58)[%c256_13, %c128]
+                %199 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %200 = affine.apply #map6(%198)[%c256_12]
+                %201 = affine.apply #map7(%198)[%c256_12]
+                %202 = vector.transfer_read %2[%200, %201], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %203 = arith.addf %202, %199 : vector<8xf32>
+                vector.transfer_write %203, %2[%200, %201] : vector<8xf32>, memref<256x256xf32, 1>
+                %c80_59 = arith.constant 80 : index
+                %204 = affine.apply #map5(%arg6, %arg8, %c80_59)[%c256_13, %c128]
+                %205 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %206 = affine.apply #map6(%204)[%c256_12]
+                %207 = affine.apply #map7(%204)[%c256_12]
+                %208 = vector.transfer_read %2[%206, %207], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %209 = arith.addf %208, %205 : vector<8xf32>
+                vector.transfer_write %209, %2[%206, %207] : vector<8xf32>, memref<256x256xf32, 1>
+                %c88_60 = arith.constant 88 : index
+                %210 = affine.apply #map5(%arg6, %arg8, %c88_60)[%c256_13, %c128]
+                %211 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %212 = affine.apply #map6(%210)[%c256_12]
+                %213 = affine.apply #map7(%210)[%c256_12]
+                %214 = vector.transfer_read %2[%212, %213], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %215 = arith.addf %214, %211 : vector<8xf32>
+                vector.transfer_write %215, %2[%212, %213] : vector<8xf32>, memref<256x256xf32, 1>
+                %c96_61 = arith.constant 96 : index
+                %216 = affine.apply #map5(%arg6, %arg8, %c96_61)[%c256_13, %c128]
+                %217 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %218 = affine.apply #map6(%216)[%c256_12]
+                %219 = affine.apply #map7(%216)[%c256_12]
+                %220 = vector.transfer_read %2[%218, %219], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %221 = arith.addf %220, %217 : vector<8xf32>
+                vector.transfer_write %221, %2[%218, %219] : vector<8xf32>, memref<256x256xf32, 1>
+                %c104_62 = arith.constant 104 : index
+                %222 = affine.apply #map5(%arg6, %arg8, %c104_62)[%c256_13, %c128]
+                %223 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %224 = affine.apply #map6(%222)[%c256_12]
+                %225 = affine.apply #map7(%222)[%c256_12]
+                %226 = vector.transfer_read %2[%224, %225], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %227 = arith.addf %226, %223 : vector<8xf32>
+                vector.transfer_write %227, %2[%224, %225] : vector<8xf32>, memref<256x256xf32, 1>
+                %c112_63 = arith.constant 112 : index
+                %228 = affine.apply #map5(%arg6, %arg8, %c112_63)[%c256_13, %c128]
+                %229 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %230 = affine.apply #map6(%228)[%c256_12]
+                %231 = affine.apply #map7(%228)[%c256_12]
+                %232 = vector.transfer_read %2[%230, %231], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %233 = arith.addf %232, %229 : vector<8xf32>
+                vector.transfer_write %233, %2[%230, %231] : vector<8xf32>, memref<256x256xf32, 1>
+                %c120_64 = arith.constant 120 : index
+                %234 = affine.apply #map5(%arg6, %arg8, %c120_64)[%c256_13, %c128]
+                %235 = "vcix.v.i"(%c8_i64) {imm = 0 : i64, opcode = 2 : i64, rs2 = 0 : i64} : (i64) -> vector<8xf32>
+                %236 = affine.apply #map6(%234)[%c256_12]
+                %237 = affine.apply #map7(%234)[%c256_12]
+                %238 = vector.transfer_read %2[%236, %237], %cst_15 : memref<256x256xf32, 1>, vector<8xf32>
+                %239 = arith.addf %238, %235 : vector<8xf32>
+                vector.transfer_write %239, %2[%236, %237] : vector<8xf32>, memref<256x256xf32, 1>
+              } {inner_loop = true}
+            } {inner_loop = true}
+          } {inner_loop = true}
+        } {accumulation_loop = true, subtile_loop = "k"}
+        affine.for %arg5 = 0 to 1 {
+        } {inner_loop = false}
+        %3 = affine.apply #map(%arg3, %arg4)
+        %c1_0 = arith.constant 1 : index
+        memref.dma_start %2[%c0, %c0], %arg2[%3], %c3, %alloc[%c0], %c1_0, %c1 : memref<256x256xf32, 1>, memref<65536xf32>, memref<1xi32> {dram_stride = [256, 1], padding = 0 : i64, sram_stride = [1, 256]}
+      } {outer_loop = true, subtile_loop = "n"}
+    } {outer_loop = true, subtile_loop = "m"}
+    return
+  }
+  func.func @wrapper_kernel(%arg0: memref<65536xf32>, %arg1: memref<65536xf32>, %arg2: memref<65536xf32>) {
+    call @kernel(%arg0, %arg1, %arg2) : (memref<65536xf32>, memref<65536xf32>, memref<65536xf32>) -> ()
+    return
+  }
+}
diff --git a/tests/test_togsim_emitc.py b/tests/test_togsim_emitc.py
new file mode 100644
index 00000000..b0bd2d8e
--- /dev/null
+++ b/tests/test_togsim_emitc.py
@@ -0,0 +1,152 @@
+"""Tests for the C4 emitc lowering + compiled .so trace producer (P2).
+
+The pipeline under test (docs/design/togsim_cpp_trace.md, sec 5-7):
+
+    post-vcix .mlir --build_skeleton--> skeleton+API
+                    --lower_to_emitc--> EmitC module
+                    --mlir-translate--> C++
+                    --g++ -shared----> trace .so  (exports togsim_kernel;
+                                                    togsim_* left undefined)
+
+`test_build_trace_so` builds the .so and checks the EmitC/symbol-table shape.
+`test_trace_so_runs` additionally dlopens it against a stub runtime and confirms
+the producer executes and emits a non-empty trace with valid barrier tag ids.
+
+Both are skipped unless the MLIR bindings, `mlir-translate` (from
+TORCHSIM_LLVM_PATH), a host C++ compiler, AND a post-vcix `.mlir` fixture (via
+`TOGSIM_SKELETON_FIXTURE`) are available -- the same fixture used by
+test_togsim_skeleton.py.
+"""
+import importlib.util
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pytest
+
+_ROOT = pathlib.Path(__file__).resolve().parents[1]
+_CXX = os.environ.get("CXX", "g++")
+_INCLUDE = _ROOT / "TOGSim" / "include"
+
+
+def _mlir_translate():
+    return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"),
+                        "mlir-translate")
+
+
+def _tools_ready():
+    return (importlib.util.find_spec("mlir") is not None
+            and os.path.isfile(_mlir_translate())
+            and shutil.which(_CXX) is not None)
+
+
+def _fixture():
+    fix = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not fix or not os.path.isfile(fix):
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+    return fix
+
+
+_HARNESS = r'''
+#include <cstdio>
+#include <cstdint>
+#include <cstdlib>
+#include <dlfcn.h>
+#include "togsim_runtime.h"
+static int n_dma=0, n_membar=0, n_compute=0, n_core=0, bad=0;
+extern "C" {
+void togsim_dma(EmitCtx*, int32_t, int32_t, uint64_t, int32_t,
+                const int64_t*, const int64_t*, int32_t, int32_t,
+                int32_t, uint64_t, const int64_t*, int32_t,
+                const int64_t*, int32_t){ ++n_dma; }
+void togsim_compute(EmitCtx*, uint64_t, int32_t, int32_t, const int64_t*,
+                    const int64_t*, int32_t, const int64_t*, int32_t){ ++n_compute; }
+void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t, const int64_t*, int32_t){
+  ++n_membar; if(tag_id<0) ++bad; }   // tag_id pairs it with its async dma
+void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n){
+  ++n_core; fn(ctx, iv, n); }   // count a work-item + run its (outlined) body
+void togsim_compute_barrier(EmitCtx*){}
+}
+int main(int argc, char** argv){
+  void* h = dlopen(argv[1], RTLD_NOW | RTLD_GLOBAL);
+  if(!h){ printf("dlopen failed: %s\n", dlerror()); return 2; }
+  auto emit = (void(*)(EmitCtx*, int64_t*, int32_t))dlsym(h, "togsim_kernel");
+  if(!emit){ printf("dlsym failed: %s\n", dlerror()); return 3; }
+  emit(nullptr, nullptr, 0);
+  printf("TRACE core=%d dma=%d membar=%d compute=%d bad=%d\n",
+         n_core, n_dma, n_membar, n_compute, bad);
+  return 0;
+}
+'''
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler")
+def test_build_trace_so():
+    fix = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as d:
+        so = os.path.join(d, "trace.so")
+        emitc_text = c4.build_trace_so(fix, so)
+        assert os.path.isfile(so)
+
+        # EmitC form: one entry func, dma/memory_barrier/compute as call_opaque targets.
+        assert "emitc.func" in emitc_text
+        assert ("@%s" % c4.ENTRY) in emitc_text
+        assert 'emitc.call_opaque "togsim_dma"' in emitc_text
+        assert 'emitc.call_opaque "togsim_memory_barrier"' in emitc_text
+        assert 'emitc.call_opaque "togsim_compute"' in emitc_text
+
+        # Symbol table: entry exported (defined, text), runtime hooks undefined
+        # so the TOGSim loader resolves them at dlopen.
+        nm = subprocess.run(["nm", "-D", so], capture_output=True, text=True).stdout
+        syms = {parts[-1]: parts[-2] for parts in
+                (ln.split() for ln in nm.splitlines()) if len(parts) >= 2}
+        assert syms.get("togsim_kernel") == "T", nm
+        assert syms.get("togsim_dma") == "U", nm
+        assert syms.get("togsim_dispatch") == "U", nm
+        assert syms.get("togsim_memory_barrier") == "U", nm
+        # The per-work-item dispatch wrapper is emitted (outlined tile fn).
+        assert 'emitc.call_opaque "togsim_dispatch"' in emitc_text
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler")
+def test_trace_so_runs():
+    fix = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as d:
+        so = os.path.join(d, "trace.so")
+        c4.build_trace_so(fix, so)
+
+        harness_cpp = os.path.join(d, "harness.cpp")
+        harness_bin = os.path.join(d, "harness")
+        with open(harness_cpp, "w") as fh:
+            fh.write(_HARNESS)
+        # -rdynamic so the harness's togsim_* are visible to the dlopened .so.
+        build = subprocess.run(
+            [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
+             harness_cpp, "-o", harness_bin, "-ldl"],
+            capture_output=True, text=True)
+        assert build.returncode == 0, build.stderr
+
+        run = subprocess.run([harness_bin, so], capture_output=True, text=True)
+        assert run.returncode == 0, run.stdout + run.stderr
+        out = run.stdout.strip()
+        assert out.startswith("TRACE "), out
+        counts = dict(kv.split("=") for kv in out.split()[1:])
+        # The producer ran and emitted a real trace, with >=1 work-item (core alloc).
+        assert int(counts["core"]) >= 1
+        assert int(counts["dma"]) >= 1
+        assert int(counts["compute"]) >= 1
+        # Async loads are synced by explicit memory barriers, each carrying a
+        # valid (non-negative) tag_id that pairs it with its dma.
+        assert int(counts["membar"]) >= 1, out
+        assert int(counts["bad"]) == 0, out
diff --git a/tests/test_togsim_runtime.py b/tests/test_togsim_runtime.py
new file mode 100644
index 00000000..a5d6cb3d
--- /dev/null
+++ b/tests/test_togsim_runtime.py
@@ -0,0 +1,189 @@
+"""P3 task 5: the TOGSim C6 runtime + loader (togsim_runtime.cc / togsim_loader.h).
+
+Builds a producer `.so` from a post-vcix fixture, links the real C6 runtime, runs
+the loader (`run_producer`) against the `.so`, and checks the recorded trace:
+DRAM addresses are resolved (base[arg_id] + offset*elem_bytes), compute cycles
+are looked up from the cycle table, and each memory barrier matches an async dma.
+
+Uses a checked-in post-vcix `.mlir` fixture (tests/fixtures/), so it is
+self-contained; skipped only when the MLIR bindings, `mlir-translate`, or a C++
+compiler are missing.
+"""
+import importlib.util
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pytest
+
+_ROOT = pathlib.Path(__file__).resolve().parents[1]
+_CXX = os.environ.get("CXX", "g++")
+_INCLUDE = _ROOT / "TOGSim" / "include"
+_RUNTIME = _ROOT / "TOGSim" / "src" / "togsim_runtime.cc"
+
+
+def _mlir_translate():
+    return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"),
+                        "mlir-translate")
+
+
+def _tools_ready():
+    return (importlib.util.find_spec("mlir") is not None
+            and os.path.isfile(_mlir_translate())
+            and shutil.which(_CXX) is not None
+            and _RUNTIME.is_file())
+
+
+# Checked-in post-vcix kernel: a 256^3 single-output-tile GEMM (X/W/Y_spad
+# 256x256), matching the trace assertions below. Self-contained so the test
+# runs wherever the tools are present -- no setup/env needed.
+_FIXTURE = pathlib.Path(__file__).resolve().parent / "fixtures" / "gemm256_postvcix.mlir"
+
+
+def _fixture():
+    if not _FIXTURE.is_file():
+        pytest.skip(f"missing checked-in fixture {_FIXTURE}")
+    return str(_FIXTURE)
+
+
+# Drives the loader with known tensor bases + a synthetic cycle table, then
+# checks the recorded trace. Tailored to a single-output-tile GEMM (256^3):
+# 3 dmas A/B/C at offset 0 -> addr == base; args 0/1/2; dirs load/load/store.
+_MAIN = r'''
+#include <cstdio>
+#include <cstdint>
+#include <utility>
+#include <vector>
+#include "togsim_loader.h"
+using namespace togsim;
+int main(int argc, char** argv) {
+  uint64_t bases[3] = {0x1000, 0x2000, 0x3000};
+  int64_t  cyc[3]   = {100, 200, 300};
+  int64_t  ovl[3]   = {0, 200, 172};
+  int32_t  pcores[1] = {0};  // round-robin work-items over core 0 (single-core harness)
+  RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, pcores, 1);
+  if (!r.ok) { printf("run failed\n"); return 2; }
+  int ndisp=0, nd=0, nc=0, nm=0, fail=0;
+  std::vector<uint64_t> dma_a; std::vector<int> dma_arg, dma_dir;
+  std::vector<std::pair<int,uint64_t>> async_tags;  // (tag_id, tag_slot) of async dmas
+  for (auto& t : r.trace) {
+    if (t.kind == TraceRec::TILE_BEGIN) ndisp++;   // one per work-item
+    else if (t.kind == TraceRec::DMA) {
+      nd++; dma_a.push_back(t.addr);
+      dma_arg.push_back(t.arg_id); dma_dir.push_back(t.dir);
+      if (t.is_async) async_tags.push_back({t.tag_id, t.tag_slot});
+    } else if (t.kind == TraceRec::COMPUTE) {
+      nc++;
+      int64_t want = (t.tile_id < 3) ? cyc[t.tile_id] : -1;
+      if (t.cycle != want) { printf("compute %lu cyc %ld!=%ld\n",
+          (unsigned long)t.tile_id, (long)t.cycle, (long)want); fail++; }
+    } else if (t.kind == TraceRec::MEMORY_BAR) {
+      nm++; bool ok=false;
+      for (auto& k : async_tags) if (k.first==t.tag_id && k.second==t.tag_slot) ok=true;
+      if (!ok) { printf("membar tag (%d,%lu) pairs no async dma\n",
+          t.tag_id, (unsigned long)t.tag_slot); fail++; }
+    }
+  }
+  const uint64_t exp[3] = {0x1000, 0x2000, 0x3000};
+  const int ea[3] = {0,1,2}, ed[3] = {0,0,1};
+  for (int i = 0; i < nd && i < 3; ++i)
+    if (dma_a[i]!=exp[i] || dma_arg[i]!=ea[i] || dma_dir[i]!=ed[i]) {
+      printf("dma[%d] addr=%#lx arg=%d dir=%d\n", i,
+             (unsigned long)dma_a[i], dma_arg[i], dma_dir[i]); fail++;
+    }
+  printf("dispatch=%d dma=%d compute=%d membar=%d fail=%d\n", ndisp, nd, nc, nm, fail);
+  printf(fail ? "RESULT FAIL\n" : "RESULT PASS\n");
+  return fail ? 1 : 0;
+}
+'''
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler + runtime")
+def test_runtime_loads_and_records():
+    """Run the loader on the fixture's trace .so and check the recorded trace."""
+    fix = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as d:
+        so = os.path.join(d, "trace.so")
+        c4.build_trace_so(fix, so)
+        main_cpp = os.path.join(d, "main.cpp")
+        binp = os.path.join(d, "runtime_test")
+        with open(main_cpp, "w") as fh:
+            fh.write(_MAIN)
+        build = subprocess.run(
+            [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
+             main_cpp, str(_RUNTIME), "-o", binp, "-ldl"],
+            capture_output=True, text=True)
+        assert build.returncode == 0, build.stderr
+
+        run = subprocess.run([binp, so], capture_output=True, text=True)
+        out = run.stdout
+        assert "RESULT PASS" in out, out + run.stderr
+        assert run.returncode == 0, out
+        # The harness only address-checks dma[0..2]; require all 3 GEMM dmas here.
+        line = [ln for ln in out.splitlines() if ln.startswith("dispatch=")][0]
+        counts = dict(kv.split("=") for kv in line.split())
+        assert int(counts["dma"]) >= 3, out
+        assert int(counts["compute"]) >= 1, out
+        assert int(counts["fail"]) == 0, out
+
+
+_SIM_MAIN = r'''
+#include <cstdio>
+#include <cstdint>
+#include "togsim_loader.h"
+using namespace togsim;
+int main(int argc, char** argv) {
+  uint64_t bases[3] = {0x1000, 0x2000, 0x3000};
+  int64_t  cyc[3]   = {100, 200, 300};
+  int64_t  ovl[3]   = {0, 200, 172};
+  int32_t  pcores[1] = {0};  // round-robin work-items over core 0 (single-core harness)
+  RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, pcores, 1);
+  if (!r.ok) { printf("run failed\n"); return 2; }
+  TimingParams p; p.dma_latency = 100;
+  SimResult s = simulate(r, p);
+  // serial baseline: no overlap at all.
+  uint64_t serial = 0;
+  for (auto& t : r.trace) {
+    if (t.kind == TraceRec::DMA) serial += p.dma_latency;
+    else if (t.kind == TraceRec::COMPUTE) serial += (uint64_t)t.cycle;
+  }
+  printf("SIM total=%lu compute=%d dma=%d serial=%lu\n",
+         (unsigned long)s.total_cycle, s.n_compute, s.n_dma, (unsigned long)serial);
+  // The trace is schedulable into cycles; overlap (dma||compute, compute
+  // pipelining) makes it no worse than the fully-serial baseline.
+  bool ok = s.total_cycle > 0 && s.n_compute > 0 && s.total_cycle <= serial;
+  printf(ok ? "RESULT PASS\n" : "RESULT FAIL\n");
+  return ok ? 0 : 1;
+}
+'''
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler + runtime")
+def test_simulate_produces_cycles():
+    fix = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as d:
+        so = os.path.join(d, "trace.so")
+        c4.build_trace_so(fix, so)
+        main_cpp = os.path.join(d, "sim.cpp")
+        binp = os.path.join(d, "sim_test")
+        with open(main_cpp, "w") as fh:
+            fh.write(_SIM_MAIN)
+        build = subprocess.run(
+            [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
+             main_cpp, str(_RUNTIME), "-o", binp, "-ldl"],
+            capture_output=True, text=True)
+        assert build.returncode == 0, build.stderr
+        run = subprocess.run([binp, so], capture_output=True, text=True)
+        assert "RESULT PASS" in run.stdout, run.stdout + run.stderr
+        assert run.returncode == 0, run.stdout
diff --git a/tests/test_togsim_skeleton.py b/tests/test_togsim_skeleton.py
new file mode 100644
index 00000000..56601966
--- /dev/null
+++ b/tests/test_togsim_skeleton.py
@@ -0,0 +1,184 @@
+"""Tests for the C++ trace-generation front-end pieces (docs/design/togsim_cpp_trace.md).
+
+Two layers:
+
+* `test_togsim_ops_contract` runs anywhere (no MLIR bindings, no torch). It pins
+  the skeleton+API vocabulary (`togsim_ops.py`) and checks it stays in lockstep
+  with the runtime ABI header (`togsim_runtime.h`) -- the single thing most
+  likely to silently drift.
+* `test_build_skeleton_on_fixture` exercises the real `build_skeleton` pass, and
+  is skipped unless the MLIR bindings are importable AND a post-vcix `.mlir`
+  fixture is supplied via the `TOGSIM_SKELETON_FIXTURE` env var. (A valid
+  build_tog-compatible fixture is hard to hand-write reliably; point this at a
+  kernel dump from a real run.)
+"""
+import os
+import importlib.util
+import pathlib
+
+import pytest
+
+_ROOT = pathlib.Path(__file__).resolve().parents[1]
+_OPS_PY = _ROOT / "PyTorchSimFrontend" / "mlir" / "passes" / "togsim_ops.py"
+_HEADER = _ROOT / "TOGSim" / "include" / "togsim_runtime.h"
+
+
+def _load_togsim_ops():
+    spec = importlib.util.spec_from_file_location("togsim_ops", _OPS_PY)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+def test_togsim_ops_contract():
+    ts = _load_togsim_ops()
+    header = _HEADER.read_text()
+
+    # Every op maps to a callee, and every callee is the header's free function.
+    assert set(ts.EMITC_CALLEE) == set(ts.OP_NAMES)
+    for callee in ts.EMITC_CALLEE.values():
+        assert callee in header, f"{callee} missing from togsim_runtime.h"
+
+    # Entry point symbol agrees with the header.
+    assert ts.ENTRY_SYMBOL == "togsim_kernel"
+    assert ts.ENTRY_SYMBOL in header
+
+    # Runtime callee emitted directly by lower_to_emitc: the work-item dispatch
+    # wrapper. (The outlined tile fn TILE_SYMBOL is producer-generated.)
+    assert ts.DISPATCH_CALLEE in header
+
+    # Direction enum agrees with the header's togsim_dma_dir.
+    assert (ts.DIR_LOAD, ts.DIR_STORE) == (0, 1)
+    assert "TOGSIM_DMA_LOAD  = 0" in header
+    assert "TOGSIM_DMA_STORE = 1" in header
+
+
+def _mlir_available():
+    return importlib.util.find_spec("mlir") is not None
+
+
+@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
+def test_build_skeleton_on_fixture():
+    fixture = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not fixture or not os.path.isfile(fixture):
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+
+    import sys
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import build_skeleton
+
+    import mlir.ir as ir
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(pathlib.Path(fixture).read_text(), ctx)
+        report = build_skeleton.build_skeleton(module)
+        out = str(module)
+
+    # The data-movement ops are gone; the API ops took their place.
+    assert "memref.dma_start" not in out
+    assert "memref.dma_wait" not in out
+    assert "togsim.dma" in out
+    assert "togsim.memory_barrier" in out   # the explicit async-DMA sync (was dma_wait)
+    assert "event_id" not in out            # static pairing replaced by the runtime tag
+    # Loop skeleton is preserved.
+    assert ("affine.for" in out) or ("scf.for" in out)
+    assert module.operation.verify()
+    print(report)
+
+
+@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
+def test_strip_accum_terms_drops_reduction_marker():
+    """Regression: the dma_wait tag index built by lower_to_vcix carries a `-d_i`
+    term for each accumulation (reduction) loop var -- a sentinel marker, not an
+    offset. build_skeleton must drop those so a memory_barrier waits on the same
+    subtile slot the async load wrote; otherwise the producer evaluates `-acc_iv`
+    to a negative slot at reduction iteration > 0, the recorded barrier slot
+    diverges from the load slot, and TOGSim aborts with "Key does not exist in ...
+    tag table" on subtile + multi-tile-K. See docs/design/togsim_cpp_trace.md and
+    legacy TileGraphParser.cc (which skips stride -1 for the same reason)."""
+    import sys
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import build_skeleton as bs
+
+    import mlir.ir as ir
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx, ir.Location.unknown(ctx):
+        module = ir.Module.parse(
+            "func.func @k() {\n"
+            "  %r = arith.constant 1 : index\n"   # stand-in reduction iv
+            "  %a = arith.constant 0 : index\n"   # subtile dim 1
+            "  %b = arith.constant 0 : index\n"   # subtile dim 2
+            "  return\n"
+            "}", ctx)
+        block = module.body.operations[0].regions[0].blocks[0]
+        consts = [op.results[0] for op in block.operations if op.name == "arith.constant"]
+        anchor = [op for op in block.operations if op.name == "func.return"][0]
+        r, a, b = consts
+
+        def neg_dims(val):
+            amap = ir.AffineMapAttr(val.owner.attributes["map"]).value
+            return [p for p in (bs._neg_coeff_dim(s) for s in bs._flatten_add(amap.results[0]))
+                    if p is not None]
+
+        # #map8-style: -d0 (reduction) + d1 + d2 floordiv 2.
+        d0, d1, d2 = (ir.AffineDimExpr.get(i) for i in range(3))
+        expr = d0 * -1 + d1 + ir.AffineExpr.get_floor_div(d2, 2)
+        with ir.InsertionPoint(anchor):
+            apply = ir.Operation.create(
+                "affine.apply", results=[ir.IndexType.get()], operands=[r, a, b],
+                attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(3, 0, [expr]))})
+        tag_in = apply.results[0]
+        assert neg_dims(tag_in) == [0]                       # the reduction marker is present
+
+        tag_out = bs._strip_accum_terms(ctx, tag_in, anchor)
+        assert tag_out is not tag_in                         # a new, reduced apply was emitted
+        out_map = ir.AffineMapAttr(tag_out.owner.attributes["map"]).value
+        assert out_map.n_dims == 2                           # the reduction dim was dropped
+        assert neg_dims(tag_out) == []                       # no reduction marker remains
+        assert list(tag_out.owner.operands) == [a, b]        # only the subtile operands survive
+
+        # No-op: an index with no reduction marker is returned unchanged.
+        plain = d0 + d1
+        with ir.InsertionPoint(anchor):
+            papply = ir.Operation.create(
+                "affine.apply", results=[ir.IndexType.get()], operands=[a, b],
+                attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(2, 0, [plain]))})
+        pin = papply.results[0]
+        assert bs._strip_accum_terms(ctx, pin, anchor) is pin
+
+        assert module.operation.verify()
+
+
+@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
+def test_cycle_table_on_fixture():
+    fixture = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not fixture or not os.path.isfile(fixture):
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+
+    import sys
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import build_skeleton, cycle_table
+
+    import mlir.ir as ir
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(pathlib.Path(fixture).read_text(), ctx)
+        build_skeleton.build_skeleton(module)
+        types = cycle_table._compute_types(module)
+        # synthetic per-tile cycles (gem5 sample-mode is reused at P3 task 5).
+        cyc = [10 * (i + 1) for i in range(len(types))]
+        x_off, w_off = 4, 0
+        table = cycle_table.build_cycle_table(module, cyc, x_off, w_off)
+
+    assert len(table) == len(types) >= 1
+    # cycle is carried verbatim; overlapping_cycle follows the legacy formula.
+    for (cy, ov), t, raw in zip(table, types, cyc):
+        assert cy == raw
+        if t == cycle_table.VECTOR_COMPUTE:
+            assert ov == 0
+        else:
+            off = w_off if t == cycle_table.MATMUL_PRELOAD else x_off
+            assert ov == max(raw - off, 0)