From d73ad80974393ec6f3c129a1da0e22b8ad3fdd9a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 18 Jun 2026 14:25:19 +0900 Subject: [PATCH 1/3] [Frontend] Fix floor/mod guard + decompose unit-dim/vlane-axis edge cases Three fixes from the max-effort review of this branch: - get_dma_info: after retiring the floor/mod recompile branches, a residual floor/mod (store-side ModularIndexing, reduction-axis floor/mod, incompatible radix) that axis-split/graph-copy did not linearize was silently bucketed by its base symbol in the dram_stride loop, emitting a wrong DRAM descriptor. Raise NotImplementedError instead of mis-striding silently. No test triggers it (0 floor/mod reach get_dma_info in the suite) -- it is a safety net. - decompose_transfer collapse fast path: keep=[g[-1]] picked the last dim of each reassociation group, which is a unit dim when trailing unit dims attach after the non-unit one (e.g. [..,4,1,1]); strides/subtile were read from the wrong axis. Pick the non-unit dim in each group. - decompose_transfer >4D peel: new_vlane fell back to 0 whenever the vlane split axis was not among the inner 4 dims, conflating peeled-into-the-outer-loop (genuinely unrepresentable -> raise) with a unit lane axis (default 0 is fine). Validated: elementwise, gemm, conv2d, cat, floor/mod suite (incl. pixel_shuffle >4D peel), softmax, layernorm, batchnorm -- all pass, no spurious raise. Co-Authored-By: Claude Opus 4.8 --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 11 +++++++++++ .../mlir/passes/decompose_transfer.py | 15 +++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 6529d8d9..725e0dc6 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1172,6 +1172,17 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe index = index.subs({s: 0 for s in indirect_syms}, simultaneous=True) indirect_dims = [f"{i}" for i in indirect_syms] + # axis-split + graph-copy linearize aligned floor/mod upstream. Anything that + # reaches here still carrying floor/mod (store-side ModularIndexing, + # reduction-axis floor/mod, incompatible-radix views) would be silently + # mis-strided in the dram_stride computation below, so fail loudly instead. + if index.has(FloorDiv) or index.has(ModularIndexing): + raise NotImplementedError( + f"Unlinearized floor/mod in DMA index: {index}. axis-split/graph-copy " + f"did not eliminate it; this view is unsupported " + f"(see docs/axis-split-scheduling.md)." + ) + # Reduction can have two type of tile size if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)): local_dims = total_dims # Brodatcast tile shape diff --git a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py index 29841f85..c0e82b66 100644 --- a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py +++ b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py @@ -159,7 +159,9 @@ def _emit(sram_mem, sram_indices, dram_idx_val, vsa_val, dr_attr, tl_attr, st_at reassoc = ArrayAttr.get( [ArrayAttr.get([IntegerAttr.get(i64, d) for d in g]) for g in groups]) collapsed_ty = MemRefType.get(target, elem, memory_space=space) - keep = [g[-1] for g in groups] # the non-unit dim in each group + # the non-unit dim in each group (g[-1] is wrong when trailing unit dims + # attach after it, e.g. [..,4,1,1] -> the kept dim must be the extent-4 one). + keep = [next((d for d in g if tile_shape[d] > 1), g[-1]) for g in groups] dr_attr = ArrayAttr.get([IntegerAttr.get(i64, dram_stride[i]) for i in keep]) tl_attr = ArrayAttr.get([IntegerAttr.get(i64, tile_stride[i]) for i in keep]) st_attr = (ArrayAttr.get([IntegerAttr.get(i64, subtile[i]) for i in keep]) @@ -198,7 +200,16 @@ def _emit(sram_mem, sram_indices, dram_idx_val, vsa_val, dr_attr, tl_attr, st_at st_attr = (ArrayAttr.get([IntegerAttr.get(i64, subtile[d]) for d in inner]) if subtile is not None else None) # the vlane axis must survive into the inner descriptor (it is the lane dim). - new_vlane = inner.index(vlane_axis) if vlane_axis in inner else 0 + if vlane_axis in inner: + new_vlane = inner.index(vlane_axis) + elif vlane_axis in peeled: + # lane dim peeled into the outer loop nest: it cannot be expressed in the + # <=4D descriptor, and _phys's lane-banking assumes it is a real axis. + raise NotImplementedError( + f"vlane split axis {vlane_axis} peeled into the outer loop nest; " + f">4D DMA peel cannot place the lane dim in the <=4D descriptor") + else: + new_vlane = 0 # vlane axis is a unit dim; lane on the first inner dim # Lane-banked physical stride for split-outer dims (vlane_stride defaults to 1). split_extent = tile_shape[vlane_axis] From 64d018249f462eec1a14aea9b83bcfdd11c99db8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 18 Jun 2026 14:53:34 +0900 Subject: [PATCH 2/3] [Frontend] graph-copy per-dim ranges + vcix C++-parity guards (review) More fixes from the max-effort review, after verifying each against the C++ reference and reachability: - graph_copy _relayout_args: ranges picked the consumer iteration shape by rank alone (max key=len), so for two equal-rank operands with different per-dim extents the broadcast-from operand's smaller shape could win and the real incompatible-radix conflict on the broadcast-to dim was missed (order-dependent: a commutative reorder flipped correct relayout into a silent miss). Use per-dim max extent over the max-rank operands. - lower_to_vcix _sew/_legalize_vector_type: mirror the C++ legalizeVectorType -- F16/BF16 return sew 0 (transcendentals stay unlowered for -convert-math-to-llvm, as in the validated path) instead of being lowered to VCIX, and add the missing rank != 1 guard. - lower_to_vcix matmul: port the C++ guards as loud failures -- M/N/K must be a multiple of the systolic size when > SS (else the N//SS / K//SS loops drop the tail tile), and A vs B must agree on the K subtile (last-writer-wins would pick one silently). Latent today (heuristic/autotune only emit SS-multiple tiles). - Doc-only: graph-copy is default-on (TORCHSIM_GRAPH_COPY=0 to disable); fixed the two stale 'no-op unless set' comments. Validated: elementwise, gemm, bmm, conv2d, group_conv, cat, floor/mod suite, softmax, layernorm -- all pass. Co-Authored-By: Claude Opus 4.8 --- PyTorchSimFrontend/mlir/graph_copy.py | 13 +++++++++--- PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 +- .../mlir/passes/lower_to_vcix.py | 20 +++++++++++++++++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/PyTorchSimFrontend/mlir/graph_copy.py b/PyTorchSimFrontend/mlir/graph_copy.py index 51c2e9b6..c58fab59 100644 --- a/PyTorchSimFrontend/mlir/graph_copy.py +++ b/PyTorchSimFrontend/mlir/graph_copy.py @@ -14,8 +14,8 @@ realize() (not a clone, which Inductor inlines) is what actually forces the buffer boundary; see the PoC notes in docs. -Gated by TORCHSIM_GRAPH_COPY (install() is a no-op otherwise). Behavior-neutral -unless a genuine incompatible-radix conflict is detected. +Default-on; set TORCHSIM_GRAPH_COPY=0 to disable (install() is then a no-op). +Behavior-neutral unless a genuine incompatible-radix conflict is detected. """ import os from torch._inductor import lowering as L @@ -64,7 +64,14 @@ def _relayout_args(args): # multi-var-view input) this is just that operand's shape -- still enough to # detect a multi-var floor and copy_input it (case 7); the 2-operand radix # conflict (case 5) naturally needs >=2 operands. - ranges = max((t.get_size() for t in tbs), key=len) + # Per-dim max extent over the max-rank operands (order-independent). Picking a + # single operand by rank alone (max key=len) would, for two equal-rank operands + # with different per-dim extents, take a broadcast-from operand's smaller shape + # and then miss the genuine conflict on the broadcast-to dim. + maxrank = max(len(t.get_size()) for t in tbs) + full = [t.get_size() for t in tbs if len(t.get_size()) == maxrank] + ranges = [max((s[d] for s in full), key=lambda v: (axis_split._as_int(v) or -1)) + for d in range(maxrank)] extents = [axis_split._as_int(s) for s in ranges] dbg = os.environ.get("TORCHSIM_GRAPH_COPY_DEBUG") if dbg: diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 48eead47..c78d4c53 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -394,6 +394,6 @@ def get_order(n): # Install the graph-copy (incompatible-radix relayout) lowering hook once at import. -# No-op unless TORCHSIM_GRAPH_COPY is set; see graph_copy.py. +# Default-on; set TORCHSIM_GRAPH_COPY=0 to disable. See graph_copy.py. from . import graph_copy as _graph_copy _graph_copy.install() diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py index 4286ba19..1aa31e96 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py +++ b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py @@ -40,8 +40,9 @@ def _sew(elt_ty): - if ir.F16Type.isinstance(elt_ty) or ir.BF16Type.isinstance(elt_ty): - return 16 + # Mirror C++ legalizeVectorType: only F32/F64/integer/index get a sew. F16/BF16 + # return 0 so transcendental math ops stay unlowered (-convert-math-to-llvm), + # matching the validated path -- do NOT emit VCIX for them here. if ir.F32Type.isinstance(elt_ty): return 32 if ir.F64Type.isinstance(elt_ty): @@ -59,6 +60,8 @@ def _log2(x): def _legalize_vector_type(vt, vlen): """Mirror legalizeVectorType: return (n, legal_vector_type) or (0, None).""" + if len(vt.shape) != 1: # C++ guards getRank() != 1 + return 0, None elt_ty = vt.element_type sew = _sew(elt_ty) if sew == 0: @@ -337,6 +340,12 @@ def _lower_matmul(op, SS, vlen): mtA, mtB = ir.MemRefType(A.type), ir.MemRefType(B.type) elt = mtA.element_type M, K, N = mtA.shape[0], mtA.shape[1], mtB.shape[1] + # Mirror the C++ guard: a dimension > SS must be an exact multiple, else the + # N//SS / K//SS loop trip counts below silently drop the tail tile. + for _dim, _name in ((M, "M"), (N, "N"), (K, "K")): + if _dim > SS and _dim % SS != 0: + raise NotImplementedError( + f"matmul {_name}={_dim} must be a multiple of systolic size {SS} when > {SS}") elen = _elt_bits(elt) nr_element = vlen // elen i64 = ir.IntegerType.get_signless(64) @@ -364,6 +373,7 @@ def a64(v): return ir.IntegerAttr.get(i64, v) AAsync = BAsync = BiasAsync = 0 BiasIdx = None subtileM, subtileN, subtileK = M, N, K + a_subk = b_subk = None for o in _iter_ops(outer[-1].regions[0].blocks[0]): if o.operation.name != "memref.dma_start": continue @@ -382,15 +392,21 @@ def a64(v): return ir.IntegerAttr.get(i64, v) ATag, AAsync = d.tag, d.is_async() if len(sub) >= 2: subtileM, subtileK = sub[-2], sub[-1] + a_subk = sub[-1] elif argn == idxMap[1]: BTag, BAsync = d.tag, d.is_async() if len(sub) >= 2: subtileK, subtileN = sub[-2], sub[-1] + b_subk = sub[-2] elif argn == idxMap[2]: BiasTag, BiasAsync = d.tag, d.is_async() BiasIdx = d.tag_idx if ATag is None or BTag is None: return False + # A and B must agree on the K subtile (last-writer-wins would otherwise pick one silently). + if a_subk is not None and b_subk is not None and a_subk != b_subk: + raise NotImplementedError( + f"Mismatched subtile K between A ({a_subk}) and B ({b_subk}) matmul operands") KStep = subtileK push_length = min(subtileM, SS) From 4cf46d65dd3cbe22777e3afcee10a3b7f7248c72 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 18 Jun 2026 16:10:40 +0900 Subject: [PATCH 3/3] [Frontend] vcix: lower fused matmul whose operand is vector_store'd, fix exp chunk Two fixes to the C++->Python vcix port (lower_to_vcix.py) that SDPA exercises but the gemm/bmm/conv tests do not: - _lower_matmul bailed with 'if ATag is None or BTag is None: return False', gating on an MVIN dma_start tag for both operands. In SDPA's fused scores.V matmul, operand B is the softmax output produced in place by affine.vector_store, not DMAed, so BTag stayed None and the matmul was left un-lowered -> wrong attention output. Mirror the C++ MatmulOpLowering: an operand is initialized by either a dma_start OR a preceding affine.vector_store into its root memref; bail only when an operand is truly uninitialized. BTag/BAsync stay None/0 and are only read under 'if BAsync:', so the B dma_wait is correctly skipped (as in C++). - _make_sf_vc_v_iv n>1 transcendental chunking called vector.ExtractStridedSliceOp(offsets, sizes, strides, vec) -- wrong arg order, missing the result type and vector operand, raising TypeError under these MLIR bindings. Pass (result=legal_ty, vector=vec, offsets, sizes, strides). Only reached by large transcendentals (n>1), e.g. SDPA softmax exp, so CI's small-tile (n==1) tests never hit it. Validated end-to-end (Spike+TOGSim allclose): SDPA 56 cases pass (was crash/wrong); matmul/bmm/conv2d regress clean. Bisected: C++ vcix passes SDPA, Python vcix did not; exp chunking and fine-grained ruled out separately. Co-Authored-By: Claude Opus 4.8 --- .../mlir/passes/lower_to_vcix.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py index 1aa31e96..404a8ab4 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py +++ b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py @@ -116,9 +116,10 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): else: for i in range(total // elt_count): ext = vector.ExtractStridedSliceOp( + legal_ty, vec, ir.ArrayAttr.get([_i64(i * elt_count)]), ir.ArrayAttr.get([_i64(elt_count)]), - ir.ArrayAttr.get([_i64(1)]), vec).result + ir.ArrayAttr.get([_i64(1)])).result v = _viv(ext, legal_ty, opcode, imm, rvl) res = vector.InsertStridedSliceOp( v, res, ir.ArrayAttr.get([_i64(i * elt_count)]), @@ -374,7 +375,28 @@ def a64(v): return ir.IntegerAttr.get(i64, v) BiasIdx = None subtileM, subtileN, subtileK = M, N, K a_subk = b_subk = None + # Mirror the C++ isAInitialized / isBInitialized flags: an operand is + # "initialized" either by an MVIN dma_start (tag found below) or by a + # preceding affine.vector_store into its root memref (the fused case, e.g. + # SDPA scores.V where B is the softmax output produced in-place, not DMAed). + isAInit = isBInit = False + + def _root(v): + owner = v.owner + if not isinstance(owner, ir.Block): + nm = owner.name + if nm in ("memref.reinterpret_cast", "memref.cast"): + return owner.operands[0] + return v + rootA, rootB = _root(A), _root(B) for o in _iter_ops(outer[-1].regions[0].blocks[0]): + if o.operation.name == "affine.vector_store": + dest = _root(o.operation.operands[1]) + if dest == rootA: + isAInit = True + elif dest == rootB: + isBInit = True + continue if o.operation.name != "memref.dma_start": continue d = _DmaView(o.operation) @@ -390,18 +412,20 @@ def a64(v): return ir.IntegerAttr.get(i64, v) sub = d.subtile_size() if argn == idxMap[0]: ATag, AAsync = d.tag, d.is_async() + isAInit = True if len(sub) >= 2: subtileM, subtileK = sub[-2], sub[-1] a_subk = sub[-1] elif argn == idxMap[1]: BTag, BAsync = d.tag, d.is_async() + isBInit = True if len(sub) >= 2: subtileK, subtileN = sub[-2], sub[-1] b_subk = sub[-2] elif argn == idxMap[2]: BiasTag, BiasAsync = d.tag, d.is_async() BiasIdx = d.tag_idx - if ATag is None or BTag is None: + if not isAInit or not isBInit: return False # A and B must agree on the K subtile (last-writer-wins would otherwise pick one silently). if a_subk is not None and b_subk is not None and a_subk != b_subk: