From 8e23d63cafcc24d6d5f11bbccc135e4f4ba482d7 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Thu, 18 Jun 2026 00:06:05 -0500
Subject: [PATCH 01/15] Add tuned HIP GiMMiK preload variants

---
 gimmik/hip.py                                 | 113 ++++++++++++-
 gimmik/kernels/hip/base.mako                  |  32 ++++
 .../kernels/hip/bstream-msplit-preload-c.mako |  98 +++++++++++
 .../hip/bstream-msplit-width-preload-c.mako   | 138 +++++++++++++++
 gimmik/kernels/hip/bstream-msplit.mako        |  10 +-
 gimmik/kernels/hip/bstream-preload-c.mako     |  63 +++++++
 .../kernels/hip/bstream-width-preload-c.mako  | 103 ++++++++++++
 gimmik/kernels/hip/bstream.mako               |  12 +-
 .../kernels/hip/cstream-ksplit-preload-c.mako | 103 ++++++++++++
 .../hip/cstream-ksplit-width-preload-c.mako   | 157 ++++++++++++++++++
 gimmik/kernels/hip/cstream-ksplit.mako        |   6 +-
 gimmik/kernels/hip/cstream-preload-c.mako     |  51 ++++++
 .../kernels/hip/cstream-width-preload-c.mako  | 106 ++++++++++++
 gimmik/kernels/hip/cstream.mako               |   8 +-
 14 files changed, 978 insertions(+), 22 deletions(-)
 create mode 100644 gimmik/kernels/hip/bstream-msplit-preload-c.mako
 create mode 100644 gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
 create mode 100644 gimmik/kernels/hip/bstream-preload-c.mako
 create mode 100644 gimmik/kernels/hip/bstream-width-preload-c.mako
 create mode 100644 gimmik/kernels/hip/cstream-ksplit-preload-c.mako
 create mode 100644 gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
 create mode 100644 gimmik/kernels/hip/cstream-preload-c.mako
 create mode 100644 gimmik/kernels/hip/cstream-width-preload-c.mako

diff --git a/gimmik/hip.py b/gimmik/hip.py
index a58c8fa..ee38c9f 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -8,23 +8,128 @@ class HIPMatMul(MatMul):
     basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0}
 
     def _kernel_generators(self, dtype, dsize, *, gcn_arch=None, warp_size=64):
+        max_block_threads = 1024
+        max_shared = 64*1024
+
+        def emit(name, args, meta):
+            block = meta.get('block', self.basemeta['block'])
+            shared = meta.get('shared', self.basemeta['shared'])
+            threads = block[0]*block[1]*block[2]
+
+            if threads <= max_block_threads and shared <= max_shared:
+                yield (name, args, meta)
+
         # B loading, C streaming kernel
-        yield ('cstream', {}, {})
+        yield from emit('cstream', {}, {})
 
         # B streaming, C accumulation kernel
-        yield ('bstream', {}, {})
+        yield from emit('bstream', {}, {})
 
         # Four-way m-split B streaming, C accumulation kernel
         ms, bsz, blkx = 4, 24, 64
         args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
         meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize}
-        yield ('bstream-msplit', args, meta)
+        yield from emit('bstream-msplit', args, meta)
 
         # Two-way k-split B loading, C streaming kernel
         ks, csz, blkx = 2, 24, 64
         args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
-        yield ('cstream-ksplit', args, meta)
+        yield from emit('cstream-ksplit', args, meta)
+
+        # Tuned HIP variants
+        msplits, ksplits = [4, 8], [2, 4]
+        bsz, csz, blkx = 8, 8, 64
+        width = 2 if self.aligne is not None and self.aligne % 2 == 0 else 1
+
+        # B loading, C streaming kernel
+        args = {'blockx': blkx}
+        meta = {'block': (blkx, 1, 1), 'desc': f'cstream/x{blkx}'}
+        yield from emit('cstream', args, meta)
+
+        # B streaming, C accumulation kernel
+        meta = {'block': (blkx, 1, 1), 'desc': f'bstream/x{blkx}'}
+        yield from emit('bstream', args, meta)
+
+        for ms in msplits:
+            # m-split B streaming, C accumulation kernel
+            args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
+            shared = 2*bsz*blkx*dsize
+            meta = {'block': (blkx, ms, 1), 'shared': shared,
+                    'desc': f'bstream-msplit/m{ms}-b{bsz}-x{blkx}'}
+            yield from emit('bstream-msplit', args, meta)
+
+        for ks in ksplits:
+            # k-split B loading, C streaming kernel
+            args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
+            shared = (ks - 1)*csz*blkx*dsize
+            meta = {'block': (blkx, ks, 1), 'shared': shared,
+                    'desc': f'cstream-ksplit/k{ks}-c{csz}-x{blkx}'}
+            yield from emit('cstream-ksplit', args, meta)
+
+        # B loading, C preloading, C streaming kernel
+        args = {'blockx': blkx}
+        meta = {'block': (blkx, 1, 1), 'desc': f'cstream-preload-c/x{blkx}'}
+        yield from emit('cstream-preload-c', args, meta)
+
+        # B streaming, C preloading, C accumulation kernel
+        meta = {'block': (blkx, 1, 1), 'desc': f'bstream-preload-c/x{blkx}'}
+        yield from emit('bstream-preload-c', args, meta)
+
+        if width > 1:
+            args = {'dtype': f'{dtype}{width}', 'width': width,
+                    'blockx': blkx}
+            meta = {'block': (blkx, 1, 1), 'width': width,
+                    'desc': f'cstream-width-preload-c/w{width}-x{blkx}'}
+            yield from emit('cstream-width-preload-c', args, meta)
+
+            meta = {'block': (blkx, 1, 1), 'width': width,
+                    'desc': f'bstream-width-preload-c/w{width}-x{blkx}'}
+            yield from emit('bstream-width-preload-c', args, meta)
+
+        for ms in msplits:
+            # m-split B streaming, C preloading, C accumulation kernel
+            args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
+            shared = 2*bsz*blkx*dsize
+            meta = {'block': (blkx, ms, 1), 'shared': shared,
+                    'desc': f'bstream-msplit-preload-c/m{ms}-b{bsz}-x{blkx}'}
+            yield from emit('bstream-msplit-preload-c', args, meta)
+
+            if width > 1:
+                args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx,
+                        'dtype': f'{dtype}{width}', 'width': width}
+                meta = {
+                    'block': (blkx, ms, 1), 'shared': shared*width,
+                    'width': width,
+                    'desc': (
+                        f'bstream-msplit-width-preload-c/w{width}-'
+                        f'm{ms}-b{bsz}-x{blkx}'
+                    )
+                }
+                yield from emit('bstream-msplit-width-preload-c', args, meta)
+
+        for ks in ksplits:
+            # k-split B loading, C preloading, C streaming kernel
+            args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
+            shared = (ks - 1)*csz*blkx*dsize
+            meta = {
+                'block': (blkx, ks, 1), 'shared': shared,
+                'desc': f'cstream-ksplit-preload-c/k{ks}-c{csz}-x{blkx}'
+            }
+            yield from emit('cstream-ksplit-preload-c', args, meta)
+
+            if width > 1:
+                args = {'ksplit': ks, 'csz': csz, 'blockx': blkx,
+                        'dtype': f'{dtype}{width}', 'width': width}
+                meta = {
+                    'block': (blkx, ks, 1), 'shared': shared*width,
+                    'width': width,
+                    'desc': (
+                        f'cstream-ksplit-width-preload-c/w{width}-'
+                        f'k{ks}-c{csz}-x{blkx}'
+                    )
+                }
+                yield from emit('cstream-ksplit-width-preload-c', args, meta)
 
     def _process_meta(self, meta):
         if self.n is not None:
diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako
index 874fbbd..b40a0b9 100644
--- a/gimmik/kernels/hip/base.mako
+++ b/gimmik/kernels/hip/base.mako
@@ -9,4 +9,36 @@ static inline __device__ ${dtype} make_zero()
 { return 0; }
 % endif
 
+static inline __device__ void
+nt_store_c(${dtype}* p, ${dtype} v)
+{
+% if dtype.endswith('4'):
+    __builtin_nontemporal_store(v.x, &p->x);
+    __builtin_nontemporal_store(v.y, &p->y);
+    __builtin_nontemporal_store(v.z, &p->z);
+    __builtin_nontemporal_store(v.w, &p->w);
+% elif dtype.endswith('2'):
+    __builtin_nontemporal_store(v.x, &p->x);
+    __builtin_nontemporal_store(v.y, &p->y);
+% else:
+    __builtin_nontemporal_store(v, p);
+% endif
+}
+
+static inline __device__ ${dtype}
+nt_load_c(const ${dtype}* p)
+{
+% if dtype.endswith('4'):
+    return make_${dtype}(__builtin_nontemporal_load(&p->x),
+                         __builtin_nontemporal_load(&p->y),
+                         __builtin_nontemporal_load(&p->z),
+                         __builtin_nontemporal_load(&p->w));
+% elif dtype.endswith('2'):
+    return make_${dtype}(__builtin_nontemporal_load(&p->x),
+                         __builtin_nontemporal_load(&p->y));
+% else:
+    return __builtin_nontemporal_load(p);
+% endif
+}
+
 ${next.body()}
diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
new file mode 100644
index 0000000..8b6f008
--- /dev/null
+++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
@@ -0,0 +1,98 @@
+<%inherit file='base'/>
+
+<%
+mx = partition(A, into=msplit, by='rows')
+bchunks = chunk(bix, bsz)
+%>
+
+__global__ __launch_bounds__(${blockx*msplit}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = ((n + ${width} - 1) / ${width}) * ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    ${dtype} bv, csub[${-(-m // msplit)}];
+    __shared__ ${dtype} bsub[2][${bsz}][${blockx}];
+
+## Fill the initial shared memory block
+% for cid in range(msplit):
+    if (i < n && threadIdx.y == ${cid})
+    {
+  % for kx in bchunks[0]:
+    % if loop.index % msplit == cid:
+        bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+    % endif
+  % endfor
+
+  ## Preload C values for active rows owned by this m-split lane
+  % for j, jx in enumerate(mx[cid]):
+    % if afix[jx] != -1:
+      % if beta == 0:
+        csub[${j}] = make_zero();
+      % elif beta == 1:
+        csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]);
+      % else:
+        csub[${j}] = ${beta}*nt_load_c(&c[i + ${jx}*ldc]);
+      % endif
+    % endif
+  % endfor
+    }
+% endfor
+    __syncthreads();
+
+## Iterate over each row-chunk of B
+% for bb in range(len(bchunks)):
+  ## Iterate over each row-chunk of C
+  % for cid, mcx in enumerate(mx):
+    if (i < n && threadIdx.y == ${cid})
+    {
+    ## Start filling the next shared memory block
+    % if not loop.parent.last:
+      % for kx in bchunks[bb + 1]:
+        % if loop.index % msplit == cid:
+        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+        % endif
+      % endfor
+    % endif
+    ## Accumulate our dot products
+    % for kx in bchunks[bb]:
+        bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
+      % for j, jx in enumerate(A[mcx, kx]):
+        % if jx != 0:
+        csub[${j}] += ${jx}*bv;
+        % endif
+        ## If we're done with this dot product then store to global
+        % if kx == alix[mcx[j]]:
+        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
+        % endif
+      % endfor
+    % endfor
+    ## Handle rows of A which are all zero
+    % if loop.parent.last:
+      % for j, jx in enumerate(afix):
+        % if jx == -1 and j % msplit == cid and beta == 0:
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        % elif jx == -1 and j % msplit == cid and beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        % endif
+      % endfor
+    % endif
+    }
+  % endfor
+    __syncthreads();
+% endfor
+}
diff --git a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
new file mode 100644
index 0000000..9659db7
--- /dev/null
+++ b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
@@ -0,0 +1,138 @@
+<%inherit file='base'/>
+
+% if width == 2:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
+}
+% elif width == 4:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
+}
+% else:
+#error "bstream_msplit_width_preload_c only supports width=2 or width=4"
+% endif
+
+<%
+mx = partition(A, into=msplit, by='rows')
+bchunks = chunk(bix, bsz)
+%>
+
+__global__ __launch_bounds__(${blockx*msplit}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = (n + ${width} - 1) / ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    ${dtype} bv, csub[${-(-m // msplit)}];
+    __shared__ ${dtype} bsub[2][${bsz}][${blockx}];
+
+## Fill the initial shared memory block
+% for cid in range(msplit):
+    if (i < n && threadIdx.y == ${cid})
+    {
+  % for kx in bchunks[0]:
+    % if loop.index % msplit == cid:
+        bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+    % endif
+  % endfor
+
+  ## Preload C values for active rows owned by this m-split lane
+  % for j, jx in enumerate(mx[cid]):
+    % if afix[jx] != -1:
+      % if beta == 0:
+        csub[${j}] = make_zero();
+      % elif beta == 1:
+        csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]);
+      % else:
+        csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${jx}*ldc]));
+      % endif
+    % endif
+  % endfor
+    }
+% endfor
+    __syncthreads();
+
+## Iterate over each row-chunk of B
+% for bb in range(len(bchunks)):
+  ## Iterate over each row-chunk of C
+  % for cid, mcx in enumerate(mx):
+    if (i < n && threadIdx.y == ${cid})
+    {
+    ## Start filling the next shared memory block
+    % if not loop.parent.last:
+      % for kx in bchunks[bb + 1]:
+        % if loop.index % msplit == cid:
+        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+        % endif
+      % endfor
+    % endif
+    ## Accumulate our dot products
+    % for kx in bchunks[bb]:
+        bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
+      % for j, jx in enumerate(A[mcx, kx]):
+        % if jx != 0:
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        % endif
+        ## If we're done with this dot product then store to global
+        % if kx == alix[mcx[j]]:
+        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
+        % endif
+      % endfor
+    % endfor
+    ## Handle rows of A which are all zero
+    % if loop.parent.last:
+      % for j, jx in enumerate(afix):
+        % if jx == -1 and j % msplit == cid and beta == 0:
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        % elif jx == -1 and j % msplit == cid and beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
+        % endif
+      % endfor
+    % endif
+    }
+  % endfor
+    __syncthreads();
+% endfor
+}
diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako
index 6359ca1..6470477 100644
--- a/gimmik/kernels/hip/bstream-msplit.mako
+++ b/gimmik/kernels/hip/bstream-msplit.mako
@@ -66,11 +66,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         % endif
         ## If we're done with this dot product then store to global
         % if kx == alix[mcx[j]] and beta == 0:
-        c[i + ${mcx[j]}*ldc] = csub[${j}];
+        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
         % elif kx == alix[mcx[j]] and beta == 1:
-        c[i + ${mcx[j]}*ldc] += csub[${j}];
+        nt_store_c(&c[i + ${mcx[j]}*ldc], nt_load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]);
         % elif kx == alix[mcx[j]]:
-        c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc];
+        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${mcx[j]}*ldc]));
         % endif
       % endfor
     % endfor
@@ -78,9 +78,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if loop.parent.last:
       % for j, jx in enumerate(afix):
         % if jx == -1 and j % msplit == cid and beta == 0:
-        c[i + ${j}*ldc] = make_zero();
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
         % elif jx == -1 and j % msplit == cid and beta != 1:
-        c[i + ${j}*ldc] *= ${beta};
+        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta});
         % endif
       % endfor
     % endif
diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako
new file mode 100644
index 0000000..30b08f6
--- /dev/null
+++ b/gimmik/kernels/hip/bstream-preload-c.mako
@@ -0,0 +1,63 @@
+<%inherit file='base'/>
+
+__global__ __launch_bounds__(${blockx}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = ((n + ${width} - 1) / ${width}) * ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i < n)
+    {
+        ${dtype} bv, csub[${m}];
+
+## Preload C values for rows which will receive a non-zero dot product
+% for j, jx in enumerate(afix):
+  % if jx != -1:
+    % if beta == 0:
+        csub[${j}] = make_zero();
+    % elif beta == 1:
+        csub[${j}] = nt_load_c(&c[i + ${j}*ldc]);
+    % else:
+        csub[${j}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]);
+    % endif
+  % endif
+% endfor
+
+## Iterate through the used rows of B
+% for kx in bix:
+        bv = b[i + ${kx}*ldb];
+  % for j, jx in enumerate(A[:, kx]):
+    % if jx != 0:
+        csub[${j}] += ${jx}*bv;
+    % endif
+    ##
+    % if kx == alix[j]:
+        nt_store_c(&c[i + ${j}*ldc], csub[${j}]);
+    % endif
+  % endfor
+% endfor
+
+## Handle rows of A which are all zero
+% for j, jx in enumerate(afix):
+  % if jx == -1 and beta == 0:
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
+  % elif jx == -1 and beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+  % endif
+% endfor
+    }
+}
diff --git a/gimmik/kernels/hip/bstream-width-preload-c.mako b/gimmik/kernels/hip/bstream-width-preload-c.mako
new file mode 100644
index 0000000..97a7571
--- /dev/null
+++ b/gimmik/kernels/hip/bstream-width-preload-c.mako
@@ -0,0 +1,103 @@
+<%inherit file='base'/>
+
+% if width == 2:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
+}
+% elif width == 4:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
+}
+% else:
+#error "bstream_width_preload_c only supports width=2 or width=4"
+% endif
+
+__global__ __launch_bounds__(${blockx}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = (n + ${width} - 1) / ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i < n)
+    {
+        ${dtype} bv, csub[${m}];
+
+## Preload C values for rows which will receive a non-zero dot product
+% for j, jx in enumerate(afix):
+  % if jx != -1:
+    % if beta == 0:
+        csub[${j}] = make_zero();
+    % elif beta == 1:
+        csub[${j}] = nt_load_c(&c[i + ${j}*ldc]);
+    % else:
+        csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]));
+    % endif
+  % endif
+% endfor
+
+## Iterate through the used rows of B
+% for kx in bix:
+        bv = b[i + ${kx}*ldb];
+  % for j, jx in enumerate(A[:, kx]):
+    % if jx != 0:
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+    % endif
+    ##
+    % if kx == alix[j]:
+        nt_store_c(&c[i + ${j}*ldc], csub[${j}]);
+    % endif
+  % endfor
+% endfor
+
+## Handle rows of A which are all zero
+% for j, jx in enumerate(afix):
+  % if jx == -1 and beta == 0:
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
+  % elif jx == -1 and beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
+  % endif
+% endfor
+    }
+}
diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako
index 2f6dc62..9634c73 100644
--- a/gimmik/kernels/hip/bstream.mako
+++ b/gimmik/kernels/hip/bstream.mako
@@ -1,6 +1,6 @@
 <%inherit file='base'/>
 
-__global__ __launch_bounds__(128) void
+__global__ __launch_bounds__(${blockx}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,
@@ -35,11 +35,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % endif
     ##
     % if kx == alix[j] and beta == 0:
-        c[i + ${j}*ldc] = csub[${j}];
+        nt_store_c(&c[i + ${j}*ldc], csub[${j}]);
     % elif kx == alix[j] and beta == 1:
-        c[i + ${j}*ldc] += csub[${j}];
+        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + csub[${j}]);
     % elif kx == alix[j]:
-        c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc];
+        nt_store_c(&c[i + ${j}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${j}*ldc]));
     % endif
   % endfor
 % endfor
@@ -47,9 +47,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 ## Handle rows of A which are all zero
 % for j, jx in enumerate(afix):
   % if jx == -1 and beta == 0:
-        c[i + ${j}*ldc] = make_zero();
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
   % elif jx == -1 and beta != 1:
-        c[i + ${j}*ldc] *= ${beta};
+        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta});
   % endif
 % endfor
     }
diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
new file mode 100644
index 0000000..507c34f
--- /dev/null
+++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
@@ -0,0 +1,103 @@
+<%inherit file='base'/>
+
+<%
+kparts = partition(A, ksplit, by='cols')
+cchunks = chunk(range(m), csz)
+loaded = set()
+%>
+
+__global__ __launch_bounds__(${blockx*ksplit}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = ((n + ${width} - 1) / ${width}) * ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
+    __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
+
+## Iterate over the row-partitions of C
+% for cchunk in cchunks:
+  ## Iterate over the row-partitions of B
+  % for bid, kbx in enumerate(kparts):
+    if (i < n && threadIdx.y == ${bid})
+    {
+    ## Evaluate our partial dot products
+    % for j in cchunk:
+      ## Load in any missing parts of B
+      % for kx in kbx:
+        % if A[j, kx] != 0 and kx not in loaded:
+        bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
+        % endif
+      % endfor
+      <%
+      dotex = dot(lambda kx: f'bv[{kx}]', A[j, kbx])
+      has_dotp = any(A[j, kx] != 0 for kx in range(k))
+      %>
+      % if dotex != '0.0':
+        dotp = ${dotex};
+      % else:
+        dotp = make_zero();
+      % endif
+      ## Save to a register
+      % if loop.index % ksplit == bid:
+        % if beta == 0:
+        cv[${loop.index // ksplit}] = dotp;
+        % elif beta == 1 and has_dotp:
+        cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] += dotp;
+        % elif has_dotp:
+        cv[${loop.index // ksplit}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] += dotp;
+        % endif
+      ## Save to shared memory
+      % else:
+        csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
+      % endif
+    % endfor
+    }
+  % endfor
+    __syncthreads();
+  ## Iterate over the column-partitions of B
+  % for bid, kbx in enumerate(kparts):
+    if (i < n && threadIdx.y == ${bid})
+    {
+    ## Sum and output the final set of dot products
+    % for j in cchunk:
+      % if loop.index % ksplit == bid:
+        <% has_dotp = any(A[j, kx] != 0 for kx in range(k)) %>
+        % if beta == 0:
+        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
+                                                          for i in range(ksplit - 1))};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+        % elif beta == 1 and has_dotp:
+        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
+                                                          for i in range(ksplit - 1))};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+        % elif beta != 1 and has_dotp:
+        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
+                                                          for i in range(ksplit - 1))};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+        % elif beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        % endif
+      % endif
+    % endfor
+    }
+  % endfor
+    __syncthreads();
+% endfor
+}
diff --git a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
new file mode 100644
index 0000000..b435913
--- /dev/null
+++ b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
@@ -0,0 +1,157 @@
+<%inherit file='base'/>
+
+% if width == 2:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
+}
+% elif width == 4:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
+}
+% else:
+#error "cstream_ksplit_width_preload_c only supports width=2 or width=4"
+% endif
+
+<%
+kparts = partition(A, ksplit, by='cols')
+cchunks = chunk(range(m), csz)
+loaded = set()
+%>
+
+__global__ __launch_bounds__(${blockx*ksplit}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = (n + ${width} - 1) / ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
+    __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
+
+## Iterate over the row-partitions of C
+% for cchunk in cchunks:
+  ## Iterate over the column-partitions of B
+  % for bid, kbx in enumerate(kparts):
+    if (i < n && threadIdx.y == ${bid})
+    {
+    ## Evaluate our partial dot products
+    % for j in cchunk:
+      ## Load in any missing parts of B
+      % for kx in kbx:
+        % if A[j, kx] != 0 and kx not in loaded:
+        bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
+        % endif
+      % endfor
+
+      ## Expand vectorized partial dot product
+      <%
+      nzixs = []
+      for l_idx, kx in enumerate(kbx):
+          if A[j, kx] != 0:
+              nzixs.append((l_idx, kx))
+
+      has_dotp = any(A[j, kx] != 0 for kx in range(k))
+      if not nzixs:
+          dotex = 'make_zero()'
+      else:
+          first_l_idx, first_kx = nzixs[0]
+          dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])"
+          for l_idx, kx in nzixs[1:]:
+              dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])"
+      %>
+      dotp = ${dotex};
+
+      ## Save to a register
+      % if loop.index % ksplit == bid:
+        % if beta == 0:
+        cv[${loop.index // ksplit}] = dotp;
+        % elif beta == 1 and has_dotp:
+        cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
+        % elif has_dotp:
+        cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]));
+        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
+        % endif
+      ## Save to shared memory
+      % else:
+        csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
+      % endif
+    % endfor
+    }
+  % endfor
+    __syncthreads();
+    
+  ## Sum and output the final set of dot products
+  % for bid, kbx in enumerate(kparts):
+    if (i < n && threadIdx.y == ${bid})
+    {
+    % for j in cchunk:
+      % if loop.index % ksplit == bid:
+        <%
+        has_dotp = any(A[j, kx] != 0 for kx in range(k))
+        sum_expr = f"cv[{loop.index // ksplit}]"
+        for s_idx in range(ksplit - 1):
+            sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
+        %>
+        % if beta == 0:
+        dotp = ${sum_expr};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+        % elif beta == 1 and has_dotp:
+        dotp = ${sum_expr};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+        % elif beta != 1 and has_dotp:
+        dotp = ${sum_expr};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+        % elif beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
+        % endif
+      % endif
+    % endfor
+    }
+  % endfor
+    __syncthreads();
+% endfor
+}
diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako
index bae2d2a..6fd3210 100644
--- a/gimmik/kernels/hip/cstream-ksplit.mako
+++ b/gimmik/kernels/hip/cstream-ksplit.mako
@@ -69,11 +69,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
                                                           for i in range(ksplit - 1))};
         % if beta == 0:
-        c[i + ${j}*ldc] = dotp;
+        nt_store_c(&c[i + ${j}*ldc], dotp);
         % elif beta == 1:
-        c[i + ${j}*ldc] += dotp;
+        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp);
         % else:
-        c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
+        nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc]));
         % endif
       % endif
     % endfor
diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako
new file mode 100644
index 0000000..041e674
--- /dev/null
+++ b/gimmik/kernels/hip/cstream-preload-c.mako
@@ -0,0 +1,51 @@
+<%inherit file='base'/>
+
+<% ksplit = 2 if m < 36 else 1 %>
+
+__global__ __launch_bounds__(128) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = ((n + ${width} - 1) / ${width}) * ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    ${dtype} dotp;
+
+    if (i < n)
+    {
+% for j, jx in enumerate(A):
+  % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0':
+    % if beta == 0:
+        dotp = ${dotex};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+    % elif beta == 1:
+        dotp = nt_load_c(&c[i + ${j}*ldc]);
+        dotp += ${dotex};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+    % else:
+        dotp = ${beta}*nt_load_c(&c[i + ${j}*ldc]);
+        dotp += ${dotex};
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+    % endif
+  % else:
+    % if beta == 0:
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
+    % elif beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+    % endif
+  % endif
+% endfor
+    }
+}
diff --git a/gimmik/kernels/hip/cstream-width-preload-c.mako b/gimmik/kernels/hip/cstream-width-preload-c.mako
new file mode 100644
index 0000000..9f2f57c
--- /dev/null
+++ b/gimmik/kernels/hip/cstream-width-preload-c.mako
@@ -0,0 +1,106 @@
+<%inherit file='base'/>
+
+% if width == 2:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
+}
+% elif width == 4:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
+}
+% else:
+#error "cstream_width_preload_c only supports width=2 or width=4"
+% endif
+
+__global__ __launch_bounds__(${blockx}) void
+% if n is None:
+${kname}(int n,
+         const ${dtype}* __restrict__ b, int ldb,
+         ${dtype}* __restrict__ c, int ldc)
+{
+  % if width > 1:
+    n = (n + ${width} - 1) / ${width};
+    ldb /= ${width};
+    ldc /= ${width};
+  % endif
+% else:
+${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
+{
+    const int n = ${-(-n // width)};
+    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
+    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
+% endif
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    ${dtype} bv, dotp;
+
+    if (i < n)
+    {
+% for j, row in enumerate(A):
+  <%
+  nzixs = [kx for kx, val in enumerate(row) if val != 0]
+  %>
+  % if nzixs:
+    % if beta == 0:
+      <% first_kx = nzixs[0] %>
+        bv = b[i + ${first_kx}*ldb];
+        dotp = gimmik_vmul(${row[first_kx]}, bv);
+      % for kx in nzixs[1:]:
+        bv = b[i + ${kx}*ldb];
+        dotp = gimmik_vmadd(dotp, ${row[kx]}, bv);
+      % endfor
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+    % elif beta == 1:
+        dotp = nt_load_c(&c[i + ${j}*ldc]);
+      % for kx in nzixs:
+        bv = b[i + ${kx}*ldb];
+        dotp = gimmik_vmadd(dotp, ${row[kx]}, bv);
+      % endfor
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+    % else:
+        dotp = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]));
+      % for kx in nzixs:
+        bv = b[i + ${kx}*ldb];
+        dotp = gimmik_vmadd(dotp, ${row[kx]}, bv);
+      % endfor
+        nt_store_c(&c[i + ${j}*ldc], dotp);
+    % endif
+  % else:
+    % if beta == 0:
+        nt_store_c(&c[i + ${j}*ldc], make_zero());
+    % elif beta != 1:
+        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
+    % endif
+  % endif
+% endfor
+    }
+}
diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako
index f75301d..0651e87 100644
--- a/gimmik/kernels/hip/cstream.mako
+++ b/gimmik/kernels/hip/cstream.mako
@@ -2,7 +2,7 @@
 
 <% ksplit = 2 if m < 36 else 1 %>
 
-__global__ __launch_bounds__(128) void
+__global__ __launch_bounds__(${blockx}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,
@@ -32,11 +32,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         dotp = make_zero();
   % endif
   % if beta == 0:
-        c[i + ${j}*ldc] = dotp;
+        nt_store_c(&c[i + ${j}*ldc], dotp);
   % elif beta == 1 and dotex != '0.0':
-        c[i + ${j}*ldc] += dotp;
+        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp);
   % else:
-        c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
+        nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc]));
   % endif
 % endfor
     }

From 8f4d03ee5b2c6e00a1bb7872bca64f0fae03d9d4 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Thu, 18 Jun 2026 00:12:02 -0500
Subject: [PATCH 02/15] Fix HIP GiMMiK block size metadata

---
 gimmik/hip.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index ee38c9f..920cc0b 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -19,11 +19,13 @@ def emit(name, args, meta):
             if threads <= max_block_threads and shared <= max_shared:
                 yield (name, args, meta)
 
+        blkx = self.basemeta['block'][0]
+
         # B loading, C streaming kernel
-        yield from emit('cstream', {}, {})
+        yield from emit('cstream', {'blockx': blkx}, {})
 
         # B streaming, C accumulation kernel
-        yield from emit('bstream', {}, {})
+        yield from emit('bstream', {'blockx': blkx}, {})
 
         # Four-way m-split B streaming, C accumulation kernel
         ms, bsz, blkx = 4, 24, 64

From 96671a6364f8ccbac0d14d89364cda13f5223952 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Sun, 21 Jun 2026 23:18:07 -0500
Subject: [PATCH 03/15] Address HIP GiMMiK review comments

---
 gimmik/hip.py                                 |  4 ++
 .../hip/bstream-msplit-width-preload-c.mako   | 40 +--------------
 .../kernels/hip/bstream-width-preload-c.mako  | 40 +--------------
 .../kernels/hip/cstream-ksplit-preload-c.mako |  4 +-
 .../hip/cstream-ksplit-width-preload-c.mako   | 51 ++-----------------
 .../kernels/hip/cstream-width-preload-c.mako  | 42 +--------------
 gimmik/kernels/hip/vector.mako                | 41 +++++++++++++++
 7 files changed, 54 insertions(+), 168 deletions(-)
 create mode 100644 gimmik/kernels/hip/vector.mako

diff --git a/gimmik/hip.py b/gimmik/hip.py
index 920cc0b..cea1be5 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -39,6 +39,10 @@ def emit(name, args, meta):
         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
         yield from emit('cstream-ksplit', args, meta)
 
+        # Only emit tuned variants on the architecture they were tuned for.
+        if gcn_arch != 'gfx942' or warp_size != 64:
+            return
+
         # Tuned HIP variants
         msplits, ksplits = [4, 8], [2, 4]
         bsz, csz, blkx = 8, 8, 64
diff --git a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
index 9659db7..4466f9f 100644
--- a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
@@ -1,44 +1,6 @@
 <%inherit file='base'/>
 
-% if width == 2:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
-}
-% elif width == 4:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
-}
-% else:
-#error "bstream_msplit_width_preload_c only supports width=2 or width=4"
-% endif
+<%include file='vector'/>
 
 <%
 mx = partition(A, into=msplit, by='rows')
diff --git a/gimmik/kernels/hip/bstream-width-preload-c.mako b/gimmik/kernels/hip/bstream-width-preload-c.mako
index 97a7571..2c4e5c5 100644
--- a/gimmik/kernels/hip/bstream-width-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-width-preload-c.mako
@@ -1,44 +1,6 @@
 <%inherit file='base'/>
 
-% if width == 2:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
-}
-% elif width == 4:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
-}
-% else:
-#error "bstream_width_preload_c only supports width=2 or width=4"
-% endif
+<%include file='vector'/>
 
 __global__ __launch_bounds__(${blockx}) void
 % if n is None:
diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
index 507c34f..51f1db4 100644
--- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
@@ -45,7 +45,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       % endfor
       <%
       dotex = dot(lambda kx: f'bv[{kx}]', A[j, kbx])
-      has_dotp = any(A[j, kx] != 0 for kx in range(k))
+      has_dotp = A[j].any()
       %>
       % if dotex != '0.0':
         dotp = ${dotex};
@@ -78,7 +78,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     ## Sum and output the final set of dot products
     % for j in cchunk:
       % if loop.index % ksplit == bid:
-        <% has_dotp = any(A[j, kx] != 0 for kx in range(k)) %>
+        <% has_dotp = A[j].any() %>
         % if beta == 0:
         dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
                                                           for i in range(ksplit - 1))};
diff --git a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
index b435913..bdac6dc 100644
--- a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
@@ -1,46 +1,6 @@
 <%inherit file='base'/>
 
-% if width == 2:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
-}
-% elif width == 4:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
-}
-% else:
-#error "cstream_ksplit_width_preload_c only supports width=2 or width=4"
-% endif
+<%include file='vector'/>
 
 <%
 kparts = partition(A, ksplit, by='cols')
@@ -88,12 +48,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 
       ## Expand vectorized partial dot product
       <%
-      nzixs = []
-      for l_idx, kx in enumerate(kbx):
-          if A[j, kx] != 0:
-              nzixs.append((l_idx, kx))
+      nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
 
-      has_dotp = any(A[j, kx] != 0 for kx in range(k))
+      has_dotp = A[j].any()
       if not nzixs:
           dotex = 'make_zero()'
       else:
@@ -131,7 +88,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % for j in cchunk:
       % if loop.index % ksplit == bid:
         <%
-        has_dotp = any(A[j, kx] != 0 for kx in range(k))
+        has_dotp = A[j].any()
         sum_expr = f"cv[{loop.index // ksplit}]"
         for s_idx in range(ksplit - 1):
             sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
diff --git a/gimmik/kernels/hip/cstream-width-preload-c.mako b/gimmik/kernels/hip/cstream-width-preload-c.mako
index 9f2f57c..86acfcb 100644
--- a/gimmik/kernels/hip/cstream-width-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-width-preload-c.mako
@@ -1,46 +1,6 @@
 <%inherit file='base'/>
 
-% if width == 2:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
-}
-% elif width == 4:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
-}
-% else:
-#error "cstream_width_preload_c only supports width=2 or width=4"
-% endif
+<%include file='vector'/>
 
 __global__ __launch_bounds__(${blockx}) void
 % if n is None:
diff --git a/gimmik/kernels/hip/vector.mako b/gimmik/kernels/hip/vector.mako
new file mode 100644
index 0000000..268d6ab
--- /dev/null
+++ b/gimmik/kernels/hip/vector.mako
@@ -0,0 +1,41 @@
+% if width == 2:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
+}
+% elif width == 4:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
+}
+% else:
+#error "HIP vector helpers only support width=2 or width=4"
+% endif

From 739a82e18f508597081da9c73fb6ed780e7817ed Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Mon, 22 Jun 2026 01:45:23 -0500
Subject: [PATCH 04/15] Handle ROCm feature suffixes for gfx942 tuning

---
 gimmik/hip.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index cea1be5..a31c3d6 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -40,7 +40,8 @@ def emit(name, args, meta):
         yield from emit('cstream-ksplit', args, meta)
 
         # Only emit tuned variants on the architecture they were tuned for.
-        if gcn_arch != 'gfx942' or warp_size != 64:
+        base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None
+        if base_arch != 'gfx942' or warp_size != 64:
             return
 
         # Tuned HIP variants

From 0633539fbe6c99c97fdbf7aa91c0cb8df35e5160 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Mon, 22 Jun 2026 03:30:28 -0500
Subject: [PATCH 05/15] Enable tuned HIP variants on gfx90a

---
 gimmik/hip.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index a31c3d6..e32152d 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -39,9 +39,9 @@ def emit(name, args, meta):
         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
         yield from emit('cstream-ksplit', args, meta)
 
-        # Only emit tuned variants on the architecture they were tuned for.
+        # Only emit tuned variants on architectures they have been validated for.
         base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None
-        if base_arch != 'gfx942' or warp_size != 64:
+        if base_arch not in {'gfx90a', 'gfx942'} or warp_size != 64:
             return
 
         # Tuned HIP variants

From 7b59fb0a791cbf1324a46c5b0451bfc63a8b6140 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Tue, 23 Jun 2026 04:57:56 -0500
Subject: [PATCH 06/15] Parameterize HIP vector width and refine preload
 kernels

---
 gimmik/hip.py                                 | 155 ++++++++----------
 gimmik/kernels/hip/base.mako                  |  83 ++++++++++
 .../kernels/hip/bstream-msplit-preload-c.mako |  28 ++--
 .../hip/bstream-msplit-width-preload-c.mako   | 100 -----------
 gimmik/kernels/hip/bstream-msplit.mako        |  16 +-
 gimmik/kernels/hip/bstream-preload-c.mako     |  28 ++--
 .../kernels/hip/bstream-width-preload-c.mako  |  65 --------
 gimmik/kernels/hip/bstream.mako               |  16 +-
 .../kernels/hip/cstream-ksplit-preload-c.mako |  45 ++---
 .../hip/cstream-ksplit-width-preload-c.mako   | 114 -------------
 gimmik/kernels/hip/cstream-ksplit.mako        |  30 ++--
 gimmik/kernels/hip/cstream-preload-c.mako     |  32 ++--
 .../kernels/hip/cstream-width-preload-c.mako  |  66 --------
 gimmik/kernels/hip/cstream.mako               |  23 ++-
 gimmik/kernels/hip/vector.mako                |  41 -----
 15 files changed, 286 insertions(+), 556 deletions(-)
 delete mode 100644 gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/bstream-width-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/cstream-width-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/vector.mako

diff --git a/gimmik/hip.py b/gimmik/hip.py
index e32152d..57fa394 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -47,96 +47,83 @@ def emit(name, args, meta):
         # Tuned HIP variants
         msplits, ksplits = [4, 8], [2, 4]
         bsz, csz, blkx = 8, 8, 64
-        width = 2 if self.aligne is not None and self.aligne % 2 == 0 else 1
-
-        # B loading, C streaming kernel
-        args = {'blockx': blkx}
-        meta = {'block': (blkx, 1, 1), 'desc': f'cstream/x{blkx}'}
-        yield from emit('cstream', args, meta)
-
-        # B streaming, C accumulation kernel
-        meta = {'block': (blkx, 1, 1), 'desc': f'bstream/x{blkx}'}
-        yield from emit('bstream', args, meta)
-
-        for ms in msplits:
-            # m-split B streaming, C accumulation kernel
-            args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
-            shared = 2*bsz*blkx*dsize
-            meta = {'block': (blkx, ms, 1), 'shared': shared,
-                    'desc': f'bstream-msplit/m{ms}-b{bsz}-x{blkx}'}
-            yield from emit('bstream-msplit', args, meta)
-
-        for ks in ksplits:
-            # k-split B loading, C streaming kernel
-            args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
-            shared = (ks - 1)*csz*blkx*dsize
-            meta = {'block': (blkx, ks, 1), 'shared': shared,
-                    'desc': f'cstream-ksplit/k{ks}-c{csz}-x{blkx}'}
-            yield from emit('cstream-ksplit', args, meta)
-
-        # B loading, C preloading, C streaming kernel
-        args = {'blockx': blkx}
-        meta = {'block': (blkx, 1, 1), 'desc': f'cstream-preload-c/x{blkx}'}
-        yield from emit('cstream-preload-c', args, meta)
-
-        # B streaming, C preloading, C accumulation kernel
-        meta = {'block': (blkx, 1, 1), 'desc': f'bstream-preload-c/x{blkx}'}
-        yield from emit('bstream-preload-c', args, meta)
-
-        if width > 1:
-            args = {'dtype': f'{dtype}{width}', 'width': width,
-                    'blockx': blkx}
-            meta = {'block': (blkx, 1, 1), 'width': width,
-                    'desc': f'cstream-width-preload-c/w{width}-x{blkx}'}
-            yield from emit('cstream-width-preload-c', args, meta)
-
-            meta = {'block': (blkx, 1, 1), 'width': width,
-                    'desc': f'bstream-width-preload-c/w{width}-x{blkx}'}
-            yield from emit('bstream-width-preload-c', args, meta)
-
-        for ms in msplits:
-            # m-split B streaming, C preloading, C accumulation kernel
-            args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
-            shared = 2*bsz*blkx*dsize
-            meta = {'block': (blkx, ms, 1), 'shared': shared,
-                    'desc': f'bstream-msplit-preload-c/m{ms}-b{bsz}-x{blkx}'}
-            yield from emit('bstream-msplit-preload-c', args, meta)
-
-            if width > 1:
-                args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx,
-                        'dtype': f'{dtype}{width}', 'width': width}
+        widths = [1]
+        if self.aligne is not None and self.aligne % 2 == 0:
+            widths.append(2)
+
+        for width in widths:
+            wargs = ({'dtype': f'{dtype}{width}', 'width': width}
+                     if width > 1 else {})
+            wmeta = {'width': width} if width > 1 else {}
+            wpfx = f'w{width}-' if width > 1 else ''
+
+            # B loading, C streaming kernel
+            args = {'blockx': blkx} | wargs
+            meta = {'block': (blkx, 1, 1),
+                    'desc': f'cstream/{wpfx}x{blkx}'} | wmeta
+            yield from emit('cstream', args, meta)
+
+            # B streaming, C accumulation kernel
+            meta = {'block': (blkx, 1, 1),
+                    'desc': f'bstream/{wpfx}x{blkx}'} | wmeta
+            yield from emit('bstream', args, meta)
+
+            for ms in msplits:
+                # m-split B streaming, C accumulation kernel
+                args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs
+                shared = 2*bsz*blkx*dsize*width
                 meta = {
-                    'block': (blkx, ms, 1), 'shared': shared*width,
-                    'width': width,
+                    'block': (blkx, ms, 1), 'shared': shared,
+                    'desc': f'bstream-msplit/{wpfx}m{ms}-b{bsz}-x{blkx}'
+                } | wmeta
+                yield from emit('bstream-msplit', args, meta)
+
+            for ks in ksplits:
+                # k-split B loading, C streaming kernel
+                args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
+                shared = (ks - 1)*csz*blkx*dsize*width
+                meta = {
+                    'block': (blkx, ks, 1), 'shared': shared,
+                    'desc': f'cstream-ksplit/{wpfx}k{ks}-c{csz}-x{blkx}'
+                } | wmeta
+                yield from emit('cstream-ksplit', args, meta)
+
+            # B loading, C preloading, C streaming kernel
+            args = {'blockx': blkx} | wargs
+            meta = {'block': (blkx, 1, 1),
+                    'desc': f'cstream-preload-c/{wpfx}x{blkx}'} | wmeta
+            yield from emit('cstream-preload-c', args, meta)
+
+            # B streaming, C preloading, C accumulation kernel
+            meta = {'block': (blkx, 1, 1),
+                    'desc': f'bstream-preload-c/{wpfx}x{blkx}'} | wmeta
+            yield from emit('bstream-preload-c', args, meta)
+
+            for ms in msplits:
+                # m-split B streaming, C preloading, C accumulation kernel
+                args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs
+                shared = 2*bsz*blkx*dsize*width
+                meta = {
+                    'block': (blkx, ms, 1), 'shared': shared,
                     'desc': (
-                        f'bstream-msplit-width-preload-c/w{width}-'
-                        f'm{ms}-b{bsz}-x{blkx}'
+                        f'bstream-msplit-preload-c/'
+                        f'{wpfx}m{ms}-b{bsz}-x{blkx}'
                     )
-                }
-                yield from emit('bstream-msplit-width-preload-c', args, meta)
-
-        for ks in ksplits:
-            # k-split B loading, C preloading, C streaming kernel
-            args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
-            shared = (ks - 1)*csz*blkx*dsize
-            meta = {
-                'block': (blkx, ks, 1), 'shared': shared,
-                'desc': f'cstream-ksplit-preload-c/k{ks}-c{csz}-x{blkx}'
-            }
-            yield from emit('cstream-ksplit-preload-c', args, meta)
-
-            if width > 1:
-                args = {'ksplit': ks, 'csz': csz, 'blockx': blkx,
-                        'dtype': f'{dtype}{width}', 'width': width}
+                } | wmeta
+                yield from emit('bstream-msplit-preload-c', args, meta)
+
+            for ks in ksplits:
+                # k-split B loading, C preloading, C streaming kernel
+                args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
+                shared = (ks - 1)*csz*blkx*dsize*width
                 meta = {
-                    'block': (blkx, ks, 1), 'shared': shared*width,
-                    'width': width,
+                    'block': (blkx, ks, 1), 'shared': shared,
                     'desc': (
-                        f'cstream-ksplit-width-preload-c/w{width}-'
-                        f'k{ks}-c{csz}-x{blkx}'
+                        f'cstream-ksplit-preload-c/'
+                        f'{wpfx}k{ks}-c{csz}-x{blkx}'
                     )
-                }
-                yield from emit('cstream-ksplit-width-preload-c', args, meta)
+                } | wmeta
+                yield from emit('cstream-ksplit-preload-c', args, meta)
 
     def _process_meta(self, meta):
         if self.n is not None:
diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako
index b40a0b9..a03a943 100644
--- a/gimmik/kernels/hip/base.mako
+++ b/gimmik/kernels/hip/base.mako
@@ -9,6 +9,67 @@ static inline __device__ ${dtype} make_zero()
 { return 0; }
 % endif
 
+% if width == 1:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype} a, ${dtype} b)
+{
+    return a*b;
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return a + b;
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return acc + a*b;
+}
+% elif width == 2:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
+}
+% elif width == 4:
+static inline __device__ ${dtype}
+gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
+{
+    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vadd(${dtype} a, ${dtype} b)
+{
+    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+static inline __device__ ${dtype}
+gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
+{
+    // Keep the multiply-add expression visible to the compiler.
+    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
+}
+% else:
+#error "HIP vector helpers only support width=2 or width=4"
+% endif
+
 static inline __device__ void
 nt_store_c(${dtype}* p, ${dtype} v)
 {
@@ -41,4 +102,26 @@ nt_load_c(const ${dtype}* p)
 % endif
 }
 
+<% nt_c = context.get('nt_c', True) %>
+
+static inline __device__ void
+store_c(${dtype}* p, ${dtype} v)
+{
+% if nt_c:
+    nt_store_c(p, v);
+% else:
+    *p = v;
+% endif
+}
+
+static inline __device__ ${dtype}
+load_c(const ${dtype}* p)
+{
+% if nt_c:
+    return nt_load_c(p);
+% else:
+    return *p;
+% endif
+}
+
 ${next.body()}
diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
index 8b6f008..6cabeb6 100644
--- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
@@ -12,7 +12,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -38,18 +38,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % endif
   % endfor
 
+  % if beta != 0:
   ## Preload C values for active rows owned by this m-split lane
   % for j, jx in enumerate(mx[cid]):
     % if afix[jx] != -1:
-      % if beta == 0:
-        csub[${j}] = make_zero();
-      % elif beta == 1:
-        csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]);
+      % if beta == 1:
+        csub[${j}] = load_c(&c[i + ${jx}*ldc]);
       % else:
-        csub[${j}] = ${beta}*nt_load_c(&c[i + ${jx}*ldc]);
+        csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${jx}*ldc]));
       % endif
     % endif
   % endfor
+  % endif
     }
 % endfor
     __syncthreads();
@@ -72,12 +72,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % for kx in bchunks[bb]:
         bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
       % for j, jx in enumerate(A[mcx, kx]):
-        % if jx != 0:
-        csub[${j}] += ${jx}*bv;
+        % if beta == 0:
+          % if jx != 0 and kx == afix[mcx[j]]:
+        csub[${j}] = gimmik_vmul(${jx}, bv);
+          % elif jx != 0:
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+          % endif
+        % elif jx != 0:
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
         % endif
         ## If we're done with this dot product then store to global
         % if kx == alix[mcx[j]]:
-        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
+        store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
         % endif
       % endfor
     % endfor
@@ -85,9 +91,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if loop.parent.last:
       % for j, jx in enumerate(afix):
         % if jx == -1 and j % msplit == cid and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        store_c(&c[i + ${j}*ldc], make_zero());
         % elif jx == -1 and j % msplit == cid and beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
         % endif
       % endfor
     % endif
diff --git a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
deleted file mode 100644
index 4466f9f..0000000
--- a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako
+++ /dev/null
@@ -1,100 +0,0 @@
-<%inherit file='base'/>
-
-<%include file='vector'/>
-
-<%
-mx = partition(A, into=msplit, by='rows')
-bchunks = chunk(bix, bsz)
-%>
-
-__global__ __launch_bounds__(${blockx*msplit}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    ${dtype} bv, csub[${-(-m // msplit)}];
-    __shared__ ${dtype} bsub[2][${bsz}][${blockx}];
-
-## Fill the initial shared memory block
-% for cid in range(msplit):
-    if (i < n && threadIdx.y == ${cid})
-    {
-  % for kx in bchunks[0]:
-    % if loop.index % msplit == cid:
-        bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
-    % endif
-  % endfor
-
-  ## Preload C values for active rows owned by this m-split lane
-  % for j, jx in enumerate(mx[cid]):
-    % if afix[jx] != -1:
-      % if beta == 0:
-        csub[${j}] = make_zero();
-      % elif beta == 1:
-        csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]);
-      % else:
-        csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${jx}*ldc]));
-      % endif
-    % endif
-  % endfor
-    }
-% endfor
-    __syncthreads();
-
-## Iterate over each row-chunk of B
-% for bb in range(len(bchunks)):
-  ## Iterate over each row-chunk of C
-  % for cid, mcx in enumerate(mx):
-    if (i < n && threadIdx.y == ${cid})
-    {
-    ## Start filling the next shared memory block
-    % if not loop.parent.last:
-      % for kx in bchunks[bb + 1]:
-        % if loop.index % msplit == cid:
-        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
-        % endif
-      % endfor
-    % endif
-    ## Accumulate our dot products
-    % for kx in bchunks[bb]:
-        bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
-      % for j, jx in enumerate(A[mcx, kx]):
-        % if jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
-        % endif
-        ## If we're done with this dot product then store to global
-        % if kx == alix[mcx[j]]:
-        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
-        % endif
-      % endfor
-    % endfor
-    ## Handle rows of A which are all zero
-    % if loop.parent.last:
-      % for j, jx in enumerate(afix):
-        % if jx == -1 and j % msplit == cid and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
-        % elif jx == -1 and j % msplit == cid and beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
-        % endif
-      % endfor
-    % endif
-    }
-  % endfor
-    __syncthreads();
-% endfor
-}
diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako
index 6470477..35d336e 100644
--- a/gimmik/kernels/hip/bstream-msplit.mako
+++ b/gimmik/kernels/hip/bstream-msplit.mako
@@ -12,7 +12,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -60,17 +60,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
       % for j, jx in enumerate(A[mcx, kx]):
         % if jx != 0 and kx == afix[mcx[j]]:
-        csub[${j}] = ${jx}*bv;
+        csub[${j}] = gimmik_vmul(${jx}, bv);
         % elif jx != 0:
-        csub[${j}] += ${jx}*bv;
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
         % endif
         ## If we're done with this dot product then store to global
         % if kx == alix[mcx[j]] and beta == 0:
-        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
+        store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
         % elif kx == alix[mcx[j]] and beta == 1:
-        nt_store_c(&c[i + ${mcx[j]}*ldc], nt_load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]);
+        store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(load_c(&c[i + ${mcx[j]}*ldc]), csub[${j}]));
         % elif kx == alix[mcx[j]]:
-        nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${mcx[j]}*ldc]));
+        store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${mcx[j]}*ldc]))));
         % endif
       % endfor
     % endfor
@@ -78,9 +78,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if loop.parent.last:
       % for j, jx in enumerate(afix):
         % if jx == -1 and j % msplit == cid and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        store_c(&c[i + ${j}*ldc], make_zero());
         % elif jx == -1 and j % msplit == cid and beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta});
+        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
         % endif
       % endfor
     % endif
diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako
index 30b08f6..095be83 100644
--- a/gimmik/kernels/hip/bstream-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-preload-c.mako
@@ -7,7 +7,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -24,29 +24,35 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     {
         ${dtype} bv, csub[${m}];
 
+% if beta != 0:
 ## Preload C values for rows which will receive a non-zero dot product
 % for j, jx in enumerate(afix):
   % if jx != -1:
-    % if beta == 0:
-        csub[${j}] = make_zero();
-    % elif beta == 1:
-        csub[${j}] = nt_load_c(&c[i + ${j}*ldc]);
+    % if beta == 1:
+        csub[${j}] = load_c(&c[i + ${j}*ldc]);
     % else:
-        csub[${j}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]);
+        csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]));
     % endif
   % endif
 % endfor
+% endif
 
 ## Iterate through the used rows of B
 % for kx in bix:
         bv = b[i + ${kx}*ldb];
   % for j, jx in enumerate(A[:, kx]):
-    % if jx != 0:
-        csub[${j}] += ${jx}*bv;
+    % if beta == 0:
+      % if jx != 0 and kx == afix[j]:
+        csub[${j}] = gimmik_vmul(${jx}, bv);
+      % elif jx != 0:
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+      % endif
+    % elif jx != 0:
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
     % endif
     ##
     % if kx == alix[j]:
-        nt_store_c(&c[i + ${j}*ldc], csub[${j}]);
+        store_c(&c[i + ${j}*ldc], csub[${j}]);
     % endif
   % endfor
 % endfor
@@ -54,9 +60,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 ## Handle rows of A which are all zero
 % for j, jx in enumerate(afix):
   % if jx == -1 and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        store_c(&c[i + ${j}*ldc], make_zero());
   % elif jx == -1 and beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
   % endif
 % endfor
     }
diff --git a/gimmik/kernels/hip/bstream-width-preload-c.mako b/gimmik/kernels/hip/bstream-width-preload-c.mako
deleted file mode 100644
index 2c4e5c5..0000000
--- a/gimmik/kernels/hip/bstream-width-preload-c.mako
+++ /dev/null
@@ -1,65 +0,0 @@
-<%inherit file='base'/>
-
-<%include file='vector'/>
-
-__global__ __launch_bounds__(${blockx}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i < n)
-    {
-        ${dtype} bv, csub[${m}];
-
-## Preload C values for rows which will receive a non-zero dot product
-% for j, jx in enumerate(afix):
-  % if jx != -1:
-    % if beta == 0:
-        csub[${j}] = make_zero();
-    % elif beta == 1:
-        csub[${j}] = nt_load_c(&c[i + ${j}*ldc]);
-    % else:
-        csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]));
-    % endif
-  % endif
-% endfor
-
-## Iterate through the used rows of B
-% for kx in bix:
-        bv = b[i + ${kx}*ldb];
-  % for j, jx in enumerate(A[:, kx]):
-    % if jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
-    % endif
-    ##
-    % if kx == alix[j]:
-        nt_store_c(&c[i + ${j}*ldc], csub[${j}]);
-    % endif
-  % endfor
-% endfor
-
-## Handle rows of A which are all zero
-% for j, jx in enumerate(afix):
-  % if jx == -1 and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
-  % elif jx == -1 and beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
-  % endif
-% endfor
-    }
-}
diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako
index 9634c73..427dffc 100644
--- a/gimmik/kernels/hip/bstream.mako
+++ b/gimmik/kernels/hip/bstream.mako
@@ -7,7 +7,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -29,17 +29,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         bv = b[i + ${kx}*ldb];
   % for j, jx in enumerate(A[:, kx]):
     % if jx != 0 and kx == afix[j]:
-        csub[${j}] = ${jx}*bv;
+        csub[${j}] = gimmik_vmul(${jx}, bv);
     % elif jx != 0:
-        csub[${j}] += ${jx}*bv;
+        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
     % endif
     ##
     % if kx == alix[j] and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], csub[${j}]);
+        store_c(&c[i + ${j}*ldc], csub[${j}]);
     % elif kx == alix[j] and beta == 1:
-        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + csub[${j}]);
+        store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), csub[${j}]));
     % elif kx == alix[j]:
-        nt_store_c(&c[i + ${j}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))));
     % endif
   % endfor
 % endfor
@@ -47,9 +47,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 ## Handle rows of A which are all zero
 % for j, jx in enumerate(afix):
   % if jx == -1 and beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        store_c(&c[i + ${j}*ldc], make_zero());
   % elif jx == -1 and beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta});
+        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
   % endif
 % endfor
     }
diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
index 51f1db4..4550700 100644
--- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
@@ -13,7 +13,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -44,24 +44,27 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         % endif
       % endfor
       <%
-      dotex = dot(lambda kx: f'bv[{kx}]', A[j, kbx])
+      nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
       has_dotp = A[j].any()
+      if nzixs:
+          first_l_idx, first_kx = nzixs[0]
+          dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])"
+          for l_idx, kx in nzixs[1:]:
+              dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])"
+      else:
+          dotex = 'make_zero()'
       %>
-      % if dotex != '0.0':
         dotp = ${dotex};
-      % else:
-        dotp = make_zero();
-      % endif
       ## Save to a register
       % if loop.index % ksplit == bid:
         % if beta == 0:
         cv[${loop.index // ksplit}] = dotp;
         % elif beta == 1 and has_dotp:
-        cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]);
-        cv[${loop.index // ksplit}] += dotp;
+        cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
         % elif has_dotp:
-        cv[${loop.index // ksplit}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]);
-        cv[${loop.index // ksplit}] += dotp;
+        cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]));
+        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
         % endif
       ## Save to shared memory
       % else:
@@ -79,20 +82,22 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % for j in cchunk:
       % if loop.index % ksplit == bid:
         <% has_dotp = A[j].any() %>
+        <%
+        sum_expr = f"cv[{loop.index // ksplit}]"
+        for s_idx in range(ksplit - 1):
+            sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
+        %>
         % if beta == 0:
-        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
-                                                          for i in range(ksplit - 1))};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        dotp = ${sum_expr};
+        store_c(&c[i + ${j}*ldc], dotp);
         % elif beta == 1 and has_dotp:
-        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
-                                                          for i in range(ksplit - 1))};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        dotp = ${sum_expr};
+        store_c(&c[i + ${j}*ldc], dotp);
         % elif beta != 1 and has_dotp:
-        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
-                                                          for i in range(ksplit - 1))};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        dotp = ${sum_expr};
+        store_c(&c[i + ${j}*ldc], dotp);
         % elif beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
         % endif
       % endif
     % endfor
diff --git a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
deleted file mode 100644
index bdac6dc..0000000
--- a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako
+++ /dev/null
@@ -1,114 +0,0 @@
-<%inherit file='base'/>
-
-<%include file='vector'/>
-
-<%
-kparts = partition(A, ksplit, by='cols')
-cchunks = chunk(range(m), csz)
-loaded = set()
-%>
-
-__global__ __launch_bounds__(${blockx*ksplit}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
-    __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
-
-## Iterate over the row-partitions of C
-% for cchunk in cchunks:
-  ## Iterate over the column-partitions of B
-  % for bid, kbx in enumerate(kparts):
-    if (i < n && threadIdx.y == ${bid})
-    {
-    ## Evaluate our partial dot products
-    % for j in cchunk:
-      ## Load in any missing parts of B
-      % for kx in kbx:
-        % if A[j, kx] != 0 and kx not in loaded:
-        bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
-        % endif
-      % endfor
-
-      ## Expand vectorized partial dot product
-      <%
-      nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
-
-      has_dotp = A[j].any()
-      if not nzixs:
-          dotex = 'make_zero()'
-      else:
-          first_l_idx, first_kx = nzixs[0]
-          dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])"
-          for l_idx, kx in nzixs[1:]:
-              dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])"
-      %>
-      dotp = ${dotex};
-
-      ## Save to a register
-      % if loop.index % ksplit == bid:
-        % if beta == 0:
-        cv[${loop.index // ksplit}] = dotp;
-        % elif beta == 1 and has_dotp:
-        cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]);
-        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
-        % elif has_dotp:
-        cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]));
-        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
-        % endif
-      ## Save to shared memory
-      % else:
-        csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
-      % endif
-    % endfor
-    }
-  % endfor
-    __syncthreads();
-    
-  ## Sum and output the final set of dot products
-  % for bid, kbx in enumerate(kparts):
-    if (i < n && threadIdx.y == ${bid})
-    {
-    % for j in cchunk:
-      % if loop.index % ksplit == bid:
-        <%
-        has_dotp = A[j].any()
-        sum_expr = f"cv[{loop.index // ksplit}]"
-        for s_idx in range(ksplit - 1):
-            sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
-        %>
-        % if beta == 0:
-        dotp = ${sum_expr};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-        % elif beta == 1 and has_dotp:
-        dotp = ${sum_expr};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-        % elif beta != 1 and has_dotp:
-        dotp = ${sum_expr};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-        % elif beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
-        % endif
-      % endif
-    % endfor
-    }
-  % endfor
-    __syncthreads();
-% endfor
-}
diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako
index 6fd3210..9c5fc09 100644
--- a/gimmik/kernels/hip/cstream-ksplit.mako
+++ b/gimmik/kernels/hip/cstream-ksplit.mako
@@ -13,7 +13,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -43,11 +43,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
         % endif
       % endfor
-      % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0':
+      <%
+      nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
+      if nzixs:
+          first_l_idx, first_kx = nzixs[0]
+          dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])"
+          for l_idx, kx in nzixs[1:]:
+              dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])"
+      else:
+          dotex = 'make_zero()'
+      %>
         dotp = ${dotex};
-      % else:
-        dotp = make_zero();
-      % endif
       ## Save to a register
       % if loop.index % ksplit == bid:
         cv[${loop.index // ksplit}] = dotp;
@@ -66,14 +72,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     ## Sum and output the final set of dot products
     % for j in cchunk:
       % if loop.index % ksplit == bid:
-        dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
-                                                          for i in range(ksplit - 1))};
+        <%
+        sum_expr = f"cv[{loop.index // ksplit}]"
+        for s_idx in range(ksplit - 1):
+            sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
+        %>
+        dotp = ${sum_expr};
         % if beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        store_c(&c[i + ${j}*ldc], dotp);
         % elif beta == 1:
-        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp);
+        store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp));
         % else:
-        nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))));
         % endif
       % endif
     % endfor
diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako
index 041e674..eebb602 100644
--- a/gimmik/kernels/hip/cstream-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-preload-c.mako
@@ -9,7 +9,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -26,24 +26,34 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     if (i < n)
     {
 % for j, jx in enumerate(A):
-  % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0':
+  <%
+  nzixs = [kx for kx, val in enumerate(jx) if val != 0]
+  if nzixs:
+      first_kx = nzixs[0]
+      dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])"
+      for kx in nzixs[1:]:
+          dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])"
+  else:
+      dotex = 'make_zero()'
+  %>
+  % if nzixs:
     % if beta == 0:
         dotp = ${dotex};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        store_c(&c[i + ${j}*ldc], dotp);
     % elif beta == 1:
-        dotp = nt_load_c(&c[i + ${j}*ldc]);
-        dotp += ${dotex};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        dotp = load_c(&c[i + ${j}*ldc]);
+        dotp = gimmik_vadd(dotp, ${dotex});
+        store_c(&c[i + ${j}*ldc], dotp);
     % else:
-        dotp = ${beta}*nt_load_c(&c[i + ${j}*ldc]);
-        dotp += ${dotex};
-        nt_store_c(&c[i + ${j}*ldc], dotp);
+        dotp = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]));
+        dotp = gimmik_vadd(dotp, ${dotex});
+        store_c(&c[i + ${j}*ldc], dotp);
     % endif
   % else:
     % if beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
+        store_c(&c[i + ${j}*ldc], make_zero());
     % elif beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
     % endif
   % endif
 % endfor
diff --git a/gimmik/kernels/hip/cstream-width-preload-c.mako b/gimmik/kernels/hip/cstream-width-preload-c.mako
deleted file mode 100644
index 86acfcb..0000000
--- a/gimmik/kernels/hip/cstream-width-preload-c.mako
+++ /dev/null
@@ -1,66 +0,0 @@
-<%inherit file='base'/>
-
-<%include file='vector'/>
-
-__global__ __launch_bounds__(${blockx}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-    ${dtype} bv, dotp;
-
-    if (i < n)
-    {
-% for j, row in enumerate(A):
-  <%
-  nzixs = [kx for kx, val in enumerate(row) if val != 0]
-  %>
-  % if nzixs:
-    % if beta == 0:
-      <% first_kx = nzixs[0] %>
-        bv = b[i + ${first_kx}*ldb];
-        dotp = gimmik_vmul(${row[first_kx]}, bv);
-      % for kx in nzixs[1:]:
-        bv = b[i + ${kx}*ldb];
-        dotp = gimmik_vmadd(dotp, ${row[kx]}, bv);
-      % endfor
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-    % elif beta == 1:
-        dotp = nt_load_c(&c[i + ${j}*ldc]);
-      % for kx in nzixs:
-        bv = b[i + ${kx}*ldb];
-        dotp = gimmik_vmadd(dotp, ${row[kx]}, bv);
-      % endfor
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-    % else:
-        dotp = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]));
-      % for kx in nzixs:
-        bv = b[i + ${kx}*ldb];
-        dotp = gimmik_vmadd(dotp, ${row[kx]}, bv);
-      % endfor
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-    % endif
-  % else:
-    % if beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], make_zero());
-    % elif beta != 1:
-        nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])));
-    % endif
-  % endif
-% endfor
-    }
-}
diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako
index 0651e87..1b7b312 100644
--- a/gimmik/kernels/hip/cstream.mako
+++ b/gimmik/kernels/hip/cstream.mako
@@ -9,7 +9,7 @@ ${kname}(int n,
          ${dtype}* __restrict__ c, int ldc)
 {
   % if width > 1:
-    n = ((n + ${width} - 1) / ${width}) * ${width};
+    n = (n + ${width} - 1) / ${width};
     ldb /= ${width};
     ldc /= ${width};
   % endif
@@ -26,17 +26,26 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     if (i < n)
     {
 % for j, jx in enumerate(A):
-  % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0':
+  <%
+  nzixs = [kx for kx, val in enumerate(jx) if val != 0]
+  if nzixs:
+      first_kx = nzixs[0]
+      dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])"
+      for kx in nzixs[1:]:
+          dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])"
+  else:
+      dotex = 'make_zero()'
+  %>
         dotp = ${dotex};
-  % else:
+  % if not nzixs:
         dotp = make_zero();
   % endif
   % if beta == 0:
-        nt_store_c(&c[i + ${j}*ldc], dotp);
-  % elif beta == 1 and dotex != '0.0':
-        nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp);
+        store_c(&c[i + ${j}*ldc], dotp);
+  % elif beta == 1 and nzixs:
+        store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp));
   % else:
-        nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc]));
+        store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))));
   % endif
 % endfor
     }
diff --git a/gimmik/kernels/hip/vector.mako b/gimmik/kernels/hip/vector.mako
deleted file mode 100644
index 268d6ab..0000000
--- a/gimmik/kernels/hip/vector.mako
+++ /dev/null
@@ -1,41 +0,0 @@
-% if width == 2:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
-}
-% elif width == 4:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
-}
-% else:
-#error "HIP vector helpers only support width=2 or width=4"
-% endif

From e9b921a99f7ce476c98fc59bace1a4e72c26259b Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Tue, 23 Jun 2026 05:14:11 -0500
Subject: [PATCH 07/15] Use blockx launch bounds for HIP cstream preload

---
 gimmik/kernels/hip/cstream-preload-c.mako | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako
index eebb602..a8d7e31 100644
--- a/gimmik/kernels/hip/cstream-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-preload-c.mako
@@ -2,7 +2,7 @@
 
 <% ksplit = 2 if m < 36 else 1 %>
 
-__global__ __launch_bounds__(128) void
+__global__ __launch_bounds__(${blockx}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,

From 2aa2577a4a99a5479f57679c62a9debacf083ef6 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Tue, 23 Jun 2026 21:49:28 -0500
Subject: [PATCH 08/15] Always use non-temporal C accesses for HIP

---
 gimmik/kernels/hip/base.mako | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako
index a03a943..831b8e0 100644
--- a/gimmik/kernels/hip/base.mako
+++ b/gimmik/kernels/hip/base.mako
@@ -102,26 +102,16 @@ nt_load_c(const ${dtype}* p)
 % endif
 }
 
-<% nt_c = context.get('nt_c', True) %>
-
 static inline __device__ void
 store_c(${dtype}* p, ${dtype} v)
 {
-% if nt_c:
     nt_store_c(p, v);
-% else:
-    *p = v;
-% endif
 }
 
 static inline __device__ ${dtype}
 load_c(const ${dtype}* p)
 {
-% if nt_c:
     return nt_load_c(p);
-% else:
-    return *p;
-% endif
 }
 
 ${next.body()}

From be1c1dbb58e34353e73d7b863aa6eceecf9a0880 Mon Sep 17 00:00:00 2001
From: "Eric.Chin.AMD" <ericchin@amd.com>
Date: Wed, 24 Jun 2026 22:47:02 +0800
Subject: [PATCH 09/15] feat(hip): add non-temporal B-load (NTB) variants for
 bstream-msplit

On memory-bound operators the B matrix is read once from HBM and reused
only within a work-group via LDS -- it is never re-read across blocks.
A normal global load still allocates B's line in L2, which is pure
overhead: the line is never reused, it evicts genuinely-reusable data,
and it adds cache-allocate/eviction traffic. This is the read-side mirror
of the non-temporal C store we already use.

NTB loads B with a non-temporal load (load_b -> __builtin_nontemporal
path) so B bypasses L2 instead of polluting it. It moves the same number
of bytes but keeps the cache clean, raising effective bandwidth.

Implemented as a flag on the existing templates rather than new files:
- base.mako: add a load_b() wrapper (non-temporal B load).
- bstream-msplit{,-preload-c}.mako: gate the B read behind an `ntload`
  flag (context.get('ntload', False)); renders byte-identically to the
  plain kernel when the flag is absent.
- hip.py: emit `*-ntb` variants by passing ntload=True inside the
  existing width loop, so NTB combines with width (w1/w2) automatically.

Backward-compatible (plain variants unchanged) and CDNA-gated like the
other tuned variants. On MI300X (gfx942) NTB passes the accuracy check
(~1e-15) and wins the autotune in the majority of memory-bound cases
(~+4.5% bandwidth on those), being chosen over the plain bstream-msplit.
---
 gimmik/hip.py                                   | 17 +++++++++++++++++
 gimmik/kernels/hip/base.mako                    |  9 +++++++++
 .../kernels/hip/bstream-msplit-preload-c.mako   |  6 ++++--
 gimmik/kernels/hip/bstream-msplit.mako          |  6 ++++--
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index 57fa394..860ed6c 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -78,6 +78,13 @@ def emit(name, args, meta):
                 } | wmeta
                 yield from emit('bstream-msplit', args, meta)
 
+                # non-temporal B-load variant: B is read-once -> skip L2 alloc
+                nmeta = {
+                    'block': (blkx, ms, 1), 'shared': shared,
+                    'desc': f'bstream-msplit-ntb/{wpfx}m{ms}-b{bsz}-x{blkx}'
+                } | wmeta
+                yield from emit('bstream-msplit', args | {'ntload': True}, nmeta)
+
             for ks in ksplits:
                 # k-split B loading, C streaming kernel
                 args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
@@ -112,6 +119,16 @@ def emit(name, args, meta):
                 } | wmeta
                 yield from emit('bstream-msplit-preload-c', args, meta)
 
+                # non-temporal B-load variant
+                nmeta = {
+                    'block': (blkx, ms, 1), 'shared': shared,
+                    'desc': (
+                        f'bstream-msplit-preload-c-ntb/'
+                        f'{wpfx}m{ms}-b{bsz}-x{blkx}'
+                    )
+                } | wmeta
+                yield from emit('bstream-msplit-preload-c', args | {'ntload': True}, nmeta)
+
             for ks in ksplits:
                 # k-split B loading, C preloading, C streaming kernel
                 args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako
index 831b8e0..5291538 100644
--- a/gimmik/kernels/hip/base.mako
+++ b/gimmik/kernels/hip/base.mako
@@ -114,4 +114,13 @@ load_c(const ${dtype}* p)
     return nt_load_c(p);
 }
 
+static inline __device__ ${dtype}
+load_b(const ${dtype}* p)
+{
+    // B is read-once (reused only within the block via LDS, never re-read across
+    // blocks), so load it non-temporally to avoid polluting L2 -- the read-side
+    // twin of the non-temporal C store.
+    return nt_load_c(p);
+}
+
 ${next.body()}
diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
index 6cabeb6..36783c0 100644
--- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
@@ -3,6 +3,8 @@
 <%
 mx = partition(A, into=msplit, by='rows')
 bchunks = chunk(bix, bsz)
+ntload = context.get('ntload', False)
+bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]')
 %>
 
 __global__ __launch_bounds__(${blockx*msplit}) void
@@ -34,7 +36,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     {
   % for kx in bchunks[0]:
     % if loop.index % msplit == cid:
-        bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+        bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)};
     % endif
   % endfor
 
@@ -64,7 +66,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if not loop.parent.last:
       % for kx in bchunks[bb + 1]:
         % if loop.index % msplit == cid:
-        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)};
         % endif
       % endfor
     % endif
diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako
index 35d336e..30f2c57 100644
--- a/gimmik/kernels/hip/bstream-msplit.mako
+++ b/gimmik/kernels/hip/bstream-msplit.mako
@@ -3,6 +3,8 @@
 <%
 mx = partition(A, into=msplit, by='rows')
 bchunks = chunk(bix, bsz)
+ntload = context.get('ntload', False)
+bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]')
 %>
 
 __global__ __launch_bounds__(${blockx*msplit}) void
@@ -34,7 +36,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     {
   % for kx in bchunks[0]:
     % if loop.index % msplit == cid:
-        bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+        bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)};
     % endif
   % endfor
     }
@@ -51,7 +53,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if not loop.parent.last:
       % for kx in bchunks[bb + 1]:
         % if loop.index % msplit == cid:
-        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
+        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)};
         % endif
       % endfor
     % endif

From 280e948c90da7643b511c5064ef9f51c784d27ba Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Wed, 24 Jun 2026 22:39:34 -0500
Subject: [PATCH 10/15] Use non-temporal B loads by default for HIP

---
 gimmik/hip.py                                 | 17 ----
 gimmik/kernels/hip/base.mako                  | 94 +++++--------------
 .../kernels/hip/bstream-msplit-preload-c.mako | 16 ++--
 gimmik/kernels/hip/bstream-msplit.mako        | 16 ++--
 gimmik/kernels/hip/bstream-preload-c.mako     | 12 +--
 gimmik/kernels/hip/bstream.mako               | 12 +--
 .../kernels/hip/cstream-ksplit-preload-c.mako | 14 +--
 gimmik/kernels/hip/cstream-ksplit.mako        | 10 +-
 gimmik/kernels/hip/cstream-preload-c.mako     | 12 +--
 gimmik/kernels/hip/cstream.mako               |  8 +-
 10 files changed, 72 insertions(+), 139 deletions(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index 860ed6c..57fa394 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -78,13 +78,6 @@ def emit(name, args, meta):
                 } | wmeta
                 yield from emit('bstream-msplit', args, meta)
 
-                # non-temporal B-load variant: B is read-once -> skip L2 alloc
-                nmeta = {
-                    'block': (blkx, ms, 1), 'shared': shared,
-                    'desc': f'bstream-msplit-ntb/{wpfx}m{ms}-b{bsz}-x{blkx}'
-                } | wmeta
-                yield from emit('bstream-msplit', args | {'ntload': True}, nmeta)
-
             for ks in ksplits:
                 # k-split B loading, C streaming kernel
                 args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
@@ -119,16 +112,6 @@ def emit(name, args, meta):
                 } | wmeta
                 yield from emit('bstream-msplit-preload-c', args, meta)
 
-                # non-temporal B-load variant
-                nmeta = {
-                    'block': (blkx, ms, 1), 'shared': shared,
-                    'desc': (
-                        f'bstream-msplit-preload-c-ntb/'
-                        f'{wpfx}m{ms}-b{bsz}-x{blkx}'
-                    )
-                } | wmeta
-                yield from emit('bstream-msplit-preload-c', args | {'ntload': True}, nmeta)
-
             for ks in ksplits:
                 # k-split B loading, C preloading, C streaming kernel
                 args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako
index 5291538..95e622f 100644
--- a/gimmik/kernels/hip/base.mako
+++ b/gimmik/kernels/hip/base.mako
@@ -1,77 +1,34 @@
 % if dtype.endswith('4'):
-static inline __device__ ${dtype} make_zero()
-{ return make_${dtype}(0, 0, 0, 0); }
-% elif dtype.endswith('2'):
-static inline __device__ ${dtype} make_zero()
-{ return make_${dtype}(0, 0); }
-% else:
-static inline __device__ ${dtype} make_zero()
-{ return 0; }
-% endif
+inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b)
+{ return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
 
-% if width == 1:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype} a, ${dtype} b)
-{
-    return a*b;
-}
+inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b)
+{ return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); }
 
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return a + b;
-}
-
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return acc + a*b;
-}
-% elif width == 2:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y);
-}
+inline __device__ void operator+=(${dtype} &a, ${dtype} b)
+{ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
 
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y);
-}
+inline __device__ ${dtype} make_zero()
+{ return make_${dtype}(0, 0, 0, 0); }
+% elif dtype.endswith('2'):
+inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b)
+{ return make_${dtype}(a.x + b.x, a.y + b.y); }
 
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y);
-}
-% elif width == 4:
-static inline __device__ ${dtype}
-gimmik_vmul(${dtype[:-1]} a, ${dtype} b)
-{
-    return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w);
-}
+inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b)
+{ return make_${dtype}(a*b.x, a*b.y); }
 
-static inline __device__ ${dtype}
-gimmik_vadd(${dtype} a, ${dtype} b)
-{
-    return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
+inline __device__ void operator+=(${dtype} &a, ${dtype} b)
+{ a.x += b.x; a.y += b.y; }
 
-static inline __device__ ${dtype}
-gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b)
-{
-    // Keep the multiply-add expression visible to the compiler.
-    return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w);
-}
+inline __device__ ${dtype} make_zero()
+{ return make_${dtype}(0, 0); }
 % else:
-#error "HIP vector helpers only support width=2 or width=4"
+inline __device__ ${dtype} make_zero()
+{ return 0; }
 % endif
 
 static inline __device__ void
-nt_store_c(${dtype}* p, ${dtype} v)
+nt_store(${dtype}* p, ${dtype} v)
 {
 % if dtype.endswith('4'):
     __builtin_nontemporal_store(v.x, &p->x);
@@ -87,7 +44,7 @@ nt_store_c(${dtype}* p, ${dtype} v)
 }
 
 static inline __device__ ${dtype}
-nt_load_c(const ${dtype}* p)
+nt_load(const ${dtype}* p)
 {
 % if dtype.endswith('4'):
     return make_${dtype}(__builtin_nontemporal_load(&p->x),
@@ -105,22 +62,19 @@ nt_load_c(const ${dtype}* p)
 static inline __device__ void
 store_c(${dtype}* p, ${dtype} v)
 {
-    nt_store_c(p, v);
+    nt_store(p, v);
 }
 
 static inline __device__ ${dtype}
 load_c(const ${dtype}* p)
 {
-    return nt_load_c(p);
+    return nt_load(p);
 }
 
 static inline __device__ ${dtype}
 load_b(const ${dtype}* p)
 {
-    // B is read-once (reused only within the block via LDS, never re-read across
-    // blocks), so load it non-temporally to avoid polluting L2 -- the read-side
-    // twin of the non-temporal C store.
-    return nt_load_c(p);
+    return nt_load(p);
 }
 
 ${next.body()}
diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
index 36783c0..e58fa43 100644
--- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
@@ -3,8 +3,6 @@
 <%
 mx = partition(A, into=msplit, by='rows')
 bchunks = chunk(bix, bsz)
-ntload = context.get('ntload', False)
-bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]')
 %>
 
 __global__ __launch_bounds__(${blockx*msplit}) void
@@ -36,7 +34,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     {
   % for kx in bchunks[0]:
     % if loop.index % msplit == cid:
-        bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)};
+        bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
     % endif
   % endfor
 
@@ -47,7 +45,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       % if beta == 1:
         csub[${j}] = load_c(&c[i + ${jx}*ldc]);
       % else:
-        csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${jx}*ldc]));
+        csub[${j}] = ${beta}*load_c(&c[i + ${jx}*ldc]);
       % endif
     % endif
   % endfor
@@ -66,7 +64,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if not loop.parent.last:
       % for kx in bchunks[bb + 1]:
         % if loop.index % msplit == cid:
-        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)};
+        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
         % endif
       % endfor
     % endif
@@ -76,12 +74,12 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       % for j, jx in enumerate(A[mcx, kx]):
         % if beta == 0:
           % if jx != 0 and kx == afix[mcx[j]]:
-        csub[${j}] = gimmik_vmul(${jx}, bv);
+        csub[${j}] = ${jx}*bv;
           % elif jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        csub[${j}] += ${jx}*bv;
           % endif
         % elif jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        csub[${j}] += ${jx}*bv;
         % endif
         ## If we're done with this dot product then store to global
         % if kx == alix[mcx[j]]:
@@ -95,7 +93,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         % if jx == -1 and j % msplit == cid and beta == 0:
         store_c(&c[i + ${j}*ldc], make_zero());
         % elif jx == -1 and j % msplit == cid and beta != 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
         % endif
       % endfor
     % endif
diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako
index 30f2c57..0681783 100644
--- a/gimmik/kernels/hip/bstream-msplit.mako
+++ b/gimmik/kernels/hip/bstream-msplit.mako
@@ -3,8 +3,6 @@
 <%
 mx = partition(A, into=msplit, by='rows')
 bchunks = chunk(bix, bsz)
-ntload = context.get('ntload', False)
-bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]')
 %>
 
 __global__ __launch_bounds__(${blockx*msplit}) void
@@ -36,7 +34,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     {
   % for kx in bchunks[0]:
     % if loop.index % msplit == cid:
-        bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)};
+        bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
     % endif
   % endfor
     }
@@ -53,7 +51,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if not loop.parent.last:
       % for kx in bchunks[bb + 1]:
         % if loop.index % msplit == cid:
-        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)};
+        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
         % endif
       % endfor
     % endif
@@ -62,17 +60,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
       % for j, jx in enumerate(A[mcx, kx]):
         % if jx != 0 and kx == afix[mcx[j]]:
-        csub[${j}] = gimmik_vmul(${jx}, bv);
+        csub[${j}] = ${jx}*bv;
         % elif jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        csub[${j}] += ${jx}*bv;
         % endif
         ## If we're done with this dot product then store to global
         % if kx == alix[mcx[j]] and beta == 0:
         store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
         % elif kx == alix[mcx[j]] and beta == 1:
-        store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(load_c(&c[i + ${mcx[j]}*ldc]), csub[${j}]));
+        store_c(&c[i + ${mcx[j]}*ldc], load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]);
         % elif kx == alix[mcx[j]]:
-        store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${mcx[j]}*ldc]))));
+        store_c(&c[i + ${mcx[j]}*ldc], csub[${j}] + ${beta}*load_c(&c[i + ${mcx[j]}*ldc]));
         % endif
       % endfor
     % endfor
@@ -82,7 +80,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         % if jx == -1 and j % msplit == cid and beta == 0:
         store_c(&c[i + ${j}*ldc], make_zero());
         % elif jx == -1 and j % msplit == cid and beta != 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
         % endif
       % endfor
     % endif
diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako
index 095be83..a7140c6 100644
--- a/gimmik/kernels/hip/bstream-preload-c.mako
+++ b/gimmik/kernels/hip/bstream-preload-c.mako
@@ -31,7 +31,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % if beta == 1:
         csub[${j}] = load_c(&c[i + ${j}*ldc]);
     % else:
-        csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]));
+        csub[${j}] = ${beta}*load_c(&c[i + ${j}*ldc]);
     % endif
   % endif
 % endfor
@@ -39,16 +39,16 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 
 ## Iterate through the used rows of B
 % for kx in bix:
-        bv = b[i + ${kx}*ldb];
+        bv = load_b(&b[i + ${kx}*ldb]);
   % for j, jx in enumerate(A[:, kx]):
     % if beta == 0:
       % if jx != 0 and kx == afix[j]:
-        csub[${j}] = gimmik_vmul(${jx}, bv);
+        csub[${j}] = ${jx}*bv;
       % elif jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        csub[${j}] += ${jx}*bv;
       % endif
     % elif jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        csub[${j}] += ${jx}*bv;
     % endif
     ##
     % if kx == alix[j]:
@@ -62,7 +62,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
   % if jx == -1 and beta == 0:
         store_c(&c[i + ${j}*ldc], make_zero());
   % elif jx == -1 and beta != 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
   % endif
 % endfor
     }
diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako
index 427dffc..10df790 100644
--- a/gimmik/kernels/hip/bstream.mako
+++ b/gimmik/kernels/hip/bstream.mako
@@ -26,20 +26,20 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 
 ## Iterare through the used rows of B
 % for kx in bix:
-        bv = b[i + ${kx}*ldb];
+        bv = load_b(&b[i + ${kx}*ldb]);
   % for j, jx in enumerate(A[:, kx]):
     % if jx != 0 and kx == afix[j]:
-        csub[${j}] = gimmik_vmul(${jx}, bv);
+        csub[${j}] = ${jx}*bv;
     % elif jx != 0:
-        csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv);
+        csub[${j}] += ${jx}*bv;
     % endif
     ##
     % if kx == alix[j] and beta == 0:
         store_c(&c[i + ${j}*ldc], csub[${j}]);
     % elif kx == alix[j] and beta == 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), csub[${j}]));
+        store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + csub[${j}]);
     % elif kx == alix[j]:
-        store_c(&c[i + ${j}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))));
+        store_c(&c[i + ${j}*ldc], csub[${j}] + ${beta}*load_c(&c[i + ${j}*ldc]));
     % endif
   % endfor
 % endfor
@@ -49,7 +49,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
   % if jx == -1 and beta == 0:
         store_c(&c[i + ${j}*ldc], make_zero());
   % elif jx == -1 and beta != 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
   % endif
 % endfor
     }
diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
index 4550700..b922666 100644
--- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
@@ -48,9 +48,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       has_dotp = A[j].any()
       if nzixs:
           first_l_idx, first_kx = nzixs[0]
-          dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])"
+          dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]"
           for l_idx, kx in nzixs[1:]:
-              dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])"
+              dotex = f"{dotex} + {A[j, kx]}*bv[{l_idx}]"
       else:
           dotex = 'make_zero()'
       %>
@@ -61,10 +61,10 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         cv[${loop.index // ksplit}] = dotp;
         % elif beta == 1 and has_dotp:
         cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]);
-        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
+        cv[${loop.index // ksplit}] += dotp;
         % elif has_dotp:
-        cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]));
-        cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp);
+        cv[${loop.index // ksplit}] = ${beta}*load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] += dotp;
         % endif
       ## Save to shared memory
       % else:
@@ -85,7 +85,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         <%
         sum_expr = f"cv[{loop.index // ksplit}]"
         for s_idx in range(ksplit - 1):
-            sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
+            sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]"
         %>
         % if beta == 0:
         dotp = ${sum_expr};
@@ -97,7 +97,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         dotp = ${sum_expr};
         store_c(&c[i + ${j}*ldc], dotp);
         % elif beta != 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
         % endif
       % endif
     % endfor
diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako
index 9c5fc09..5fc6711 100644
--- a/gimmik/kernels/hip/cstream-ksplit.mako
+++ b/gimmik/kernels/hip/cstream-ksplit.mako
@@ -47,9 +47,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
       if nzixs:
           first_l_idx, first_kx = nzixs[0]
-          dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])"
+          dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]"
           for l_idx, kx in nzixs[1:]:
-              dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])"
+              dotex = f"{dotex} + {A[j, kx]}*bv[{l_idx}]"
       else:
           dotex = 'make_zero()'
       %>
@@ -75,15 +75,15 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         <%
         sum_expr = f"cv[{loop.index // ksplit}]"
         for s_idx in range(ksplit - 1):
-            sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])"
+            sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]"
         %>
         dotp = ${sum_expr};
         % if beta == 0:
         store_c(&c[i + ${j}*ldc], dotp);
         % elif beta == 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp));
+        store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp);
         % else:
-        store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))));
+        store_c(&c[i + ${j}*ldc], dotp + ${beta}*load_c(&c[i + ${j}*ldc]));
         % endif
       % endif
     % endfor
diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako
index a8d7e31..c9a83b0 100644
--- a/gimmik/kernels/hip/cstream-preload-c.mako
+++ b/gimmik/kernels/hip/cstream-preload-c.mako
@@ -30,9 +30,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
   nzixs = [kx for kx, val in enumerate(jx) if val != 0]
   if nzixs:
       first_kx = nzixs[0]
-      dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])"
+      dotex = f"{jx[first_kx]}*b[i + {first_kx}*ldb]"
       for kx in nzixs[1:]:
-          dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])"
+          dotex = f"{dotex} + {jx[kx]}*b[i + {kx}*ldb]"
   else:
       dotex = 'make_zero()'
   %>
@@ -42,18 +42,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         store_c(&c[i + ${j}*ldc], dotp);
     % elif beta == 1:
         dotp = load_c(&c[i + ${j}*ldc]);
-        dotp = gimmik_vadd(dotp, ${dotex});
+        dotp += ${dotex};
         store_c(&c[i + ${j}*ldc], dotp);
     % else:
-        dotp = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]));
-        dotp = gimmik_vadd(dotp, ${dotex});
+        dotp = ${beta}*load_c(&c[i + ${j}*ldc]);
+        dotp += ${dotex};
         store_c(&c[i + ${j}*ldc], dotp);
     % endif
   % else:
     % if beta == 0:
         store_c(&c[i + ${j}*ldc], make_zero());
     % elif beta != 1:
-        store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])));
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
     % endif
   % endif
 % endfor
diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako
index 1b7b312..4ea995e 100644
--- a/gimmik/kernels/hip/cstream.mako
+++ b/gimmik/kernels/hip/cstream.mako
@@ -30,9 +30,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
   nzixs = [kx for kx, val in enumerate(jx) if val != 0]
   if nzixs:
       first_kx = nzixs[0]
-      dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])"
+      dotex = f"{jx[first_kx]}*b[i + {first_kx}*ldb]"
       for kx in nzixs[1:]:
-          dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])"
+          dotex = f"{dotex} + {jx[kx]}*b[i + {kx}*ldb]"
   else:
       dotex = 'make_zero()'
   %>
@@ -43,9 +43,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
   % if beta == 0:
         store_c(&c[i + ${j}*ldc], dotp);
   % elif beta == 1 and nzixs:
-        store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp));
+        store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp);
   % else:
-        store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))));
+        store_c(&c[i + ${j}*ldc], dotp + ${beta}*load_c(&c[i + ${j}*ldc]));
   % endif
 % endfor
     }

From c06216d0e09f76313acf0cf53cccd5529d7f6eab Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Wed, 24 Jun 2026 23:00:57 -0500
Subject: [PATCH 11/15] Make HIP preload-C a template option

---
 gimmik/hip.py                                 |  11 +-
 .../kernels/hip/bstream-msplit-preload-c.mako | 104 -----------------
 gimmik/kernels/hip/bstream-msplit.mako        |  22 +++-
 gimmik/kernels/hip/bstream-preload-c.mako     |  69 -----------
 gimmik/kernels/hip/bstream.mako               |  25 +++-
 .../kernels/hip/cstream-ksplit-preload-c.mako | 108 ------------------
 gimmik/kernels/hip/cstream-ksplit.mako        |  28 ++++-
 gimmik/kernels/hip/cstream-preload-c.mako     |  61 ----------
 gimmik/kernels/hip/cstream.mako               |  25 +++-
 9 files changed, 96 insertions(+), 357 deletions(-)
 delete mode 100644 gimmik/kernels/hip/bstream-msplit-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/bstream-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/cstream-ksplit-preload-c.mako
 delete mode 100644 gimmik/kernels/hip/cstream-preload-c.mako

diff --git a/gimmik/hip.py b/gimmik/hip.py
index 57fa394..1fceb01 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -19,6 +19,9 @@ def emit(name, args, meta):
             if threads <= max_block_threads and shared <= max_shared:
                 yield (name, args, meta)
 
+        def emit_preload(name, args, meta):
+            yield from emit(name, args | {'preload': True}, meta)
+
         blkx = self.basemeta['block'][0]
 
         # B loading, C streaming kernel
@@ -92,12 +95,12 @@ def emit(name, args, meta):
             args = {'blockx': blkx} | wargs
             meta = {'block': (blkx, 1, 1),
                     'desc': f'cstream-preload-c/{wpfx}x{blkx}'} | wmeta
-            yield from emit('cstream-preload-c', args, meta)
+            yield from emit_preload('cstream', args, meta)
 
             # B streaming, C preloading, C accumulation kernel
             meta = {'block': (blkx, 1, 1),
                     'desc': f'bstream-preload-c/{wpfx}x{blkx}'} | wmeta
-            yield from emit('bstream-preload-c', args, meta)
+            yield from emit_preload('bstream', args, meta)
 
             for ms in msplits:
                 # m-split B streaming, C preloading, C accumulation kernel
@@ -110,7 +113,7 @@ def emit(name, args, meta):
                         f'{wpfx}m{ms}-b{bsz}-x{blkx}'
                     )
                 } | wmeta
-                yield from emit('bstream-msplit-preload-c', args, meta)
+                yield from emit_preload('bstream-msplit', args, meta)
 
             for ks in ksplits:
                 # k-split B loading, C preloading, C streaming kernel
@@ -123,7 +126,7 @@ def emit(name, args, meta):
                         f'{wpfx}k{ks}-c{csz}-x{blkx}'
                     )
                 } | wmeta
-                yield from emit('cstream-ksplit-preload-c', args, meta)
+                yield from emit_preload('cstream-ksplit', args, meta)
 
     def _process_meta(self, meta):
         if self.n is not None:
diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako
deleted file mode 100644
index e58fa43..0000000
--- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako
+++ /dev/null
@@ -1,104 +0,0 @@
-<%inherit file='base'/>
-
-<%
-mx = partition(A, into=msplit, by='rows')
-bchunks = chunk(bix, bsz)
-%>
-
-__global__ __launch_bounds__(${blockx*msplit}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    ${dtype} bv, csub[${-(-m // msplit)}];
-    __shared__ ${dtype} bsub[2][${bsz}][${blockx}];
-
-## Fill the initial shared memory block
-% for cid in range(msplit):
-    if (i < n && threadIdx.y == ${cid})
-    {
-  % for kx in bchunks[0]:
-    % if loop.index % msplit == cid:
-        bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
-    % endif
-  % endfor
-
-  % if beta != 0:
-  ## Preload C values for active rows owned by this m-split lane
-  % for j, jx in enumerate(mx[cid]):
-    % if afix[jx] != -1:
-      % if beta == 1:
-        csub[${j}] = load_c(&c[i + ${jx}*ldc]);
-      % else:
-        csub[${j}] = ${beta}*load_c(&c[i + ${jx}*ldc]);
-      % endif
-    % endif
-  % endfor
-  % endif
-    }
-% endfor
-    __syncthreads();
-
-## Iterate over each row-chunk of B
-% for bb in range(len(bchunks)):
-  ## Iterate over each row-chunk of C
-  % for cid, mcx in enumerate(mx):
-    if (i < n && threadIdx.y == ${cid})
-    {
-    ## Start filling the next shared memory block
-    % if not loop.parent.last:
-      % for kx in bchunks[bb + 1]:
-        % if loop.index % msplit == cid:
-        bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
-        % endif
-      % endfor
-    % endif
-    ## Accumulate our dot products
-    % for kx in bchunks[bb]:
-        bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
-      % for j, jx in enumerate(A[mcx, kx]):
-        % if beta == 0:
-          % if jx != 0 and kx == afix[mcx[j]]:
-        csub[${j}] = ${jx}*bv;
-          % elif jx != 0:
-        csub[${j}] += ${jx}*bv;
-          % endif
-        % elif jx != 0:
-        csub[${j}] += ${jx}*bv;
-        % endif
-        ## If we're done with this dot product then store to global
-        % if kx == alix[mcx[j]]:
-        store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
-        % endif
-      % endfor
-    % endfor
-    ## Handle rows of A which are all zero
-    % if loop.parent.last:
-      % for j, jx in enumerate(afix):
-        % if jx == -1 and j % msplit == cid and beta == 0:
-        store_c(&c[i + ${j}*ldc], make_zero());
-        % elif jx == -1 and j % msplit == cid and beta != 1:
-        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
-        % endif
-      % endfor
-    % endif
-    }
-  % endfor
-    __syncthreads();
-% endfor
-}
diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako
index 0681783..52853f4 100644
--- a/gimmik/kernels/hip/bstream-msplit.mako
+++ b/gimmik/kernels/hip/bstream-msplit.mako
@@ -3,6 +3,7 @@
 <%
 mx = partition(A, into=msplit, by='rows')
 bchunks = chunk(bix, bsz)
+preload = context.get('preload', False)
 %>
 
 __global__ __launch_bounds__(${blockx*msplit}) void
@@ -37,6 +38,19 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]);
     % endif
   % endfor
+
+  % if preload and beta != 0:
+  ## Preload C values for active rows owned by this m-split lane
+  % for j, jx in enumerate(mx[cid]):
+    % if afix[jx] != -1:
+      % if beta == 1:
+        csub[${j}] = load_c(&c[i + ${jx}*ldc]);
+      % else:
+        csub[${j}] = ${beta}*load_c(&c[i + ${jx}*ldc]);
+      % endif
+    % endif
+  % endfor
+  % endif
     }
 % endfor
     __syncthreads();
@@ -59,13 +73,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     % for kx in bchunks[bb]:
         bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
       % for j, jx in enumerate(A[mcx, kx]):
-        % if jx != 0 and kx == afix[mcx[j]]:
+        % if preload and beta != 0 and jx != 0:
+        csub[${j}] += ${jx}*bv;
+        % elif jx != 0 and kx == afix[mcx[j]]:
         csub[${j}] = ${jx}*bv;
         % elif jx != 0:
         csub[${j}] += ${jx}*bv;
         % endif
         ## If we're done with this dot product then store to global
-        % if kx == alix[mcx[j]] and beta == 0:
+        % if preload and kx == alix[mcx[j]]:
+        store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
+        % elif kx == alix[mcx[j]] and beta == 0:
         store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]);
         % elif kx == alix[mcx[j]] and beta == 1:
         store_c(&c[i + ${mcx[j]}*ldc], load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]);
diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako
deleted file mode 100644
index a7140c6..0000000
--- a/gimmik/kernels/hip/bstream-preload-c.mako
+++ /dev/null
@@ -1,69 +0,0 @@
-<%inherit file='base'/>
-
-__global__ __launch_bounds__(${blockx}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i < n)
-    {
-        ${dtype} bv, csub[${m}];
-
-% if beta != 0:
-## Preload C values for rows which will receive a non-zero dot product
-% for j, jx in enumerate(afix):
-  % if jx != -1:
-    % if beta == 1:
-        csub[${j}] = load_c(&c[i + ${j}*ldc]);
-    % else:
-        csub[${j}] = ${beta}*load_c(&c[i + ${j}*ldc]);
-    % endif
-  % endif
-% endfor
-% endif
-
-## Iterate through the used rows of B
-% for kx in bix:
-        bv = load_b(&b[i + ${kx}*ldb]);
-  % for j, jx in enumerate(A[:, kx]):
-    % if beta == 0:
-      % if jx != 0 and kx == afix[j]:
-        csub[${j}] = ${jx}*bv;
-      % elif jx != 0:
-        csub[${j}] += ${jx}*bv;
-      % endif
-    % elif jx != 0:
-        csub[${j}] += ${jx}*bv;
-    % endif
-    ##
-    % if kx == alix[j]:
-        store_c(&c[i + ${j}*ldc], csub[${j}]);
-    % endif
-  % endfor
-% endfor
-
-## Handle rows of A which are all zero
-% for j, jx in enumerate(afix):
-  % if jx == -1 and beta == 0:
-        store_c(&c[i + ${j}*ldc], make_zero());
-  % elif jx == -1 and beta != 1:
-        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
-  % endif
-% endfor
-    }
-}
diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako
index 10df790..1e7a70b 100644
--- a/gimmik/kernels/hip/bstream.mako
+++ b/gimmik/kernels/hip/bstream.mako
@@ -1,5 +1,7 @@
 <%inherit file='base'/>
 
+<% preload = context.get('preload', False) %>
+
 __global__ __launch_bounds__(${blockx}) void
 % if n is None:
 ${kname}(int n,
@@ -24,17 +26,34 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     {
         ${dtype} bv, csub[${m}];
 
-## Iterare through the used rows of B
+% if preload and beta != 0:
+## Preload C values for rows which will receive a non-zero dot product
+% for j, jx in enumerate(afix):
+  % if jx != -1:
+    % if beta == 1:
+        csub[${j}] = load_c(&c[i + ${j}*ldc]);
+    % else:
+        csub[${j}] = ${beta}*load_c(&c[i + ${j}*ldc]);
+    % endif
+  % endif
+% endfor
+% endif
+
+## Iterate through the used rows of B
 % for kx in bix:
         bv = load_b(&b[i + ${kx}*ldb]);
   % for j, jx in enumerate(A[:, kx]):
-    % if jx != 0 and kx == afix[j]:
+    % if preload and beta != 0 and jx != 0:
+        csub[${j}] += ${jx}*bv;
+    % elif jx != 0 and kx == afix[j]:
         csub[${j}] = ${jx}*bv;
     % elif jx != 0:
         csub[${j}] += ${jx}*bv;
     % endif
     ##
-    % if kx == alix[j] and beta == 0:
+    % if preload and kx == alix[j]:
+        store_c(&c[i + ${j}*ldc], csub[${j}]);
+    % elif kx == alix[j] and beta == 0:
         store_c(&c[i + ${j}*ldc], csub[${j}]);
     % elif kx == alix[j] and beta == 1:
         store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + csub[${j}]);
diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
deleted file mode 100644
index b922666..0000000
--- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako
+++ /dev/null
@@ -1,108 +0,0 @@
-<%inherit file='base'/>
-
-<%
-kparts = partition(A, ksplit, by='cols')
-cchunks = chunk(range(m), csz)
-loaded = set()
-%>
-
-__global__ __launch_bounds__(${blockx*ksplit}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
-    __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
-
-## Iterate over the row-partitions of C
-% for cchunk in cchunks:
-  ## Iterate over the row-partitions of B
-  % for bid, kbx in enumerate(kparts):
-    if (i < n && threadIdx.y == ${bid})
-    {
-    ## Evaluate our partial dot products
-    % for j in cchunk:
-      ## Load in any missing parts of B
-      % for kx in kbx:
-        % if A[j, kx] != 0 and kx not in loaded:
-        bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
-        % endif
-      % endfor
-      <%
-      nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
-      has_dotp = A[j].any()
-      if nzixs:
-          first_l_idx, first_kx = nzixs[0]
-          dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]"
-          for l_idx, kx in nzixs[1:]:
-              dotex = f"{dotex} + {A[j, kx]}*bv[{l_idx}]"
-      else:
-          dotex = 'make_zero()'
-      %>
-        dotp = ${dotex};
-      ## Save to a register
-      % if loop.index % ksplit == bid:
-        % if beta == 0:
-        cv[${loop.index // ksplit}] = dotp;
-        % elif beta == 1 and has_dotp:
-        cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]);
-        cv[${loop.index // ksplit}] += dotp;
-        % elif has_dotp:
-        cv[${loop.index // ksplit}] = ${beta}*load_c(&c[i + ${j}*ldc]);
-        cv[${loop.index // ksplit}] += dotp;
-        % endif
-      ## Save to shared memory
-      % else:
-        csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
-      % endif
-    % endfor
-    }
-  % endfor
-    __syncthreads();
-  ## Iterate over the column-partitions of B
-  % for bid, kbx in enumerate(kparts):
-    if (i < n && threadIdx.y == ${bid})
-    {
-    ## Sum and output the final set of dot products
-    % for j in cchunk:
-      % if loop.index % ksplit == bid:
-        <% has_dotp = A[j].any() %>
-        <%
-        sum_expr = f"cv[{loop.index // ksplit}]"
-        for s_idx in range(ksplit - 1):
-            sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]"
-        %>
-        % if beta == 0:
-        dotp = ${sum_expr};
-        store_c(&c[i + ${j}*ldc], dotp);
-        % elif beta == 1 and has_dotp:
-        dotp = ${sum_expr};
-        store_c(&c[i + ${j}*ldc], dotp);
-        % elif beta != 1 and has_dotp:
-        dotp = ${sum_expr};
-        store_c(&c[i + ${j}*ldc], dotp);
-        % elif beta != 1:
-        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
-        % endif
-      % endif
-    % endfor
-    }
-  % endfor
-    __syncthreads();
-% endfor
-}
diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako
index 5fc6711..12c59ba 100644
--- a/gimmik/kernels/hip/cstream-ksplit.mako
+++ b/gimmik/kernels/hip/cstream-ksplit.mako
@@ -4,6 +4,7 @@
 kparts = partition(A, ksplit, by='cols')
 cchunks = chunk(range(m), csz)
 loaded = set()
+preload = context.get('preload', False)
 %>
 
 __global__ __launch_bounds__(${blockx*ksplit}) void
@@ -45,6 +46,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       % endfor
       <%
       nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]]
+      has_dotp = A[j].any()
       if nzixs:
           first_l_idx, first_kx = nzixs[0]
           dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]"
@@ -56,7 +58,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
         dotp = ${dotex};
       ## Save to a register
       % if loop.index % ksplit == bid:
+        % if preload and beta == 0:
         cv[${loop.index // ksplit}] = dotp;
+        % elif preload and beta == 1 and has_dotp:
+        cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] += dotp;
+        % elif preload and has_dotp:
+        cv[${loop.index // ksplit}] = ${beta}*load_c(&c[i + ${j}*ldc]);
+        cv[${loop.index // ksplit}] += dotp;
+        % elif not preload:
+        cv[${loop.index // ksplit}] = dotp;
+        % endif
       ## Save to shared memory
       % else:
         csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
@@ -72,17 +84,31 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
     ## Sum and output the final set of dot products
     % for j in cchunk:
       % if loop.index % ksplit == bid:
+        <% has_dotp = A[j].any() %>
         <%
         sum_expr = f"cv[{loop.index // ksplit}]"
         for s_idx in range(ksplit - 1):
             sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]"
         %>
+        % if preload and beta == 0:
+        dotp = ${sum_expr};
+        store_c(&c[i + ${j}*ldc], dotp);
+        % elif preload and beta == 1 and has_dotp:
+        dotp = ${sum_expr};
+        store_c(&c[i + ${j}*ldc], dotp);
+        % elif preload and beta != 1 and has_dotp:
+        dotp = ${sum_expr};
+        store_c(&c[i + ${j}*ldc], dotp);
+        % elif preload and beta != 1:
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
+        % elif beta == 0:
         dotp = ${sum_expr};
-        % if beta == 0:
         store_c(&c[i + ${j}*ldc], dotp);
         % elif beta == 1:
+        dotp = ${sum_expr};
         store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp);
         % else:
+        dotp = ${sum_expr};
         store_c(&c[i + ${j}*ldc], dotp + ${beta}*load_c(&c[i + ${j}*ldc]));
         % endif
       % endif
diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako
deleted file mode 100644
index c9a83b0..0000000
--- a/gimmik/kernels/hip/cstream-preload-c.mako
+++ /dev/null
@@ -1,61 +0,0 @@
-<%inherit file='base'/>
-
-<% ksplit = 2 if m < 36 else 1 %>
-
-__global__ __launch_bounds__(${blockx}) void
-% if n is None:
-${kname}(int n,
-         const ${dtype}* __restrict__ b, int ldb,
-         ${dtype}* __restrict__ c, int ldc)
-{
-  % if width > 1:
-    n = (n + ${width} - 1) / ${width};
-    ldb /= ${width};
-    ldc /= ${width};
-  % endif
-% else:
-${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
-{
-    const int n = ${-(-n // width)};
-    const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
-    const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
-% endif
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-    ${dtype} dotp;
-
-    if (i < n)
-    {
-% for j, jx in enumerate(A):
-  <%
-  nzixs = [kx for kx, val in enumerate(jx) if val != 0]
-  if nzixs:
-      first_kx = nzixs[0]
-      dotex = f"{jx[first_kx]}*b[i + {first_kx}*ldb]"
-      for kx in nzixs[1:]:
-          dotex = f"{dotex} + {jx[kx]}*b[i + {kx}*ldb]"
-  else:
-      dotex = 'make_zero()'
-  %>
-  % if nzixs:
-    % if beta == 0:
-        dotp = ${dotex};
-        store_c(&c[i + ${j}*ldc], dotp);
-    % elif beta == 1:
-        dotp = load_c(&c[i + ${j}*ldc]);
-        dotp += ${dotex};
-        store_c(&c[i + ${j}*ldc], dotp);
-    % else:
-        dotp = ${beta}*load_c(&c[i + ${j}*ldc]);
-        dotp += ${dotex};
-        store_c(&c[i + ${j}*ldc], dotp);
-    % endif
-  % else:
-    % if beta == 0:
-        store_c(&c[i + ${j}*ldc], make_zero());
-    % elif beta != 1:
-        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
-    % endif
-  % endif
-% endfor
-    }
-}
diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako
index 4ea995e..2ee9574 100644
--- a/gimmik/kernels/hip/cstream.mako
+++ b/gimmik/kernels/hip/cstream.mako
@@ -1,6 +1,8 @@
 <%inherit file='base'/>
 
-<% ksplit = 2 if m < 36 else 1 %>
+<%
+preload = context.get('preload', False)
+%>
 
 __global__ __launch_bounds__(${blockx}) void
 % if n is None:
@@ -37,10 +39,23 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
       dotex = 'make_zero()'
   %>
         dotp = ${dotex};
-  % if not nzixs:
-        dotp = make_zero();
-  % endif
-  % if beta == 0:
+  % if preload and nzixs:
+    % if beta == 0:
+        store_c(&c[i + ${j}*ldc], dotp);
+    % elif beta == 1:
+        dotp = load_c(&c[i + ${j}*ldc]) + dotp;
+        store_c(&c[i + ${j}*ldc], dotp);
+    % else:
+        dotp = ${beta}*load_c(&c[i + ${j}*ldc]) + dotp;
+        store_c(&c[i + ${j}*ldc], dotp);
+    % endif
+  % elif preload:
+    % if beta == 0:
+        store_c(&c[i + ${j}*ldc], make_zero());
+    % elif beta != 1:
+        store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc]));
+    % endif
+  % elif beta == 0:
         store_c(&c[i + ${j}*ldc], dotp);
   % elif beta == 1 and nzixs:
         store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp);

From e014e4d40c1745e2b803e0f85ea3ce92da1aee50 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Wed, 24 Jun 2026 23:21:08 -0500
Subject: [PATCH 12/15] Avoid HIP vector operator+= overloads

---
 gimmik/kernels/hip/base.mako | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako
index 95e622f..d67ee25 100644
--- a/gimmik/kernels/hip/base.mako
+++ b/gimmik/kernels/hip/base.mako
@@ -5,9 +5,6 @@ inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b)
 inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b)
 { return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); }
 
-inline __device__ void operator+=(${dtype} &a, ${dtype} b)
-{ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
-
 inline __device__ ${dtype} make_zero()
 { return make_${dtype}(0, 0, 0, 0); }
 % elif dtype.endswith('2'):
@@ -17,9 +14,6 @@ inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b)
 inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b)
 { return make_${dtype}(a*b.x, a*b.y); }
 
-inline __device__ void operator+=(${dtype} &a, ${dtype} b)
-{ a.x += b.x; a.y += b.y; }
-
 inline __device__ ${dtype} make_zero()
 { return make_${dtype}(0, 0); }
 % else:

From f6bc30882c7fe8c36da58d4763a2b32089b13a49 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Thu, 25 Jun 2026 02:02:37 -0500
Subject: [PATCH 13/15] Prune HIP tuned variants to 12 Reduce the HIP tuned
 kernel search space from 28 variants to 12 and order the remaining variants
 to try common winners earlier.

---
 gimmik/hip.py | 54 ++-------------------------------------------------
 1 file changed, 2 insertions(+), 52 deletions(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index 1fceb01..b55e3fe 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -24,35 +24,17 @@ def emit_preload(name, args, meta):
 
         blkx = self.basemeta['block'][0]
 
-        # B loading, C streaming kernel
-        yield from emit('cstream', {'blockx': blkx}, {})
-
-        # B streaming, C accumulation kernel
-        yield from emit('bstream', {'blockx': blkx}, {})
-
-        # Four-way m-split B streaming, C accumulation kernel
-        ms, bsz, blkx = 4, 24, 64
-        args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
-        meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize}
-        yield from emit('bstream-msplit', args, meta)
-
-        # Two-way k-split B loading, C streaming kernel
-        ks, csz, blkx = 2, 24, 64
-        args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
-        meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
-        yield from emit('cstream-ksplit', args, meta)
-
         # Only emit tuned variants on architectures they have been validated for.
         base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None
         if base_arch not in {'gfx90a', 'gfx942'} or warp_size != 64:
             return
 
         # Tuned HIP variants
-        msplits, ksplits = [4, 8], [2, 4]
+        msplits, ksplits = [8, 4], [4, 2]
         bsz, csz, blkx = 8, 8, 64
         widths = [1]
         if self.aligne is not None and self.aligne % 2 == 0:
-            widths.append(2)
+            widths.insert(0, 2)
 
         for width in widths:
             wargs = ({'dtype': f'{dtype}{width}', 'width': width}
@@ -60,17 +42,6 @@ def emit_preload(name, args, meta):
             wmeta = {'width': width} if width > 1 else {}
             wpfx = f'w{width}-' if width > 1 else ''
 
-            # B loading, C streaming kernel
-            args = {'blockx': blkx} | wargs
-            meta = {'block': (blkx, 1, 1),
-                    'desc': f'cstream/{wpfx}x{blkx}'} | wmeta
-            yield from emit('cstream', args, meta)
-
-            # B streaming, C accumulation kernel
-            meta = {'block': (blkx, 1, 1),
-                    'desc': f'bstream/{wpfx}x{blkx}'} | wmeta
-            yield from emit('bstream', args, meta)
-
             for ms in msplits:
                 # m-split B streaming, C accumulation kernel
                 args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs
@@ -81,27 +52,6 @@ def emit_preload(name, args, meta):
                 } | wmeta
                 yield from emit('bstream-msplit', args, meta)
 
-            for ks in ksplits:
-                # k-split B loading, C streaming kernel
-                args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs
-                shared = (ks - 1)*csz*blkx*dsize*width
-                meta = {
-                    'block': (blkx, ks, 1), 'shared': shared,
-                    'desc': f'cstream-ksplit/{wpfx}k{ks}-c{csz}-x{blkx}'
-                } | wmeta
-                yield from emit('cstream-ksplit', args, meta)
-
-            # B loading, C preloading, C streaming kernel
-            args = {'blockx': blkx} | wargs
-            meta = {'block': (blkx, 1, 1),
-                    'desc': f'cstream-preload-c/{wpfx}x{blkx}'} | wmeta
-            yield from emit_preload('cstream', args, meta)
-
-            # B streaming, C preloading, C accumulation kernel
-            meta = {'block': (blkx, 1, 1),
-                    'desc': f'bstream-preload-c/{wpfx}x{blkx}'} | wmeta
-            yield from emit_preload('bstream', args, meta)
-
             for ms in msplits:
                 # m-split B streaming, C preloading, C accumulation kernel
                 args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs

From a3aee45280334d46dbfb578a03a90494f64f2041 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Thu, 25 Jun 2026 02:19:05 -0500
Subject: [PATCH 14/15] Remove HIP variant arch gate

---
 gimmik/hip.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index b55e3fe..9365938 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -24,11 +24,6 @@ def emit_preload(name, args, meta):
 
         blkx = self.basemeta['block'][0]
 
-        # Only emit tuned variants on architectures they have been validated for.
-        base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None
-        if base_arch not in {'gfx90a', 'gfx942'} or warp_size != 64:
-            return
-
         # Tuned HIP variants
         msplits, ksplits = [8, 4], [4, 2]
         bsz, csz, blkx = 8, 8, 64

From 2c7af9b3dbbe022d1cffe5e4845ead079d398320 Mon Sep 17 00:00:00 2001
From: tomjen12 <tomjen12@amd.com>
Date: Thu, 25 Jun 2026 11:18:27 +0000
Subject: [PATCH 15/15] Restore MI355 HIP baseline variants

---
 gimmik/hip.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/gimmik/hip.py b/gimmik/hip.py
index 9365938..142a799 100644
--- a/gimmik/hip.py
+++ b/gimmik/hip.py
@@ -22,7 +22,21 @@ def emit(name, args, meta):
         def emit_preload(name, args, meta):
             yield from emit(name, args | {'preload': True}, meta)
 
-        blkx = self.basemeta['block'][0]
+        ms, bsz, blkx = 4, 24, 64
+        args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
+        meta = {
+            'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize,
+            'desc': f'bstream-msplit/m{ms}-b{bsz}-x{blkx}'
+        }
+        yield from emit('bstream-msplit', args, meta)
+
+        ks, csz, blkx = 2, 24, 64
+        args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
+        meta = {
+            'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize,
+            'desc': f'cstream-ksplit/k{ks}-c{csz}-x{blkx}'
+        }
+        yield from emit('cstream-ksplit', args, meta)
 
         # Tuned HIP variants
         msplits, ksplits = [8, 4], [4, 2]