From 8e23d63cafcc24d6d5f11bbccc135e4f4ba482d7 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Thu, 18 Jun 2026 00:06:05 -0500 Subject: [PATCH 01/15] Add tuned HIP GiMMiK preload variants --- gimmik/hip.py | 113 ++++++++++++- gimmik/kernels/hip/base.mako | 32 ++++ .../kernels/hip/bstream-msplit-preload-c.mako | 98 +++++++++++ .../hip/bstream-msplit-width-preload-c.mako | 138 +++++++++++++++ gimmik/kernels/hip/bstream-msplit.mako | 10 +- gimmik/kernels/hip/bstream-preload-c.mako | 63 +++++++ .../kernels/hip/bstream-width-preload-c.mako | 103 ++++++++++++ gimmik/kernels/hip/bstream.mako | 12 +- .../kernels/hip/cstream-ksplit-preload-c.mako | 103 ++++++++++++ .../hip/cstream-ksplit-width-preload-c.mako | 157 ++++++++++++++++++ gimmik/kernels/hip/cstream-ksplit.mako | 6 +- gimmik/kernels/hip/cstream-preload-c.mako | 51 ++++++ .../kernels/hip/cstream-width-preload-c.mako | 106 ++++++++++++ gimmik/kernels/hip/cstream.mako | 8 +- 14 files changed, 978 insertions(+), 22 deletions(-) create mode 100644 gimmik/kernels/hip/bstream-msplit-preload-c.mako create mode 100644 gimmik/kernels/hip/bstream-msplit-width-preload-c.mako create mode 100644 gimmik/kernels/hip/bstream-preload-c.mako create mode 100644 gimmik/kernels/hip/bstream-width-preload-c.mako create mode 100644 gimmik/kernels/hip/cstream-ksplit-preload-c.mako create mode 100644 gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako create mode 100644 gimmik/kernels/hip/cstream-preload-c.mako create mode 100644 gimmik/kernels/hip/cstream-width-preload-c.mako diff --git a/gimmik/hip.py b/gimmik/hip.py index a58c8fa..ee38c9f 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -8,23 +8,128 @@ class HIPMatMul(MatMul): basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0} def _kernel_generators(self, dtype, dsize, *, gcn_arch=None, warp_size=64): + max_block_threads = 1024 + max_shared = 64*1024 + + def emit(name, args, meta): + block = meta.get('block', self.basemeta['block']) + shared = meta.get('shared', self.basemeta['shared']) + threads = block[0]*block[1]*block[2] + + if threads <= max_block_threads and shared <= max_shared: + yield (name, args, meta) + # B loading, C streaming kernel - yield ('cstream', {}, {}) + yield from emit('cstream', {}, {}) # B streaming, C accumulation kernel - yield ('bstream', {}, {}) + yield from emit('bstream', {}, {}) # Four-way m-split B streaming, C accumulation kernel ms, bsz, blkx = 4, 24, 64 args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize} - yield ('bstream-msplit', args, meta) + yield from emit('bstream-msplit', args, meta) # Two-way k-split B loading, C streaming kernel ks, csz, blkx = 2, 24, 64 args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} - yield ('cstream-ksplit', args, meta) + yield from emit('cstream-ksplit', args, meta) + + # Tuned HIP variants + msplits, ksplits = [4, 8], [2, 4] + bsz, csz, blkx = 8, 8, 64 + width = 2 if self.aligne is not None and self.aligne % 2 == 0 else 1 + + # B loading, C streaming kernel + args = {'blockx': blkx} + meta = {'block': (blkx, 1, 1), 'desc': f'cstream/x{blkx}'} + yield from emit('cstream', args, meta) + + # B streaming, C accumulation kernel + meta = {'block': (blkx, 1, 1), 'desc': f'bstream/x{blkx}'} + yield from emit('bstream', args, meta) + + for ms in msplits: + # m-split B streaming, C accumulation kernel + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} + shared = 2*bsz*blkx*dsize + meta = {'block': (blkx, ms, 1), 'shared': shared, + 'desc': f'bstream-msplit/m{ms}-b{bsz}-x{blkx}'} + yield from emit('bstream-msplit', args, meta) + + for ks in ksplits: + # k-split B loading, C streaming kernel + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} + shared = (ks - 1)*csz*blkx*dsize + meta = {'block': (blkx, ks, 1), 'shared': shared, + 'desc': f'cstream-ksplit/k{ks}-c{csz}-x{blkx}'} + yield from emit('cstream-ksplit', args, meta) + + # B loading, C preloading, C streaming kernel + args = {'blockx': blkx} + meta = {'block': (blkx, 1, 1), 'desc': f'cstream-preload-c/x{blkx}'} + yield from emit('cstream-preload-c', args, meta) + + # B streaming, C preloading, C accumulation kernel + meta = {'block': (blkx, 1, 1), 'desc': f'bstream-preload-c/x{blkx}'} + yield from emit('bstream-preload-c', args, meta) + + if width > 1: + args = {'dtype': f'{dtype}{width}', 'width': width, + 'blockx': blkx} + meta = {'block': (blkx, 1, 1), 'width': width, + 'desc': f'cstream-width-preload-c/w{width}-x{blkx}'} + yield from emit('cstream-width-preload-c', args, meta) + + meta = {'block': (blkx, 1, 1), 'width': width, + 'desc': f'bstream-width-preload-c/w{width}-x{blkx}'} + yield from emit('bstream-width-preload-c', args, meta) + + for ms in msplits: + # m-split B streaming, C preloading, C accumulation kernel + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} + shared = 2*bsz*blkx*dsize + meta = {'block': (blkx, ms, 1), 'shared': shared, + 'desc': f'bstream-msplit-preload-c/m{ms}-b{bsz}-x{blkx}'} + yield from emit('bstream-msplit-preload-c', args, meta) + + if width > 1: + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx, + 'dtype': f'{dtype}{width}', 'width': width} + meta = { + 'block': (blkx, ms, 1), 'shared': shared*width, + 'width': width, + 'desc': ( + f'bstream-msplit-width-preload-c/w{width}-' + f'm{ms}-b{bsz}-x{blkx}' + ) + } + yield from emit('bstream-msplit-width-preload-c', args, meta) + + for ks in ksplits: + # k-split B loading, C preloading, C streaming kernel + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} + shared = (ks - 1)*csz*blkx*dsize + meta = { + 'block': (blkx, ks, 1), 'shared': shared, + 'desc': f'cstream-ksplit-preload-c/k{ks}-c{csz}-x{blkx}' + } + yield from emit('cstream-ksplit-preload-c', args, meta) + + if width > 1: + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx, + 'dtype': f'{dtype}{width}', 'width': width} + meta = { + 'block': (blkx, ks, 1), 'shared': shared*width, + 'width': width, + 'desc': ( + f'cstream-ksplit-width-preload-c/w{width}-' + f'k{ks}-c{csz}-x{blkx}' + ) + } + yield from emit('cstream-ksplit-width-preload-c', args, meta) def _process_meta(self, meta): if self.n is not None: diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako index 874fbbd..b40a0b9 100644 --- a/gimmik/kernels/hip/base.mako +++ b/gimmik/kernels/hip/base.mako @@ -9,4 +9,36 @@ static inline __device__ ${dtype} make_zero() { return 0; } % endif +static inline __device__ void +nt_store_c(${dtype}* p, ${dtype} v) +{ +% if dtype.endswith('4'): + __builtin_nontemporal_store(v.x, &p->x); + __builtin_nontemporal_store(v.y, &p->y); + __builtin_nontemporal_store(v.z, &p->z); + __builtin_nontemporal_store(v.w, &p->w); +% elif dtype.endswith('2'): + __builtin_nontemporal_store(v.x, &p->x); + __builtin_nontemporal_store(v.y, &p->y); +% else: + __builtin_nontemporal_store(v, p); +% endif +} + +static inline __device__ ${dtype} +nt_load_c(const ${dtype}* p) +{ +% if dtype.endswith('4'): + return make_${dtype}(__builtin_nontemporal_load(&p->x), + __builtin_nontemporal_load(&p->y), + __builtin_nontemporal_load(&p->z), + __builtin_nontemporal_load(&p->w)); +% elif dtype.endswith('2'): + return make_${dtype}(__builtin_nontemporal_load(&p->x), + __builtin_nontemporal_load(&p->y)); +% else: + return __builtin_nontemporal_load(p); +% endif +} + ${next.body()} diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako new file mode 100644 index 0000000..8b6f008 --- /dev/null +++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako @@ -0,0 +1,98 @@ +<%inherit file='base'/> + +<% +mx = partition(A, into=msplit, by='rows') +bchunks = chunk(bix, bsz) +%> + +__global__ __launch_bounds__(${blockx*msplit}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = ((n + ${width} - 1) / ${width}) * ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + int i = blockDim.x*blockIdx.x + threadIdx.x; + + ${dtype} bv, csub[${-(-m // msplit)}]; + __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; + +## Fill the initial shared memory block +% for cid in range(msplit): + if (i < n && threadIdx.y == ${cid}) + { + % for kx in bchunks[0]: + % if loop.index % msplit == cid: + bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + % endif + % endfor + + ## Preload C values for active rows owned by this m-split lane + % for j, jx in enumerate(mx[cid]): + % if afix[jx] != -1: + % if beta == 0: + csub[${j}] = make_zero(); + % elif beta == 1: + csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]); + % else: + csub[${j}] = ${beta}*nt_load_c(&c[i + ${jx}*ldc]); + % endif + % endif + % endfor + } +% endfor + __syncthreads(); + +## Iterate over each row-chunk of B +% for bb in range(len(bchunks)): + ## Iterate over each row-chunk of C + % for cid, mcx in enumerate(mx): + if (i < n && threadIdx.y == ${cid}) + { + ## Start filling the next shared memory block + % if not loop.parent.last: + % for kx in bchunks[bb + 1]: + % if loop.index % msplit == cid: + bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + % endif + % endfor + % endif + ## Accumulate our dot products + % for kx in bchunks[bb]: + bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; + % for j, jx in enumerate(A[mcx, kx]): + % if jx != 0: + csub[${j}] += ${jx}*bv; + % endif + ## If we're done with this dot product then store to global + % if kx == alix[mcx[j]]: + nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); + % endif + % endfor + % endfor + ## Handle rows of A which are all zero + % if loop.parent.last: + % for j, jx in enumerate(afix): + % if jx == -1 and j % msplit == cid and beta == 0: + nt_store_c(&c[i + ${j}*ldc], make_zero()); + % elif jx == -1 and j % msplit == cid and beta != 1: + nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + % endif + % endfor + % endif + } + % endfor + __syncthreads(); +% endfor +} diff --git a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako new file mode 100644 index 0000000..9659db7 --- /dev/null +++ b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako @@ -0,0 +1,138 @@ +<%inherit file='base'/> + +% if width == 2: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); +} +% elif width == 4: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); +} +% else: +#error "bstream_msplit_width_preload_c only supports width=2 or width=4" +% endif + +<% +mx = partition(A, into=msplit, by='rows') +bchunks = chunk(bix, bsz) +%> + +__global__ __launch_bounds__(${blockx*msplit}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = (n + ${width} - 1) / ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + int i = blockDim.x*blockIdx.x + threadIdx.x; + + ${dtype} bv, csub[${-(-m // msplit)}]; + __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; + +## Fill the initial shared memory block +% for cid in range(msplit): + if (i < n && threadIdx.y == ${cid}) + { + % for kx in bchunks[0]: + % if loop.index % msplit == cid: + bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + % endif + % endfor + + ## Preload C values for active rows owned by this m-split lane + % for j, jx in enumerate(mx[cid]): + % if afix[jx] != -1: + % if beta == 0: + csub[${j}] = make_zero(); + % elif beta == 1: + csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]); + % else: + csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${jx}*ldc])); + % endif + % endif + % endfor + } +% endfor + __syncthreads(); + +## Iterate over each row-chunk of B +% for bb in range(len(bchunks)): + ## Iterate over each row-chunk of C + % for cid, mcx in enumerate(mx): + if (i < n && threadIdx.y == ${cid}) + { + ## Start filling the next shared memory block + % if not loop.parent.last: + % for kx in bchunks[bb + 1]: + % if loop.index % msplit == cid: + bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + % endif + % endfor + % endif + ## Accumulate our dot products + % for kx in bchunks[bb]: + bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; + % for j, jx in enumerate(A[mcx, kx]): + % if jx != 0: + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + % endif + ## If we're done with this dot product then store to global + % if kx == alix[mcx[j]]: + nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); + % endif + % endfor + % endfor + ## Handle rows of A which are all zero + % if loop.parent.last: + % for j, jx in enumerate(afix): + % if jx == -1 and j % msplit == cid and beta == 0: + nt_store_c(&c[i + ${j}*ldc], make_zero()); + % elif jx == -1 and j % msplit == cid and beta != 1: + nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); + % endif + % endfor + % endif + } + % endfor + __syncthreads(); +% endfor +} diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako index 6359ca1..6470477 100644 --- a/gimmik/kernels/hip/bstream-msplit.mako +++ b/gimmik/kernels/hip/bstream-msplit.mako @@ -66,11 +66,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % endif ## If we're done with this dot product then store to global % if kx == alix[mcx[j]] and beta == 0: - c[i + ${mcx[j]}*ldc] = csub[${j}]; + nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); % elif kx == alix[mcx[j]] and beta == 1: - c[i + ${mcx[j]}*ldc] += csub[${j}]; + nt_store_c(&c[i + ${mcx[j]}*ldc], nt_load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]); % elif kx == alix[mcx[j]]: - c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc]; + nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${mcx[j]}*ldc])); % endif % endfor % endfor @@ -78,9 +78,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if loop.parent.last: % for j, jx in enumerate(afix): % if jx == -1 and j % msplit == cid and beta == 0: - c[i + ${j}*ldc] = make_zero(); + nt_store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and j % msplit == cid and beta != 1: - c[i + ${j}*ldc] *= ${beta}; + nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta}); % endif % endfor % endif diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako new file mode 100644 index 0000000..30b08f6 --- /dev/null +++ b/gimmik/kernels/hip/bstream-preload-c.mako @@ -0,0 +1,63 @@ +<%inherit file='base'/> + +__global__ __launch_bounds__(${blockx}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = ((n + ${width} - 1) / ${width}) * ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i < n) + { + ${dtype} bv, csub[${m}]; + +## Preload C values for rows which will receive a non-zero dot product +% for j, jx in enumerate(afix): + % if jx != -1: + % if beta == 0: + csub[${j}] = make_zero(); + % elif beta == 1: + csub[${j}] = nt_load_c(&c[i + ${j}*ldc]); + % else: + csub[${j}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]); + % endif + % endif +% endfor + +## Iterate through the used rows of B +% for kx in bix: + bv = b[i + ${kx}*ldb]; + % for j, jx in enumerate(A[:, kx]): + % if jx != 0: + csub[${j}] += ${jx}*bv; + % endif + ## + % if kx == alix[j]: + nt_store_c(&c[i + ${j}*ldc], csub[${j}]); + % endif + % endfor +% endfor + +## Handle rows of A which are all zero +% for j, jx in enumerate(afix): + % if jx == -1 and beta == 0: + nt_store_c(&c[i + ${j}*ldc], make_zero()); + % elif jx == -1 and beta != 1: + nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + % endif +% endfor + } +} diff --git a/gimmik/kernels/hip/bstream-width-preload-c.mako b/gimmik/kernels/hip/bstream-width-preload-c.mako new file mode 100644 index 0000000..97a7571 --- /dev/null +++ b/gimmik/kernels/hip/bstream-width-preload-c.mako @@ -0,0 +1,103 @@ +<%inherit file='base'/> + +% if width == 2: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); +} +% elif width == 4: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); +} +% else: +#error "bstream_width_preload_c only supports width=2 or width=4" +% endif + +__global__ __launch_bounds__(${blockx}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = (n + ${width} - 1) / ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i < n) + { + ${dtype} bv, csub[${m}]; + +## Preload C values for rows which will receive a non-zero dot product +% for j, jx in enumerate(afix): + % if jx != -1: + % if beta == 0: + csub[${j}] = make_zero(); + % elif beta == 1: + csub[${j}] = nt_load_c(&c[i + ${j}*ldc]); + % else: + csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])); + % endif + % endif +% endfor + +## Iterate through the used rows of B +% for kx in bix: + bv = b[i + ${kx}*ldb]; + % for j, jx in enumerate(A[:, kx]): + % if jx != 0: + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + % endif + ## + % if kx == alix[j]: + nt_store_c(&c[i + ${j}*ldc], csub[${j}]); + % endif + % endfor +% endfor + +## Handle rows of A which are all zero +% for j, jx in enumerate(afix): + % if jx == -1 and beta == 0: + nt_store_c(&c[i + ${j}*ldc], make_zero()); + % elif jx == -1 and beta != 1: + nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); + % endif +% endfor + } +} diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako index 2f6dc62..9634c73 100644 --- a/gimmik/kernels/hip/bstream.mako +++ b/gimmik/kernels/hip/bstream.mako @@ -1,6 +1,6 @@ <%inherit file='base'/> -__global__ __launch_bounds__(128) void +__global__ __launch_bounds__(${blockx}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, @@ -35,11 +35,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % endif ## % if kx == alix[j] and beta == 0: - c[i + ${j}*ldc] = csub[${j}]; + nt_store_c(&c[i + ${j}*ldc], csub[${j}]); % elif kx == alix[j] and beta == 1: - c[i + ${j}*ldc] += csub[${j}]; + nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + csub[${j}]); % elif kx == alix[j]: - c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc]; + nt_store_c(&c[i + ${j}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${j}*ldc])); % endif % endfor % endfor @@ -47,9 +47,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Handle rows of A which are all zero % for j, jx in enumerate(afix): % if jx == -1 and beta == 0: - c[i + ${j}*ldc] = make_zero(); + nt_store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and beta != 1: - c[i + ${j}*ldc] *= ${beta}; + nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta}); % endif % endfor } diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako new file mode 100644 index 0000000..507c34f --- /dev/null +++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako @@ -0,0 +1,103 @@ +<%inherit file='base'/> + +<% +kparts = partition(A, ksplit, by='cols') +cchunks = chunk(range(m), csz) +loaded = set() +%> + +__global__ __launch_bounds__(${blockx*ksplit}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = ((n + ${width} - 1) / ${width}) * ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + int i = blockDim.x*blockIdx.x + threadIdx.x; + + ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; + __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; + +## Iterate over the row-partitions of C +% for cchunk in cchunks: + ## Iterate over the row-partitions of B + % for bid, kbx in enumerate(kparts): + if (i < n && threadIdx.y == ${bid}) + { + ## Evaluate our partial dot products + % for j in cchunk: + ## Load in any missing parts of B + % for kx in kbx: + % if A[j, kx] != 0 and kx not in loaded: + bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> + % endif + % endfor + <% + dotex = dot(lambda kx: f'bv[{kx}]', A[j, kbx]) + has_dotp = any(A[j, kx] != 0 for kx in range(k)) + %> + % if dotex != '0.0': + dotp = ${dotex}; + % else: + dotp = make_zero(); + % endif + ## Save to a register + % if loop.index % ksplit == bid: + % if beta == 0: + cv[${loop.index // ksplit}] = dotp; + % elif beta == 1 and has_dotp: + cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] += dotp; + % elif has_dotp: + cv[${loop.index // ksplit}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] += dotp; + % endif + ## Save to shared memory + % else: + csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; + % endif + % endfor + } + % endfor + __syncthreads(); + ## Iterate over the column-partitions of B + % for bid, kbx in enumerate(kparts): + if (i < n && threadIdx.y == ${bid}) + { + ## Sum and output the final set of dot products + % for j in cchunk: + % if loop.index % ksplit == bid: + <% has_dotp = any(A[j, kx] != 0 for kx in range(k)) %> + % if beta == 0: + dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' + for i in range(ksplit - 1))}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta == 1 and has_dotp: + dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' + for i in range(ksplit - 1))}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta != 1 and has_dotp: + dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' + for i in range(ksplit - 1))}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta != 1: + nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + % endif + % endif + % endfor + } + % endfor + __syncthreads(); +% endfor +} diff --git a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako new file mode 100644 index 0000000..b435913 --- /dev/null +++ b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako @@ -0,0 +1,157 @@ +<%inherit file='base'/> + +% if width == 2: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); +} +% elif width == 4: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); +} +% else: +#error "cstream_ksplit_width_preload_c only supports width=2 or width=4" +% endif + +<% +kparts = partition(A, ksplit, by='cols') +cchunks = chunk(range(m), csz) +loaded = set() +%> + +__global__ __launch_bounds__(${blockx*ksplit}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = (n + ${width} - 1) / ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + int i = blockDim.x*blockIdx.x + threadIdx.x; + + ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; + __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; + +## Iterate over the row-partitions of C +% for cchunk in cchunks: + ## Iterate over the column-partitions of B + % for bid, kbx in enumerate(kparts): + if (i < n && threadIdx.y == ${bid}) + { + ## Evaluate our partial dot products + % for j in cchunk: + ## Load in any missing parts of B + % for kx in kbx: + % if A[j, kx] != 0 and kx not in loaded: + bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> + % endif + % endfor + + ## Expand vectorized partial dot product + <% + nzixs = [] + for l_idx, kx in enumerate(kbx): + if A[j, kx] != 0: + nzixs.append((l_idx, kx)) + + has_dotp = any(A[j, kx] != 0 for kx in range(k)) + if not nzixs: + dotex = 'make_zero()' + else: + first_l_idx, first_kx = nzixs[0] + dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])" + for l_idx, kx in nzixs[1:]: + dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])" + %> + dotp = ${dotex}; + + ## Save to a register + % if loop.index % ksplit == bid: + % if beta == 0: + cv[${loop.index // ksplit}] = dotp; + % elif beta == 1 and has_dotp: + cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); + % elif has_dotp: + cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])); + cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); + % endif + ## Save to shared memory + % else: + csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; + % endif + % endfor + } + % endfor + __syncthreads(); + + ## Sum and output the final set of dot products + % for bid, kbx in enumerate(kparts): + if (i < n && threadIdx.y == ${bid}) + { + % for j in cchunk: + % if loop.index % ksplit == bid: + <% + has_dotp = any(A[j, kx] != 0 for kx in range(k)) + sum_expr = f"cv[{loop.index // ksplit}]" + for s_idx in range(ksplit - 1): + sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" + %> + % if beta == 0: + dotp = ${sum_expr}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta == 1 and has_dotp: + dotp = ${sum_expr}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta != 1 and has_dotp: + dotp = ${sum_expr}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta != 1: + nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); + % endif + % endif + % endfor + } + % endfor + __syncthreads(); +% endfor +} diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako index bae2d2a..6fd3210 100644 --- a/gimmik/kernels/hip/cstream-ksplit.mako +++ b/gimmik/kernels/hip/cstream-ksplit.mako @@ -69,11 +69,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' for i in range(ksplit - 1))}; % if beta == 0: - c[i + ${j}*ldc] = dotp; + nt_store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1: - c[i + ${j}*ldc] += dotp; + nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp); % else: - c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; + nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc])); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako new file mode 100644 index 0000000..041e674 --- /dev/null +++ b/gimmik/kernels/hip/cstream-preload-c.mako @@ -0,0 +1,51 @@ +<%inherit file='base'/> + +<% ksplit = 2 if m < 36 else 1 %> + +__global__ __launch_bounds__(128) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = ((n + ${width} - 1) / ${width}) * ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + const int i = blockDim.x*blockIdx.x + threadIdx.x; + ${dtype} dotp; + + if (i < n) + { +% for j, jx in enumerate(A): + % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0': + % if beta == 0: + dotp = ${dotex}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta == 1: + dotp = nt_load_c(&c[i + ${j}*ldc]); + dotp += ${dotex}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % else: + dotp = ${beta}*nt_load_c(&c[i + ${j}*ldc]); + dotp += ${dotex}; + nt_store_c(&c[i + ${j}*ldc], dotp); + % endif + % else: + % if beta == 0: + nt_store_c(&c[i + ${j}*ldc], make_zero()); + % elif beta != 1: + nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + % endif + % endif +% endfor + } +} diff --git a/gimmik/kernels/hip/cstream-width-preload-c.mako b/gimmik/kernels/hip/cstream-width-preload-c.mako new file mode 100644 index 0000000..9f2f57c --- /dev/null +++ b/gimmik/kernels/hip/cstream-width-preload-c.mako @@ -0,0 +1,106 @@ +<%inherit file='base'/> + +% if width == 2: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); +} +% elif width == 4: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); +} +% else: +#error "cstream_width_preload_c only supports width=2 or width=4" +% endif + +__global__ __launch_bounds__(${blockx}) void +% if n is None: +${kname}(int n, + const ${dtype}* __restrict__ b, int ldb, + ${dtype}* __restrict__ c, int ldc) +{ + % if width > 1: + n = (n + ${width} - 1) / ${width}; + ldb /= ${width}; + ldc /= ${width}; + % endif +% else: +${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) +{ + const int n = ${-(-n // width)}; + const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; + const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; +% endif + const int i = blockDim.x*blockIdx.x + threadIdx.x; + ${dtype} bv, dotp; + + if (i < n) + { +% for j, row in enumerate(A): + <% + nzixs = [kx for kx, val in enumerate(row) if val != 0] + %> + % if nzixs: + % if beta == 0: + <% first_kx = nzixs[0] %> + bv = b[i + ${first_kx}*ldb]; + dotp = gimmik_vmul(${row[first_kx]}, bv); + % for kx in nzixs[1:]: + bv = b[i + ${kx}*ldb]; + dotp = gimmik_vmadd(dotp, ${row[kx]}, bv); + % endfor + nt_store_c(&c[i + ${j}*ldc], dotp); + % elif beta == 1: + dotp = nt_load_c(&c[i + ${j}*ldc]); + % for kx in nzixs: + bv = b[i + ${kx}*ldb]; + dotp = gimmik_vmadd(dotp, ${row[kx]}, bv); + % endfor + nt_store_c(&c[i + ${j}*ldc], dotp); + % else: + dotp = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])); + % for kx in nzixs: + bv = b[i + ${kx}*ldb]; + dotp = gimmik_vmadd(dotp, ${row[kx]}, bv); + % endfor + nt_store_c(&c[i + ${j}*ldc], dotp); + % endif + % else: + % if beta == 0: + nt_store_c(&c[i + ${j}*ldc], make_zero()); + % elif beta != 1: + nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); + % endif + % endif +% endfor + } +} diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako index f75301d..0651e87 100644 --- a/gimmik/kernels/hip/cstream.mako +++ b/gimmik/kernels/hip/cstream.mako @@ -2,7 +2,7 @@ <% ksplit = 2 if m < 36 else 1 %> -__global__ __launch_bounds__(128) void +__global__ __launch_bounds__(${blockx}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, @@ -32,11 +32,11 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) dotp = make_zero(); % endif % if beta == 0: - c[i + ${j}*ldc] = dotp; + nt_store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1 and dotex != '0.0': - c[i + ${j}*ldc] += dotp; + nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp); % else: - c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; + nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc])); % endif % endfor } From 8f4d03ee5b2c6e00a1bb7872bca64f0fae03d9d4 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Thu, 18 Jun 2026 00:12:02 -0500 Subject: [PATCH 02/15] Fix HIP GiMMiK block size metadata --- gimmik/hip.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index ee38c9f..920cc0b 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -19,11 +19,13 @@ def emit(name, args, meta): if threads <= max_block_threads and shared <= max_shared: yield (name, args, meta) + blkx = self.basemeta['block'][0] + # B loading, C streaming kernel - yield from emit('cstream', {}, {}) + yield from emit('cstream', {'blockx': blkx}, {}) # B streaming, C accumulation kernel - yield from emit('bstream', {}, {}) + yield from emit('bstream', {'blockx': blkx}, {}) # Four-way m-split B streaming, C accumulation kernel ms, bsz, blkx = 4, 24, 64 From 96671a6364f8ccbac0d14d89364cda13f5223952 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Sun, 21 Jun 2026 23:18:07 -0500 Subject: [PATCH 03/15] Address HIP GiMMiK review comments --- gimmik/hip.py | 4 ++ .../hip/bstream-msplit-width-preload-c.mako | 40 +-------------- .../kernels/hip/bstream-width-preload-c.mako | 40 +-------------- .../kernels/hip/cstream-ksplit-preload-c.mako | 4 +- .../hip/cstream-ksplit-width-preload-c.mako | 51 ++----------------- .../kernels/hip/cstream-width-preload-c.mako | 42 +-------------- gimmik/kernels/hip/vector.mako | 41 +++++++++++++++ 7 files changed, 54 insertions(+), 168 deletions(-) create mode 100644 gimmik/kernels/hip/vector.mako diff --git a/gimmik/hip.py b/gimmik/hip.py index 920cc0b..cea1be5 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -39,6 +39,10 @@ def emit(name, args, meta): meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} yield from emit('cstream-ksplit', args, meta) + # Only emit tuned variants on the architecture they were tuned for. + if gcn_arch != 'gfx942' or warp_size != 64: + return + # Tuned HIP variants msplits, ksplits = [4, 8], [2, 4] bsz, csz, blkx = 8, 8, 64 diff --git a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako index 9659db7..4466f9f 100644 --- a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako +++ b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako @@ -1,44 +1,6 @@ <%inherit file='base'/> -% if width == 2: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); -} -% elif width == 4: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); -} -% else: -#error "bstream_msplit_width_preload_c only supports width=2 or width=4" -% endif +<%include file='vector'/> <% mx = partition(A, into=msplit, by='rows') diff --git a/gimmik/kernels/hip/bstream-width-preload-c.mako b/gimmik/kernels/hip/bstream-width-preload-c.mako index 97a7571..2c4e5c5 100644 --- a/gimmik/kernels/hip/bstream-width-preload-c.mako +++ b/gimmik/kernels/hip/bstream-width-preload-c.mako @@ -1,44 +1,6 @@ <%inherit file='base'/> -% if width == 2: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); -} -% elif width == 4: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); -} -% else: -#error "bstream_width_preload_c only supports width=2 or width=4" -% endif +<%include file='vector'/> __global__ __launch_bounds__(${blockx}) void % if n is None: diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako index 507c34f..51f1db4 100644 --- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako +++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako @@ -45,7 +45,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % endfor <% dotex = dot(lambda kx: f'bv[{kx}]', A[j, kbx]) - has_dotp = any(A[j, kx] != 0 for kx in range(k)) + has_dotp = A[j].any() %> % if dotex != '0.0': dotp = ${dotex}; @@ -78,7 +78,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Sum and output the final set of dot products % for j in cchunk: % if loop.index % ksplit == bid: - <% has_dotp = any(A[j, kx] != 0 for kx in range(k)) %> + <% has_dotp = A[j].any() %> % if beta == 0: dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' for i in range(ksplit - 1))}; diff --git a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako index b435913..bdac6dc 100644 --- a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako +++ b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako @@ -1,46 +1,6 @@ <%inherit file='base'/> -% if width == 2: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); -} -% elif width == 4: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); -} -% else: -#error "cstream_ksplit_width_preload_c only supports width=2 or width=4" -% endif +<%include file='vector'/> <% kparts = partition(A, ksplit, by='cols') @@ -88,12 +48,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Expand vectorized partial dot product <% - nzixs = [] - for l_idx, kx in enumerate(kbx): - if A[j, kx] != 0: - nzixs.append((l_idx, kx)) + nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] - has_dotp = any(A[j, kx] != 0 for kx in range(k)) + has_dotp = A[j].any() if not nzixs: dotex = 'make_zero()' else: @@ -131,7 +88,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % for j in cchunk: % if loop.index % ksplit == bid: <% - has_dotp = any(A[j, kx] != 0 for kx in range(k)) + has_dotp = A[j].any() sum_expr = f"cv[{loop.index // ksplit}]" for s_idx in range(ksplit - 1): sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" diff --git a/gimmik/kernels/hip/cstream-width-preload-c.mako b/gimmik/kernels/hip/cstream-width-preload-c.mako index 9f2f57c..86acfcb 100644 --- a/gimmik/kernels/hip/cstream-width-preload-c.mako +++ b/gimmik/kernels/hip/cstream-width-preload-c.mako @@ -1,46 +1,6 @@ <%inherit file='base'/> -% if width == 2: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); -} -% elif width == 4: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); -} -% else: -#error "cstream_width_preload_c only supports width=2 or width=4" -% endif +<%include file='vector'/> __global__ __launch_bounds__(${blockx}) void % if n is None: diff --git a/gimmik/kernels/hip/vector.mako b/gimmik/kernels/hip/vector.mako new file mode 100644 index 0000000..268d6ab --- /dev/null +++ b/gimmik/kernels/hip/vector.mako @@ -0,0 +1,41 @@ +% if width == 2: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); +} +% elif width == 4: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); +} +% else: +#error "HIP vector helpers only support width=2 or width=4" +% endif From 739a82e18f508597081da9c73fb6ed780e7817ed Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Mon, 22 Jun 2026 01:45:23 -0500 Subject: [PATCH 04/15] Handle ROCm feature suffixes for gfx942 tuning --- gimmik/hip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index cea1be5..a31c3d6 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -40,7 +40,8 @@ def emit(name, args, meta): yield from emit('cstream-ksplit', args, meta) # Only emit tuned variants on the architecture they were tuned for. - if gcn_arch != 'gfx942' or warp_size != 64: + base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None + if base_arch != 'gfx942' or warp_size != 64: return # Tuned HIP variants From 0633539fbe6c99c97fdbf7aa91c0cb8df35e5160 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Mon, 22 Jun 2026 03:30:28 -0500 Subject: [PATCH 05/15] Enable tuned HIP variants on gfx90a --- gimmik/hip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index a31c3d6..e32152d 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -39,9 +39,9 @@ def emit(name, args, meta): meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} yield from emit('cstream-ksplit', args, meta) - # Only emit tuned variants on the architecture they were tuned for. + # Only emit tuned variants on architectures they have been validated for. base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None - if base_arch != 'gfx942' or warp_size != 64: + if base_arch not in {'gfx90a', 'gfx942'} or warp_size != 64: return # Tuned HIP variants From 7b59fb0a791cbf1324a46c5b0451bfc63a8b6140 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Tue, 23 Jun 2026 04:57:56 -0500 Subject: [PATCH 06/15] Parameterize HIP vector width and refine preload kernels --- gimmik/hip.py | 155 ++++++++---------- gimmik/kernels/hip/base.mako | 83 ++++++++++ .../kernels/hip/bstream-msplit-preload-c.mako | 28 ++-- .../hip/bstream-msplit-width-preload-c.mako | 100 ----------- gimmik/kernels/hip/bstream-msplit.mako | 16 +- gimmik/kernels/hip/bstream-preload-c.mako | 28 ++-- .../kernels/hip/bstream-width-preload-c.mako | 65 -------- gimmik/kernels/hip/bstream.mako | 16 +- .../kernels/hip/cstream-ksplit-preload-c.mako | 45 ++--- .../hip/cstream-ksplit-width-preload-c.mako | 114 ------------- gimmik/kernels/hip/cstream-ksplit.mako | 30 ++-- gimmik/kernels/hip/cstream-preload-c.mako | 32 ++-- .../kernels/hip/cstream-width-preload-c.mako | 66 -------- gimmik/kernels/hip/cstream.mako | 23 ++- gimmik/kernels/hip/vector.mako | 41 ----- 15 files changed, 286 insertions(+), 556 deletions(-) delete mode 100644 gimmik/kernels/hip/bstream-msplit-width-preload-c.mako delete mode 100644 gimmik/kernels/hip/bstream-width-preload-c.mako delete mode 100644 gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako delete mode 100644 gimmik/kernels/hip/cstream-width-preload-c.mako delete mode 100644 gimmik/kernels/hip/vector.mako diff --git a/gimmik/hip.py b/gimmik/hip.py index e32152d..57fa394 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -47,96 +47,83 @@ def emit(name, args, meta): # Tuned HIP variants msplits, ksplits = [4, 8], [2, 4] bsz, csz, blkx = 8, 8, 64 - width = 2 if self.aligne is not None and self.aligne % 2 == 0 else 1 - - # B loading, C streaming kernel - args = {'blockx': blkx} - meta = {'block': (blkx, 1, 1), 'desc': f'cstream/x{blkx}'} - yield from emit('cstream', args, meta) - - # B streaming, C accumulation kernel - meta = {'block': (blkx, 1, 1), 'desc': f'bstream/x{blkx}'} - yield from emit('bstream', args, meta) - - for ms in msplits: - # m-split B streaming, C accumulation kernel - args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} - shared = 2*bsz*blkx*dsize - meta = {'block': (blkx, ms, 1), 'shared': shared, - 'desc': f'bstream-msplit/m{ms}-b{bsz}-x{blkx}'} - yield from emit('bstream-msplit', args, meta) - - for ks in ksplits: - # k-split B loading, C streaming kernel - args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} - shared = (ks - 1)*csz*blkx*dsize - meta = {'block': (blkx, ks, 1), 'shared': shared, - 'desc': f'cstream-ksplit/k{ks}-c{csz}-x{blkx}'} - yield from emit('cstream-ksplit', args, meta) - - # B loading, C preloading, C streaming kernel - args = {'blockx': blkx} - meta = {'block': (blkx, 1, 1), 'desc': f'cstream-preload-c/x{blkx}'} - yield from emit('cstream-preload-c', args, meta) - - # B streaming, C preloading, C accumulation kernel - meta = {'block': (blkx, 1, 1), 'desc': f'bstream-preload-c/x{blkx}'} - yield from emit('bstream-preload-c', args, meta) - - if width > 1: - args = {'dtype': f'{dtype}{width}', 'width': width, - 'blockx': blkx} - meta = {'block': (blkx, 1, 1), 'width': width, - 'desc': f'cstream-width-preload-c/w{width}-x{blkx}'} - yield from emit('cstream-width-preload-c', args, meta) - - meta = {'block': (blkx, 1, 1), 'width': width, - 'desc': f'bstream-width-preload-c/w{width}-x{blkx}'} - yield from emit('bstream-width-preload-c', args, meta) - - for ms in msplits: - # m-split B streaming, C preloading, C accumulation kernel - args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} - shared = 2*bsz*blkx*dsize - meta = {'block': (blkx, ms, 1), 'shared': shared, - 'desc': f'bstream-msplit-preload-c/m{ms}-b{bsz}-x{blkx}'} - yield from emit('bstream-msplit-preload-c', args, meta) - - if width > 1: - args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx, - 'dtype': f'{dtype}{width}', 'width': width} + widths = [1] + if self.aligne is not None and self.aligne % 2 == 0: + widths.append(2) + + for width in widths: + wargs = ({'dtype': f'{dtype}{width}', 'width': width} + if width > 1 else {}) + wmeta = {'width': width} if width > 1 else {} + wpfx = f'w{width}-' if width > 1 else '' + + # B loading, C streaming kernel + args = {'blockx': blkx} | wargs + meta = {'block': (blkx, 1, 1), + 'desc': f'cstream/{wpfx}x{blkx}'} | wmeta + yield from emit('cstream', args, meta) + + # B streaming, C accumulation kernel + meta = {'block': (blkx, 1, 1), + 'desc': f'bstream/{wpfx}x{blkx}'} | wmeta + yield from emit('bstream', args, meta) + + for ms in msplits: + # m-split B streaming, C accumulation kernel + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs + shared = 2*bsz*blkx*dsize*width meta = { - 'block': (blkx, ms, 1), 'shared': shared*width, - 'width': width, + 'block': (blkx, ms, 1), 'shared': shared, + 'desc': f'bstream-msplit/{wpfx}m{ms}-b{bsz}-x{blkx}' + } | wmeta + yield from emit('bstream-msplit', args, meta) + + for ks in ksplits: + # k-split B loading, C streaming kernel + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs + shared = (ks - 1)*csz*blkx*dsize*width + meta = { + 'block': (blkx, ks, 1), 'shared': shared, + 'desc': f'cstream-ksplit/{wpfx}k{ks}-c{csz}-x{blkx}' + } | wmeta + yield from emit('cstream-ksplit', args, meta) + + # B loading, C preloading, C streaming kernel + args = {'blockx': blkx} | wargs + meta = {'block': (blkx, 1, 1), + 'desc': f'cstream-preload-c/{wpfx}x{blkx}'} | wmeta + yield from emit('cstream-preload-c', args, meta) + + # B streaming, C preloading, C accumulation kernel + meta = {'block': (blkx, 1, 1), + 'desc': f'bstream-preload-c/{wpfx}x{blkx}'} | wmeta + yield from emit('bstream-preload-c', args, meta) + + for ms in msplits: + # m-split B streaming, C preloading, C accumulation kernel + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs + shared = 2*bsz*blkx*dsize*width + meta = { + 'block': (blkx, ms, 1), 'shared': shared, 'desc': ( - f'bstream-msplit-width-preload-c/w{width}-' - f'm{ms}-b{bsz}-x{blkx}' + f'bstream-msplit-preload-c/' + f'{wpfx}m{ms}-b{bsz}-x{blkx}' ) - } - yield from emit('bstream-msplit-width-preload-c', args, meta) - - for ks in ksplits: - # k-split B loading, C preloading, C streaming kernel - args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} - shared = (ks - 1)*csz*blkx*dsize - meta = { - 'block': (blkx, ks, 1), 'shared': shared, - 'desc': f'cstream-ksplit-preload-c/k{ks}-c{csz}-x{blkx}' - } - yield from emit('cstream-ksplit-preload-c', args, meta) - - if width > 1: - args = {'ksplit': ks, 'csz': csz, 'blockx': blkx, - 'dtype': f'{dtype}{width}', 'width': width} + } | wmeta + yield from emit('bstream-msplit-preload-c', args, meta) + + for ks in ksplits: + # k-split B loading, C preloading, C streaming kernel + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs + shared = (ks - 1)*csz*blkx*dsize*width meta = { - 'block': (blkx, ks, 1), 'shared': shared*width, - 'width': width, + 'block': (blkx, ks, 1), 'shared': shared, 'desc': ( - f'cstream-ksplit-width-preload-c/w{width}-' - f'k{ks}-c{csz}-x{blkx}' + f'cstream-ksplit-preload-c/' + f'{wpfx}k{ks}-c{csz}-x{blkx}' ) - } - yield from emit('cstream-ksplit-width-preload-c', args, meta) + } | wmeta + yield from emit('cstream-ksplit-preload-c', args, meta) def _process_meta(self, meta): if self.n is not None: diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako index b40a0b9..a03a943 100644 --- a/gimmik/kernels/hip/base.mako +++ b/gimmik/kernels/hip/base.mako @@ -9,6 +9,67 @@ static inline __device__ ${dtype} make_zero() { return 0; } % endif +% if width == 1: +static inline __device__ ${dtype} +gimmik_vmul(${dtype} a, ${dtype} b) +{ + return a*b; +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return a + b; +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return acc + a*b; +} +% elif width == 2: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); +} +% elif width == 4: +static inline __device__ ${dtype} +gimmik_vmul(${dtype[:-1]} a, ${dtype} b) +{ + return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); +} + +static inline __device__ ${dtype} +gimmik_vadd(${dtype} a, ${dtype} b) +{ + return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static inline __device__ ${dtype} +gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) +{ + // Keep the multiply-add expression visible to the compiler. + return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); +} +% else: +#error "HIP vector helpers only support width=2 or width=4" +% endif + static inline __device__ void nt_store_c(${dtype}* p, ${dtype} v) { @@ -41,4 +102,26 @@ nt_load_c(const ${dtype}* p) % endif } +<% nt_c = context.get('nt_c', True) %> + +static inline __device__ void +store_c(${dtype}* p, ${dtype} v) +{ +% if nt_c: + nt_store_c(p, v); +% else: + *p = v; +% endif +} + +static inline __device__ ${dtype} +load_c(const ${dtype}* p) +{ +% if nt_c: + return nt_load_c(p); +% else: + return *p; +% endif +} + ${next.body()} diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako index 8b6f008..6cabeb6 100644 --- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako +++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako @@ -12,7 +12,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -38,18 +38,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % endif % endfor + % if beta != 0: ## Preload C values for active rows owned by this m-split lane % for j, jx in enumerate(mx[cid]): % if afix[jx] != -1: - % if beta == 0: - csub[${j}] = make_zero(); - % elif beta == 1: - csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]); + % if beta == 1: + csub[${j}] = load_c(&c[i + ${jx}*ldc]); % else: - csub[${j}] = ${beta}*nt_load_c(&c[i + ${jx}*ldc]); + csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${jx}*ldc])); % endif % endif % endfor + % endif } % endfor __syncthreads(); @@ -72,12 +72,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % for kx in bchunks[bb]: bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; % for j, jx in enumerate(A[mcx, kx]): - % if jx != 0: - csub[${j}] += ${jx}*bv; + % if beta == 0: + % if jx != 0 and kx == afix[mcx[j]]: + csub[${j}] = gimmik_vmul(${jx}, bv); + % elif jx != 0: + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + % endif + % elif jx != 0: + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); % endif ## If we're done with this dot product then store to global % if kx == alix[mcx[j]]: - nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); + store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); % endif % endfor % endfor @@ -85,9 +91,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if loop.parent.last: % for j, jx in enumerate(afix): % if jx == -1 and j % msplit == cid and beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); + store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and j % msplit == cid and beta != 1: - nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); % endif % endfor % endif diff --git a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako deleted file mode 100644 index 4466f9f..0000000 --- a/gimmik/kernels/hip/bstream-msplit-width-preload-c.mako +++ /dev/null @@ -1,100 +0,0 @@ -<%inherit file='base'/> - -<%include file='vector'/> - -<% -mx = partition(A, into=msplit, by='rows') -bchunks = chunk(bix, bsz) -%> - -__global__ __launch_bounds__(${blockx*msplit}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - int i = blockDim.x*blockIdx.x + threadIdx.x; - - ${dtype} bv, csub[${-(-m // msplit)}]; - __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; - -## Fill the initial shared memory block -% for cid in range(msplit): - if (i < n && threadIdx.y == ${cid}) - { - % for kx in bchunks[0]: - % if loop.index % msplit == cid: - bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; - % endif - % endfor - - ## Preload C values for active rows owned by this m-split lane - % for j, jx in enumerate(mx[cid]): - % if afix[jx] != -1: - % if beta == 0: - csub[${j}] = make_zero(); - % elif beta == 1: - csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]); - % else: - csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${jx}*ldc])); - % endif - % endif - % endfor - } -% endfor - __syncthreads(); - -## Iterate over each row-chunk of B -% for bb in range(len(bchunks)): - ## Iterate over each row-chunk of C - % for cid, mcx in enumerate(mx): - if (i < n && threadIdx.y == ${cid}) - { - ## Start filling the next shared memory block - % if not loop.parent.last: - % for kx in bchunks[bb + 1]: - % if loop.index % msplit == cid: - bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; - % endif - % endfor - % endif - ## Accumulate our dot products - % for kx in bchunks[bb]: - bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; - % for j, jx in enumerate(A[mcx, kx]): - % if jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); - % endif - ## If we're done with this dot product then store to global - % if kx == alix[mcx[j]]: - nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); - % endif - % endfor - % endfor - ## Handle rows of A which are all zero - % if loop.parent.last: - % for j, jx in enumerate(afix): - % if jx == -1 and j % msplit == cid and beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); - % elif jx == -1 and j % msplit == cid and beta != 1: - nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); - % endif - % endfor - % endif - } - % endfor - __syncthreads(); -% endfor -} diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako index 6470477..35d336e 100644 --- a/gimmik/kernels/hip/bstream-msplit.mako +++ b/gimmik/kernels/hip/bstream-msplit.mako @@ -12,7 +12,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -60,17 +60,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; % for j, jx in enumerate(A[mcx, kx]): % if jx != 0 and kx == afix[mcx[j]]: - csub[${j}] = ${jx}*bv; + csub[${j}] = gimmik_vmul(${jx}, bv); % elif jx != 0: - csub[${j}] += ${jx}*bv; + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); % endif ## If we're done with this dot product then store to global % if kx == alix[mcx[j]] and beta == 0: - nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); + store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); % elif kx == alix[mcx[j]] and beta == 1: - nt_store_c(&c[i + ${mcx[j]}*ldc], nt_load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]); + store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(load_c(&c[i + ${mcx[j]}*ldc]), csub[${j}])); % elif kx == alix[mcx[j]]: - nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${mcx[j]}*ldc])); + store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${mcx[j]}*ldc])))); % endif % endfor % endfor @@ -78,9 +78,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if loop.parent.last: % for j, jx in enumerate(afix): % if jx == -1 and j % msplit == cid and beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); + store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and j % msplit == cid and beta != 1: - nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta}); + store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); % endif % endfor % endif diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako index 30b08f6..095be83 100644 --- a/gimmik/kernels/hip/bstream-preload-c.mako +++ b/gimmik/kernels/hip/bstream-preload-c.mako @@ -7,7 +7,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -24,29 +24,35 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { ${dtype} bv, csub[${m}]; +% if beta != 0: ## Preload C values for rows which will receive a non-zero dot product % for j, jx in enumerate(afix): % if jx != -1: - % if beta == 0: - csub[${j}] = make_zero(); - % elif beta == 1: - csub[${j}] = nt_load_c(&c[i + ${j}*ldc]); + % if beta == 1: + csub[${j}] = load_c(&c[i + ${j}*ldc]); % else: - csub[${j}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]); + csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])); % endif % endif % endfor +% endif ## Iterate through the used rows of B % for kx in bix: bv = b[i + ${kx}*ldb]; % for j, jx in enumerate(A[:, kx]): - % if jx != 0: - csub[${j}] += ${jx}*bv; + % if beta == 0: + % if jx != 0 and kx == afix[j]: + csub[${j}] = gimmik_vmul(${jx}, bv); + % elif jx != 0: + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + % endif + % elif jx != 0: + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); % endif ## % if kx == alix[j]: - nt_store_c(&c[i + ${j}*ldc], csub[${j}]); + store_c(&c[i + ${j}*ldc], csub[${j}]); % endif % endfor % endfor @@ -54,9 +60,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Handle rows of A which are all zero % for j, jx in enumerate(afix): % if jx == -1 and beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); + store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and beta != 1: - nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); % endif % endfor } diff --git a/gimmik/kernels/hip/bstream-width-preload-c.mako b/gimmik/kernels/hip/bstream-width-preload-c.mako deleted file mode 100644 index 2c4e5c5..0000000 --- a/gimmik/kernels/hip/bstream-width-preload-c.mako +++ /dev/null @@ -1,65 +0,0 @@ -<%inherit file='base'/> - -<%include file='vector'/> - -__global__ __launch_bounds__(${blockx}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i < n) - { - ${dtype} bv, csub[${m}]; - -## Preload C values for rows which will receive a non-zero dot product -% for j, jx in enumerate(afix): - % if jx != -1: - % if beta == 0: - csub[${j}] = make_zero(); - % elif beta == 1: - csub[${j}] = nt_load_c(&c[i + ${j}*ldc]); - % else: - csub[${j}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])); - % endif - % endif -% endfor - -## Iterate through the used rows of B -% for kx in bix: - bv = b[i + ${kx}*ldb]; - % for j, jx in enumerate(A[:, kx]): - % if jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); - % endif - ## - % if kx == alix[j]: - nt_store_c(&c[i + ${j}*ldc], csub[${j}]); - % endif - % endfor -% endfor - -## Handle rows of A which are all zero -% for j, jx in enumerate(afix): - % if jx == -1 and beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); - % elif jx == -1 and beta != 1: - nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); - % endif -% endfor - } -} diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako index 9634c73..427dffc 100644 --- a/gimmik/kernels/hip/bstream.mako +++ b/gimmik/kernels/hip/bstream.mako @@ -7,7 +7,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -29,17 +29,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) bv = b[i + ${kx}*ldb]; % for j, jx in enumerate(A[:, kx]): % if jx != 0 and kx == afix[j]: - csub[${j}] = ${jx}*bv; + csub[${j}] = gimmik_vmul(${jx}, bv); % elif jx != 0: - csub[${j}] += ${jx}*bv; + csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); % endif ## % if kx == alix[j] and beta == 0: - nt_store_c(&c[i + ${j}*ldc], csub[${j}]); + store_c(&c[i + ${j}*ldc], csub[${j}]); % elif kx == alix[j] and beta == 1: - nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + csub[${j}]); + store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), csub[${j}])); % elif kx == alix[j]: - nt_store_c(&c[i + ${j}*ldc], csub[${j}] + ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])))); % endif % endfor % endfor @@ -47,9 +47,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Handle rows of A which are all zero % for j, jx in enumerate(afix): % if jx == -1 and beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); + store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and beta != 1: - nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc])*${beta}); + store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); % endif % endfor } diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako index 51f1db4..4550700 100644 --- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako +++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako @@ -13,7 +13,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -44,24 +44,27 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % endif % endfor <% - dotex = dot(lambda kx: f'bv[{kx}]', A[j, kbx]) + nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] has_dotp = A[j].any() + if nzixs: + first_l_idx, first_kx = nzixs[0] + dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])" + for l_idx, kx in nzixs[1:]: + dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])" + else: + dotex = 'make_zero()' %> - % if dotex != '0.0': dotp = ${dotex}; - % else: - dotp = make_zero(); - % endif ## Save to a register % if loop.index % ksplit == bid: % if beta == 0: cv[${loop.index // ksplit}] = dotp; % elif beta == 1 and has_dotp: - cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]); - cv[${loop.index // ksplit}] += dotp; + cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); % elif has_dotp: - cv[${loop.index // ksplit}] = ${beta}*nt_load_c(&c[i + ${j}*ldc]); - cv[${loop.index // ksplit}] += dotp; + cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])); + cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); % endif ## Save to shared memory % else: @@ -79,20 +82,22 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % for j in cchunk: % if loop.index % ksplit == bid: <% has_dotp = A[j].any() %> + <% + sum_expr = f"cv[{loop.index // ksplit}]" + for s_idx in range(ksplit - 1): + sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" + %> % if beta == 0: - dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' - for i in range(ksplit - 1))}; - nt_store_c(&c[i + ${j}*ldc], dotp); + dotp = ${sum_expr}; + store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1 and has_dotp: - dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' - for i in range(ksplit - 1))}; - nt_store_c(&c[i + ${j}*ldc], dotp); + dotp = ${sum_expr}; + store_c(&c[i + ${j}*ldc], dotp); % elif beta != 1 and has_dotp: - dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' - for i in range(ksplit - 1))}; - nt_store_c(&c[i + ${j}*ldc], dotp); + dotp = ${sum_expr}; + store_c(&c[i + ${j}*ldc], dotp); % elif beta != 1: - nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako deleted file mode 100644 index bdac6dc..0000000 --- a/gimmik/kernels/hip/cstream-ksplit-width-preload-c.mako +++ /dev/null @@ -1,114 +0,0 @@ -<%inherit file='base'/> - -<%include file='vector'/> - -<% -kparts = partition(A, ksplit, by='cols') -cchunks = chunk(range(m), csz) -loaded = set() -%> - -__global__ __launch_bounds__(${blockx*ksplit}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - int i = blockDim.x*blockIdx.x + threadIdx.x; - - ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; - __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; - -## Iterate over the row-partitions of C -% for cchunk in cchunks: - ## Iterate over the column-partitions of B - % for bid, kbx in enumerate(kparts): - if (i < n && threadIdx.y == ${bid}) - { - ## Evaluate our partial dot products - % for j in cchunk: - ## Load in any missing parts of B - % for kx in kbx: - % if A[j, kx] != 0 and kx not in loaded: - bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> - % endif - % endfor - - ## Expand vectorized partial dot product - <% - nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] - - has_dotp = A[j].any() - if not nzixs: - dotex = 'make_zero()' - else: - first_l_idx, first_kx = nzixs[0] - dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])" - for l_idx, kx in nzixs[1:]: - dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])" - %> - dotp = ${dotex}; - - ## Save to a register - % if loop.index % ksplit == bid: - % if beta == 0: - cv[${loop.index // ksplit}] = dotp; - % elif beta == 1 and has_dotp: - cv[${loop.index // ksplit}] = nt_load_c(&c[i + ${j}*ldc]); - cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); - % elif has_dotp: - cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])); - cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); - % endif - ## Save to shared memory - % else: - csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; - % endif - % endfor - } - % endfor - __syncthreads(); - - ## Sum and output the final set of dot products - % for bid, kbx in enumerate(kparts): - if (i < n && threadIdx.y == ${bid}) - { - % for j in cchunk: - % if loop.index % ksplit == bid: - <% - has_dotp = A[j].any() - sum_expr = f"cv[{loop.index // ksplit}]" - for s_idx in range(ksplit - 1): - sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" - %> - % if beta == 0: - dotp = ${sum_expr}; - nt_store_c(&c[i + ${j}*ldc], dotp); - % elif beta == 1 and has_dotp: - dotp = ${sum_expr}; - nt_store_c(&c[i + ${j}*ldc], dotp); - % elif beta != 1 and has_dotp: - dotp = ${sum_expr}; - nt_store_c(&c[i + ${j}*ldc], dotp); - % elif beta != 1: - nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); - % endif - % endif - % endfor - } - % endfor - __syncthreads(); -% endfor -} diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako index 6fd3210..9c5fc09 100644 --- a/gimmik/kernels/hip/cstream-ksplit.mako +++ b/gimmik/kernels/hip/cstream-ksplit.mako @@ -13,7 +13,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -43,11 +43,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> % endif % endfor - % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0': + <% + nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] + if nzixs: + first_l_idx, first_kx = nzixs[0] + dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])" + for l_idx, kx in nzixs[1:]: + dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])" + else: + dotex = 'make_zero()' + %> dotp = ${dotex}; - % else: - dotp = make_zero(); - % endif ## Save to a register % if loop.index % ksplit == bid: cv[${loop.index // ksplit}] = dotp; @@ -66,14 +72,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Sum and output the final set of dot products % for j in cchunk: % if loop.index % ksplit == bid: - dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' - for i in range(ksplit - 1))}; + <% + sum_expr = f"cv[{loop.index // ksplit}]" + for s_idx in range(ksplit - 1): + sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" + %> + dotp = ${sum_expr}; % if beta == 0: - nt_store_c(&c[i + ${j}*ldc], dotp); + store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1: - nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp); + store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp)); % else: - nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])))); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako index 041e674..eebb602 100644 --- a/gimmik/kernels/hip/cstream-preload-c.mako +++ b/gimmik/kernels/hip/cstream-preload-c.mako @@ -9,7 +9,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -26,24 +26,34 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) if (i < n) { % for j, jx in enumerate(A): - % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0': + <% + nzixs = [kx for kx, val in enumerate(jx) if val != 0] + if nzixs: + first_kx = nzixs[0] + dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])" + for kx in nzixs[1:]: + dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])" + else: + dotex = 'make_zero()' + %> + % if nzixs: % if beta == 0: dotp = ${dotex}; - nt_store_c(&c[i + ${j}*ldc], dotp); + store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1: - dotp = nt_load_c(&c[i + ${j}*ldc]); - dotp += ${dotex}; - nt_store_c(&c[i + ${j}*ldc], dotp); + dotp = load_c(&c[i + ${j}*ldc]); + dotp = gimmik_vadd(dotp, ${dotex}); + store_c(&c[i + ${j}*ldc], dotp); % else: - dotp = ${beta}*nt_load_c(&c[i + ${j}*ldc]); - dotp += ${dotex}; - nt_store_c(&c[i + ${j}*ldc], dotp); + dotp = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])); + dotp = gimmik_vadd(dotp, ${dotex}); + store_c(&c[i + ${j}*ldc], dotp); % endif % else: % if beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); + store_c(&c[i + ${j}*ldc], make_zero()); % elif beta != 1: - nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream-width-preload-c.mako b/gimmik/kernels/hip/cstream-width-preload-c.mako deleted file mode 100644 index 86acfcb..0000000 --- a/gimmik/kernels/hip/cstream-width-preload-c.mako +++ /dev/null @@ -1,66 +0,0 @@ -<%inherit file='base'/> - -<%include file='vector'/> - -__global__ __launch_bounds__(${blockx}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - const int i = blockDim.x*blockIdx.x + threadIdx.x; - ${dtype} bv, dotp; - - if (i < n) - { -% for j, row in enumerate(A): - <% - nzixs = [kx for kx, val in enumerate(row) if val != 0] - %> - % if nzixs: - % if beta == 0: - <% first_kx = nzixs[0] %> - bv = b[i + ${first_kx}*ldb]; - dotp = gimmik_vmul(${row[first_kx]}, bv); - % for kx in nzixs[1:]: - bv = b[i + ${kx}*ldb]; - dotp = gimmik_vmadd(dotp, ${row[kx]}, bv); - % endfor - nt_store_c(&c[i + ${j}*ldc], dotp); - % elif beta == 1: - dotp = nt_load_c(&c[i + ${j}*ldc]); - % for kx in nzixs: - bv = b[i + ${kx}*ldb]; - dotp = gimmik_vmadd(dotp, ${row[kx]}, bv); - % endfor - nt_store_c(&c[i + ${j}*ldc], dotp); - % else: - dotp = gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc])); - % for kx in nzixs: - bv = b[i + ${kx}*ldb]; - dotp = gimmik_vmadd(dotp, ${row[kx]}, bv); - % endfor - nt_store_c(&c[i + ${j}*ldc], dotp); - % endif - % else: - % if beta == 0: - nt_store_c(&c[i + ${j}*ldc], make_zero()); - % elif beta != 1: - nt_store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, nt_load_c(&c[i + ${j}*ldc]))); - % endif - % endif -% endfor - } -} diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako index 0651e87..1b7b312 100644 --- a/gimmik/kernels/hip/cstream.mako +++ b/gimmik/kernels/hip/cstream.mako @@ -9,7 +9,7 @@ ${kname}(int n, ${dtype}* __restrict__ c, int ldc) { % if width > 1: - n = ((n + ${width} - 1) / ${width}) * ${width}; + n = (n + ${width} - 1) / ${width}; ldb /= ${width}; ldc /= ${width}; % endif @@ -26,17 +26,26 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) if (i < n) { % for j, jx in enumerate(A): - % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0': + <% + nzixs = [kx for kx, val in enumerate(jx) if val != 0] + if nzixs: + first_kx = nzixs[0] + dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])" + for kx in nzixs[1:]: + dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])" + else: + dotex = 'make_zero()' + %> dotp = ${dotex}; - % else: + % if not nzixs: dotp = make_zero(); % endif % if beta == 0: - nt_store_c(&c[i + ${j}*ldc], dotp); - % elif beta == 1 and dotex != '0.0': - nt_store_c(&c[i + ${j}*ldc], nt_load_c(&c[i + ${j}*ldc]) + dotp); + store_c(&c[i + ${j}*ldc], dotp); + % elif beta == 1 and nzixs: + store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp)); % else: - nt_store_c(&c[i + ${j}*ldc], dotp + ${beta}*nt_load_c(&c[i + ${j}*ldc])); + store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])))); % endif % endfor } diff --git a/gimmik/kernels/hip/vector.mako b/gimmik/kernels/hip/vector.mako deleted file mode 100644 index 268d6ab..0000000 --- a/gimmik/kernels/hip/vector.mako +++ /dev/null @@ -1,41 +0,0 @@ -% if width == 2: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); -} -% elif width == 4: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); -} - -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); -} -% else: -#error "HIP vector helpers only support width=2 or width=4" -% endif From e9b921a99f7ce476c98fc59bace1a4e72c26259b Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Tue, 23 Jun 2026 05:14:11 -0500 Subject: [PATCH 07/15] Use blockx launch bounds for HIP cstream preload --- gimmik/kernels/hip/cstream-preload-c.mako | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako index eebb602..a8d7e31 100644 --- a/gimmik/kernels/hip/cstream-preload-c.mako +++ b/gimmik/kernels/hip/cstream-preload-c.mako @@ -2,7 +2,7 @@ <% ksplit = 2 if m < 36 else 1 %> -__global__ __launch_bounds__(128) void +__global__ __launch_bounds__(${blockx}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, From 2aa2577a4a99a5479f57679c62a9debacf083ef6 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Tue, 23 Jun 2026 21:49:28 -0500 Subject: [PATCH 08/15] Always use non-temporal C accesses for HIP --- gimmik/kernels/hip/base.mako | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako index a03a943..831b8e0 100644 --- a/gimmik/kernels/hip/base.mako +++ b/gimmik/kernels/hip/base.mako @@ -102,26 +102,16 @@ nt_load_c(const ${dtype}* p) % endif } -<% nt_c = context.get('nt_c', True) %> - static inline __device__ void store_c(${dtype}* p, ${dtype} v) { -% if nt_c: nt_store_c(p, v); -% else: - *p = v; -% endif } static inline __device__ ${dtype} load_c(const ${dtype}* p) { -% if nt_c: return nt_load_c(p); -% else: - return *p; -% endif } ${next.body()} From be1c1dbb58e34353e73d7b863aa6eceecf9a0880 Mon Sep 17 00:00:00 2001 From: "Eric.Chin.AMD" Date: Wed, 24 Jun 2026 22:47:02 +0800 Subject: [PATCH 09/15] feat(hip): add non-temporal B-load (NTB) variants for bstream-msplit On memory-bound operators the B matrix is read once from HBM and reused only within a work-group via LDS -- it is never re-read across blocks. A normal global load still allocates B's line in L2, which is pure overhead: the line is never reused, it evicts genuinely-reusable data, and it adds cache-allocate/eviction traffic. This is the read-side mirror of the non-temporal C store we already use. NTB loads B with a non-temporal load (load_b -> __builtin_nontemporal path) so B bypasses L2 instead of polluting it. It moves the same number of bytes but keeps the cache clean, raising effective bandwidth. Implemented as a flag on the existing templates rather than new files: - base.mako: add a load_b() wrapper (non-temporal B load). - bstream-msplit{,-preload-c}.mako: gate the B read behind an `ntload` flag (context.get('ntload', False)); renders byte-identically to the plain kernel when the flag is absent. - hip.py: emit `*-ntb` variants by passing ntload=True inside the existing width loop, so NTB combines with width (w1/w2) automatically. Backward-compatible (plain variants unchanged) and CDNA-gated like the other tuned variants. On MI300X (gfx942) NTB passes the accuracy check (~1e-15) and wins the autotune in the majority of memory-bound cases (~+4.5% bandwidth on those), being chosen over the plain bstream-msplit. --- gimmik/hip.py | 17 +++++++++++++++++ gimmik/kernels/hip/base.mako | 9 +++++++++ .../kernels/hip/bstream-msplit-preload-c.mako | 6 ++++-- gimmik/kernels/hip/bstream-msplit.mako | 6 ++++-- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index 57fa394..860ed6c 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -78,6 +78,13 @@ def emit(name, args, meta): } | wmeta yield from emit('bstream-msplit', args, meta) + # non-temporal B-load variant: B is read-once -> skip L2 alloc + nmeta = { + 'block': (blkx, ms, 1), 'shared': shared, + 'desc': f'bstream-msplit-ntb/{wpfx}m{ms}-b{bsz}-x{blkx}' + } | wmeta + yield from emit('bstream-msplit', args | {'ntload': True}, nmeta) + for ks in ksplits: # k-split B loading, C streaming kernel args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs @@ -112,6 +119,16 @@ def emit(name, args, meta): } | wmeta yield from emit('bstream-msplit-preload-c', args, meta) + # non-temporal B-load variant + nmeta = { + 'block': (blkx, ms, 1), 'shared': shared, + 'desc': ( + f'bstream-msplit-preload-c-ntb/' + f'{wpfx}m{ms}-b{bsz}-x{blkx}' + ) + } | wmeta + yield from emit('bstream-msplit-preload-c', args | {'ntload': True}, nmeta) + for ks in ksplits: # k-split B loading, C preloading, C streaming kernel args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako index 831b8e0..5291538 100644 --- a/gimmik/kernels/hip/base.mako +++ b/gimmik/kernels/hip/base.mako @@ -114,4 +114,13 @@ load_c(const ${dtype}* p) return nt_load_c(p); } +static inline __device__ ${dtype} +load_b(const ${dtype}* p) +{ + // B is read-once (reused only within the block via LDS, never re-read across + // blocks), so load it non-temporally to avoid polluting L2 -- the read-side + // twin of the non-temporal C store. + return nt_load_c(p); +} + ${next.body()} diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako index 6cabeb6..36783c0 100644 --- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako +++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako @@ -3,6 +3,8 @@ <% mx = partition(A, into=msplit, by='rows') bchunks = chunk(bix, bsz) +ntload = context.get('ntload', False) +bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]') %> __global__ __launch_bounds__(${blockx*msplit}) void @@ -34,7 +36,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { % for kx in bchunks[0]: % if loop.index % msplit == cid: - bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)}; % endif % endfor @@ -64,7 +66,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if not loop.parent.last: % for kx in bchunks[bb + 1]: % if loop.index % msplit == cid: - bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)}; % endif % endfor % endif diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako index 35d336e..30f2c57 100644 --- a/gimmik/kernels/hip/bstream-msplit.mako +++ b/gimmik/kernels/hip/bstream-msplit.mako @@ -3,6 +3,8 @@ <% mx = partition(A, into=msplit, by='rows') bchunks = chunk(bix, bsz) +ntload = context.get('ntload', False) +bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]') %> __global__ __launch_bounds__(${blockx*msplit}) void @@ -34,7 +36,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { % for kx in bchunks[0]: % if loop.index % msplit == cid: - bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)}; % endif % endfor } @@ -51,7 +53,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if not loop.parent.last: % for kx in bchunks[bb + 1]: % if loop.index % msplit == cid: - bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; + bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)}; % endif % endfor % endif From 280e948c90da7643b511c5064ef9f51c784d27ba Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Wed, 24 Jun 2026 22:39:34 -0500 Subject: [PATCH 10/15] Use non-temporal B loads by default for HIP --- gimmik/hip.py | 17 ---- gimmik/kernels/hip/base.mako | 94 +++++-------------- .../kernels/hip/bstream-msplit-preload-c.mako | 16 ++-- gimmik/kernels/hip/bstream-msplit.mako | 16 ++-- gimmik/kernels/hip/bstream-preload-c.mako | 12 +-- gimmik/kernels/hip/bstream.mako | 12 +-- .../kernels/hip/cstream-ksplit-preload-c.mako | 14 +-- gimmik/kernels/hip/cstream-ksplit.mako | 10 +- gimmik/kernels/hip/cstream-preload-c.mako | 12 +-- gimmik/kernels/hip/cstream.mako | 8 +- 10 files changed, 72 insertions(+), 139 deletions(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index 860ed6c..57fa394 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -78,13 +78,6 @@ def emit(name, args, meta): } | wmeta yield from emit('bstream-msplit', args, meta) - # non-temporal B-load variant: B is read-once -> skip L2 alloc - nmeta = { - 'block': (blkx, ms, 1), 'shared': shared, - 'desc': f'bstream-msplit-ntb/{wpfx}m{ms}-b{bsz}-x{blkx}' - } | wmeta - yield from emit('bstream-msplit', args | {'ntload': True}, nmeta) - for ks in ksplits: # k-split B loading, C streaming kernel args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs @@ -119,16 +112,6 @@ def emit(name, args, meta): } | wmeta yield from emit('bstream-msplit-preload-c', args, meta) - # non-temporal B-load variant - nmeta = { - 'block': (blkx, ms, 1), 'shared': shared, - 'desc': ( - f'bstream-msplit-preload-c-ntb/' - f'{wpfx}m{ms}-b{bsz}-x{blkx}' - ) - } | wmeta - yield from emit('bstream-msplit-preload-c', args | {'ntload': True}, nmeta) - for ks in ksplits: # k-split B loading, C preloading, C streaming kernel args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako index 5291538..95e622f 100644 --- a/gimmik/kernels/hip/base.mako +++ b/gimmik/kernels/hip/base.mako @@ -1,77 +1,34 @@ % if dtype.endswith('4'): -static inline __device__ ${dtype} make_zero() -{ return make_${dtype}(0, 0, 0, 0); } -% elif dtype.endswith('2'): -static inline __device__ ${dtype} make_zero() -{ return make_${dtype}(0, 0); } -% else: -static inline __device__ ${dtype} make_zero() -{ return 0; } -% endif +inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b) +{ return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } -% if width == 1: -static inline __device__ ${dtype} -gimmik_vmul(${dtype} a, ${dtype} b) -{ - return a*b; -} +inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b) +{ return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); } -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return a + b; -} - -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return acc + a*b; -} -% elif width == 2: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y); -} +inline __device__ void operator+=(${dtype} &a, ${dtype} b) +{ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; } -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y); -} +inline __device__ ${dtype} make_zero() +{ return make_${dtype}(0, 0, 0, 0); } +% elif dtype.endswith('2'): +inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b) +{ return make_${dtype}(a.x + b.x, a.y + b.y); } -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y); -} -% elif width == 4: -static inline __device__ ${dtype} -gimmik_vmul(${dtype[:-1]} a, ${dtype} b) -{ - return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); -} +inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b) +{ return make_${dtype}(a*b.x, a*b.y); } -static inline __device__ ${dtype} -gimmik_vadd(${dtype} a, ${dtype} b) -{ - return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} +inline __device__ void operator+=(${dtype} &a, ${dtype} b) +{ a.x += b.x; a.y += b.y; } -static inline __device__ ${dtype} -gimmik_vmadd(${dtype} acc, ${dtype[:-1]} a, ${dtype} b) -{ - // Keep the multiply-add expression visible to the compiler. - return make_${dtype}(acc.x + a*b.x, acc.y + a*b.y, acc.z + a*b.z, acc.w + a*b.w); -} +inline __device__ ${dtype} make_zero() +{ return make_${dtype}(0, 0); } % else: -#error "HIP vector helpers only support width=2 or width=4" +inline __device__ ${dtype} make_zero() +{ return 0; } % endif static inline __device__ void -nt_store_c(${dtype}* p, ${dtype} v) +nt_store(${dtype}* p, ${dtype} v) { % if dtype.endswith('4'): __builtin_nontemporal_store(v.x, &p->x); @@ -87,7 +44,7 @@ nt_store_c(${dtype}* p, ${dtype} v) } static inline __device__ ${dtype} -nt_load_c(const ${dtype}* p) +nt_load(const ${dtype}* p) { % if dtype.endswith('4'): return make_${dtype}(__builtin_nontemporal_load(&p->x), @@ -105,22 +62,19 @@ nt_load_c(const ${dtype}* p) static inline __device__ void store_c(${dtype}* p, ${dtype} v) { - nt_store_c(p, v); + nt_store(p, v); } static inline __device__ ${dtype} load_c(const ${dtype}* p) { - return nt_load_c(p); + return nt_load(p); } static inline __device__ ${dtype} load_b(const ${dtype}* p) { - // B is read-once (reused only within the block via LDS, never re-read across - // blocks), so load it non-temporally to avoid polluting L2 -- the read-side - // twin of the non-temporal C store. - return nt_load_c(p); + return nt_load(p); } ${next.body()} diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako index 36783c0..e58fa43 100644 --- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako +++ b/gimmik/kernels/hip/bstream-msplit-preload-c.mako @@ -3,8 +3,6 @@ <% mx = partition(A, into=msplit, by='rows') bchunks = chunk(bix, bsz) -ntload = context.get('ntload', False) -bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]') %> __global__ __launch_bounds__(${blockx*msplit}) void @@ -36,7 +34,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { % for kx in bchunks[0]: % if loop.index % msplit == cid: - bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)}; + bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); % endif % endfor @@ -47,7 +45,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if beta == 1: csub[${j}] = load_c(&c[i + ${jx}*ldc]); % else: - csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${jx}*ldc])); + csub[${j}] = ${beta}*load_c(&c[i + ${jx}*ldc]); % endif % endif % endfor @@ -66,7 +64,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if not loop.parent.last: % for kx in bchunks[bb + 1]: % if loop.index % msplit == cid: - bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)}; + bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); % endif % endfor % endif @@ -76,12 +74,12 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % for j, jx in enumerate(A[mcx, kx]): % if beta == 0: % if jx != 0 and kx == afix[mcx[j]]: - csub[${j}] = gimmik_vmul(${jx}, bv); + csub[${j}] = ${jx}*bv; % elif jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + csub[${j}] += ${jx}*bv; % endif % elif jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + csub[${j}] += ${jx}*bv; % endif ## If we're done with this dot product then store to global % if kx == alix[mcx[j]]: @@ -95,7 +93,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if jx == -1 and j % msplit == cid and beta == 0: store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and j % msplit == cid and beta != 1: - store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endfor % endif diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako index 30f2c57..0681783 100644 --- a/gimmik/kernels/hip/bstream-msplit.mako +++ b/gimmik/kernels/hip/bstream-msplit.mako @@ -3,8 +3,6 @@ <% mx = partition(A, into=msplit, by='rows') bchunks = chunk(bix, bsz) -ntload = context.get('ntload', False) -bload = (lambda kx: f'load_b(&b[i + {kx}*ldb])') if ntload else (lambda kx: f'b[i + {kx}*ldb]') %> __global__ __launch_bounds__(${blockx*msplit}) void @@ -36,7 +34,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { % for kx in bchunks[0]: % if loop.index % msplit == cid: - bsub[0][${loop.index}][threadIdx.x] = ${bload(kx)}; + bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); % endif % endfor } @@ -53,7 +51,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if not loop.parent.last: % for kx in bchunks[bb + 1]: % if loop.index % msplit == cid: - bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = ${bload(kx)}; + bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); % endif % endfor % endif @@ -62,17 +60,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; % for j, jx in enumerate(A[mcx, kx]): % if jx != 0 and kx == afix[mcx[j]]: - csub[${j}] = gimmik_vmul(${jx}, bv); + csub[${j}] = ${jx}*bv; % elif jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + csub[${j}] += ${jx}*bv; % endif ## If we're done with this dot product then store to global % if kx == alix[mcx[j]] and beta == 0: store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); % elif kx == alix[mcx[j]] and beta == 1: - store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(load_c(&c[i + ${mcx[j]}*ldc]), csub[${j}])); + store_c(&c[i + ${mcx[j]}*ldc], load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]); % elif kx == alix[mcx[j]]: - store_c(&c[i + ${mcx[j]}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${mcx[j]}*ldc])))); + store_c(&c[i + ${mcx[j]}*ldc], csub[${j}] + ${beta}*load_c(&c[i + ${mcx[j]}*ldc])); % endif % endfor % endfor @@ -82,7 +80,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if jx == -1 and j % msplit == cid and beta == 0: store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and j % msplit == cid and beta != 1: - store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endfor % endif diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako index 095be83..a7140c6 100644 --- a/gimmik/kernels/hip/bstream-preload-c.mako +++ b/gimmik/kernels/hip/bstream-preload-c.mako @@ -31,7 +31,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if beta == 1: csub[${j}] = load_c(&c[i + ${j}*ldc]); % else: - csub[${j}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])); + csub[${j}] = ${beta}*load_c(&c[i + ${j}*ldc]); % endif % endif % endfor @@ -39,16 +39,16 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Iterate through the used rows of B % for kx in bix: - bv = b[i + ${kx}*ldb]; + bv = load_b(&b[i + ${kx}*ldb]); % for j, jx in enumerate(A[:, kx]): % if beta == 0: % if jx != 0 and kx == afix[j]: - csub[${j}] = gimmik_vmul(${jx}, bv); + csub[${j}] = ${jx}*bv; % elif jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + csub[${j}] += ${jx}*bv; % endif % elif jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + csub[${j}] += ${jx}*bv; % endif ## % if kx == alix[j]: @@ -62,7 +62,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if jx == -1 and beta == 0: store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and beta != 1: - store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endfor } diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako index 427dffc..10df790 100644 --- a/gimmik/kernels/hip/bstream.mako +++ b/gimmik/kernels/hip/bstream.mako @@ -26,20 +26,20 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Iterare through the used rows of B % for kx in bix: - bv = b[i + ${kx}*ldb]; + bv = load_b(&b[i + ${kx}*ldb]); % for j, jx in enumerate(A[:, kx]): % if jx != 0 and kx == afix[j]: - csub[${j}] = gimmik_vmul(${jx}, bv); + csub[${j}] = ${jx}*bv; % elif jx != 0: - csub[${j}] = gimmik_vmadd(csub[${j}], ${jx}, bv); + csub[${j}] += ${jx}*bv; % endif ## % if kx == alix[j] and beta == 0: store_c(&c[i + ${j}*ldc], csub[${j}]); % elif kx == alix[j] and beta == 1: - store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), csub[${j}])); + store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + csub[${j}]); % elif kx == alix[j]: - store_c(&c[i + ${j}*ldc], gimmik_vadd(csub[${j}], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])))); + store_c(&c[i + ${j}*ldc], csub[${j}] + ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endfor % endfor @@ -49,7 +49,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if jx == -1 and beta == 0: store_c(&c[i + ${j}*ldc], make_zero()); % elif jx == -1 and beta != 1: - store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endfor } diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako index 4550700..b922666 100644 --- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako +++ b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako @@ -48,9 +48,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) has_dotp = A[j].any() if nzixs: first_l_idx, first_kx = nzixs[0] - dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])" + dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]" for l_idx, kx in nzixs[1:]: - dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])" + dotex = f"{dotex} + {A[j, kx]}*bv[{l_idx}]" else: dotex = 'make_zero()' %> @@ -61,10 +61,10 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) cv[${loop.index // ksplit}] = dotp; % elif beta == 1 and has_dotp: cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]); - cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); + cv[${loop.index // ksplit}] += dotp; % elif has_dotp: - cv[${loop.index // ksplit}] = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])); - cv[${loop.index // ksplit}] = gimmik_vadd(cv[${loop.index // ksplit}], dotp); + cv[${loop.index // ksplit}] = ${beta}*load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] += dotp; % endif ## Save to shared memory % else: @@ -85,7 +85,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) <% sum_expr = f"cv[{loop.index // ksplit}]" for s_idx in range(ksplit - 1): - sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" + sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]" %> % if beta == 0: dotp = ${sum_expr}; @@ -97,7 +97,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) dotp = ${sum_expr}; store_c(&c[i + ${j}*ldc], dotp); % elif beta != 1: - store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako index 9c5fc09..5fc6711 100644 --- a/gimmik/kernels/hip/cstream-ksplit.mako +++ b/gimmik/kernels/hip/cstream-ksplit.mako @@ -47,9 +47,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] if nzixs: first_l_idx, first_kx = nzixs[0] - dotex = f"gimmik_vmul({A[j, first_kx]}, bv[{first_l_idx}])" + dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]" for l_idx, kx in nzixs[1:]: - dotex = f"gimmik_vmadd({dotex}, {A[j, kx]}, bv[{l_idx}])" + dotex = f"{dotex} + {A[j, kx]}*bv[{l_idx}]" else: dotex = 'make_zero()' %> @@ -75,15 +75,15 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) <% sum_expr = f"cv[{loop.index // ksplit}]" for s_idx in range(ksplit - 1): - sum_expr = f"gimmik_vadd({sum_expr}, csub[{s_idx}][{loop.index}][threadIdx.x])" + sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]" %> dotp = ${sum_expr}; % if beta == 0: store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1: - store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp)); + store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp); % else: - store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])))); + store_c(&c[i + ${j}*ldc], dotp + ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako index a8d7e31..c9a83b0 100644 --- a/gimmik/kernels/hip/cstream-preload-c.mako +++ b/gimmik/kernels/hip/cstream-preload-c.mako @@ -30,9 +30,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) nzixs = [kx for kx, val in enumerate(jx) if val != 0] if nzixs: first_kx = nzixs[0] - dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])" + dotex = f"{jx[first_kx]}*b[i + {first_kx}*ldb]" for kx in nzixs[1:]: - dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])" + dotex = f"{dotex} + {jx[kx]}*b[i + {kx}*ldb]" else: dotex = 'make_zero()' %> @@ -42,18 +42,18 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1: dotp = load_c(&c[i + ${j}*ldc]); - dotp = gimmik_vadd(dotp, ${dotex}); + dotp += ${dotex}; store_c(&c[i + ${j}*ldc], dotp); % else: - dotp = gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])); - dotp = gimmik_vadd(dotp, ${dotex}); + dotp = ${beta}*load_c(&c[i + ${j}*ldc]); + dotp += ${dotex}; store_c(&c[i + ${j}*ldc], dotp); % endif % else: % if beta == 0: store_c(&c[i + ${j}*ldc], make_zero()); % elif beta != 1: - store_c(&c[i + ${j}*ldc], gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc]))); + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endif % endfor diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako index 1b7b312..4ea995e 100644 --- a/gimmik/kernels/hip/cstream.mako +++ b/gimmik/kernels/hip/cstream.mako @@ -30,9 +30,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) nzixs = [kx for kx, val in enumerate(jx) if val != 0] if nzixs: first_kx = nzixs[0] - dotex = f"gimmik_vmul({jx[first_kx]}, b[i + {first_kx}*ldb])" + dotex = f"{jx[first_kx]}*b[i + {first_kx}*ldb]" for kx in nzixs[1:]: - dotex = f"gimmik_vmadd({dotex}, {jx[kx]}, b[i + {kx}*ldb])" + dotex = f"{dotex} + {jx[kx]}*b[i + {kx}*ldb]" else: dotex = 'make_zero()' %> @@ -43,9 +43,9 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % if beta == 0: store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1 and nzixs: - store_c(&c[i + ${j}*ldc], gimmik_vadd(load_c(&c[i + ${j}*ldc]), dotp)); + store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp); % else: - store_c(&c[i + ${j}*ldc], gimmik_vadd(dotp, gimmik_vmul(${beta}, load_c(&c[i + ${j}*ldc])))); + store_c(&c[i + ${j}*ldc], dotp + ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endfor } From c06216d0e09f76313acf0cf53cccd5529d7f6eab Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Wed, 24 Jun 2026 23:00:57 -0500 Subject: [PATCH 11/15] Make HIP preload-C a template option --- gimmik/hip.py | 11 +- .../kernels/hip/bstream-msplit-preload-c.mako | 104 ----------------- gimmik/kernels/hip/bstream-msplit.mako | 22 +++- gimmik/kernels/hip/bstream-preload-c.mako | 69 ----------- gimmik/kernels/hip/bstream.mako | 25 +++- .../kernels/hip/cstream-ksplit-preload-c.mako | 108 ------------------ gimmik/kernels/hip/cstream-ksplit.mako | 28 ++++- gimmik/kernels/hip/cstream-preload-c.mako | 61 ---------- gimmik/kernels/hip/cstream.mako | 25 +++- 9 files changed, 96 insertions(+), 357 deletions(-) delete mode 100644 gimmik/kernels/hip/bstream-msplit-preload-c.mako delete mode 100644 gimmik/kernels/hip/bstream-preload-c.mako delete mode 100644 gimmik/kernels/hip/cstream-ksplit-preload-c.mako delete mode 100644 gimmik/kernels/hip/cstream-preload-c.mako diff --git a/gimmik/hip.py b/gimmik/hip.py index 57fa394..1fceb01 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -19,6 +19,9 @@ def emit(name, args, meta): if threads <= max_block_threads and shared <= max_shared: yield (name, args, meta) + def emit_preload(name, args, meta): + yield from emit(name, args | {'preload': True}, meta) + blkx = self.basemeta['block'][0] # B loading, C streaming kernel @@ -92,12 +95,12 @@ def emit(name, args, meta): args = {'blockx': blkx} | wargs meta = {'block': (blkx, 1, 1), 'desc': f'cstream-preload-c/{wpfx}x{blkx}'} | wmeta - yield from emit('cstream-preload-c', args, meta) + yield from emit_preload('cstream', args, meta) # B streaming, C preloading, C accumulation kernel meta = {'block': (blkx, 1, 1), 'desc': f'bstream-preload-c/{wpfx}x{blkx}'} | wmeta - yield from emit('bstream-preload-c', args, meta) + yield from emit_preload('bstream', args, meta) for ms in msplits: # m-split B streaming, C preloading, C accumulation kernel @@ -110,7 +113,7 @@ def emit(name, args, meta): f'{wpfx}m{ms}-b{bsz}-x{blkx}' ) } | wmeta - yield from emit('bstream-msplit-preload-c', args, meta) + yield from emit_preload('bstream-msplit', args, meta) for ks in ksplits: # k-split B loading, C preloading, C streaming kernel @@ -123,7 +126,7 @@ def emit(name, args, meta): f'{wpfx}k{ks}-c{csz}-x{blkx}' ) } | wmeta - yield from emit('cstream-ksplit-preload-c', args, meta) + yield from emit_preload('cstream-ksplit', args, meta) def _process_meta(self, meta): if self.n is not None: diff --git a/gimmik/kernels/hip/bstream-msplit-preload-c.mako b/gimmik/kernels/hip/bstream-msplit-preload-c.mako deleted file mode 100644 index e58fa43..0000000 --- a/gimmik/kernels/hip/bstream-msplit-preload-c.mako +++ /dev/null @@ -1,104 +0,0 @@ -<%inherit file='base'/> - -<% -mx = partition(A, into=msplit, by='rows') -bchunks = chunk(bix, bsz) -%> - -__global__ __launch_bounds__(${blockx*msplit}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - int i = blockDim.x*blockIdx.x + threadIdx.x; - - ${dtype} bv, csub[${-(-m // msplit)}]; - __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; - -## Fill the initial shared memory block -% for cid in range(msplit): - if (i < n && threadIdx.y == ${cid}) - { - % for kx in bchunks[0]: - % if loop.index % msplit == cid: - bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); - % endif - % endfor - - % if beta != 0: - ## Preload C values for active rows owned by this m-split lane - % for j, jx in enumerate(mx[cid]): - % if afix[jx] != -1: - % if beta == 1: - csub[${j}] = load_c(&c[i + ${jx}*ldc]); - % else: - csub[${j}] = ${beta}*load_c(&c[i + ${jx}*ldc]); - % endif - % endif - % endfor - % endif - } -% endfor - __syncthreads(); - -## Iterate over each row-chunk of B -% for bb in range(len(bchunks)): - ## Iterate over each row-chunk of C - % for cid, mcx in enumerate(mx): - if (i < n && threadIdx.y == ${cid}) - { - ## Start filling the next shared memory block - % if not loop.parent.last: - % for kx in bchunks[bb + 1]: - % if loop.index % msplit == cid: - bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); - % endif - % endfor - % endif - ## Accumulate our dot products - % for kx in bchunks[bb]: - bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; - % for j, jx in enumerate(A[mcx, kx]): - % if beta == 0: - % if jx != 0 and kx == afix[mcx[j]]: - csub[${j}] = ${jx}*bv; - % elif jx != 0: - csub[${j}] += ${jx}*bv; - % endif - % elif jx != 0: - csub[${j}] += ${jx}*bv; - % endif - ## If we're done with this dot product then store to global - % if kx == alix[mcx[j]]: - store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); - % endif - % endfor - % endfor - ## Handle rows of A which are all zero - % if loop.parent.last: - % for j, jx in enumerate(afix): - % if jx == -1 and j % msplit == cid and beta == 0: - store_c(&c[i + ${j}*ldc], make_zero()); - % elif jx == -1 and j % msplit == cid and beta != 1: - store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); - % endif - % endfor - % endif - } - % endfor - __syncthreads(); -% endfor -} diff --git a/gimmik/kernels/hip/bstream-msplit.mako b/gimmik/kernels/hip/bstream-msplit.mako index 0681783..52853f4 100644 --- a/gimmik/kernels/hip/bstream-msplit.mako +++ b/gimmik/kernels/hip/bstream-msplit.mako @@ -3,6 +3,7 @@ <% mx = partition(A, into=msplit, by='rows') bchunks = chunk(bix, bsz) +preload = context.get('preload', False) %> __global__ __launch_bounds__(${blockx*msplit}) void @@ -37,6 +38,19 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) bsub[0][${loop.index}][threadIdx.x] = load_b(&b[i + ${kx}*ldb]); % endif % endfor + + % if preload and beta != 0: + ## Preload C values for active rows owned by this m-split lane + % for j, jx in enumerate(mx[cid]): + % if afix[jx] != -1: + % if beta == 1: + csub[${j}] = load_c(&c[i + ${jx}*ldc]); + % else: + csub[${j}] = ${beta}*load_c(&c[i + ${jx}*ldc]); + % endif + % endif + % endfor + % endif } % endfor __syncthreads(); @@ -59,13 +73,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % for kx in bchunks[bb]: bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; % for j, jx in enumerate(A[mcx, kx]): - % if jx != 0 and kx == afix[mcx[j]]: + % if preload and beta != 0 and jx != 0: + csub[${j}] += ${jx}*bv; + % elif jx != 0 and kx == afix[mcx[j]]: csub[${j}] = ${jx}*bv; % elif jx != 0: csub[${j}] += ${jx}*bv; % endif ## If we're done with this dot product then store to global - % if kx == alix[mcx[j]] and beta == 0: + % if preload and kx == alix[mcx[j]]: + store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); + % elif kx == alix[mcx[j]] and beta == 0: store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); % elif kx == alix[mcx[j]] and beta == 1: store_c(&c[i + ${mcx[j]}*ldc], load_c(&c[i + ${mcx[j]}*ldc]) + csub[${j}]); diff --git a/gimmik/kernels/hip/bstream-preload-c.mako b/gimmik/kernels/hip/bstream-preload-c.mako deleted file mode 100644 index a7140c6..0000000 --- a/gimmik/kernels/hip/bstream-preload-c.mako +++ /dev/null @@ -1,69 +0,0 @@ -<%inherit file='base'/> - -__global__ __launch_bounds__(${blockx}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i < n) - { - ${dtype} bv, csub[${m}]; - -% if beta != 0: -## Preload C values for rows which will receive a non-zero dot product -% for j, jx in enumerate(afix): - % if jx != -1: - % if beta == 1: - csub[${j}] = load_c(&c[i + ${j}*ldc]); - % else: - csub[${j}] = ${beta}*load_c(&c[i + ${j}*ldc]); - % endif - % endif -% endfor -% endif - -## Iterate through the used rows of B -% for kx in bix: - bv = load_b(&b[i + ${kx}*ldb]); - % for j, jx in enumerate(A[:, kx]): - % if beta == 0: - % if jx != 0 and kx == afix[j]: - csub[${j}] = ${jx}*bv; - % elif jx != 0: - csub[${j}] += ${jx}*bv; - % endif - % elif jx != 0: - csub[${j}] += ${jx}*bv; - % endif - ## - % if kx == alix[j]: - store_c(&c[i + ${j}*ldc], csub[${j}]); - % endif - % endfor -% endfor - -## Handle rows of A which are all zero -% for j, jx in enumerate(afix): - % if jx == -1 and beta == 0: - store_c(&c[i + ${j}*ldc], make_zero()); - % elif jx == -1 and beta != 1: - store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); - % endif -% endfor - } -} diff --git a/gimmik/kernels/hip/bstream.mako b/gimmik/kernels/hip/bstream.mako index 10df790..1e7a70b 100644 --- a/gimmik/kernels/hip/bstream.mako +++ b/gimmik/kernels/hip/bstream.mako @@ -1,5 +1,7 @@ <%inherit file='base'/> +<% preload = context.get('preload', False) %> + __global__ __launch_bounds__(${blockx}) void % if n is None: ${kname}(int n, @@ -24,17 +26,34 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { ${dtype} bv, csub[${m}]; -## Iterare through the used rows of B +% if preload and beta != 0: +## Preload C values for rows which will receive a non-zero dot product +% for j, jx in enumerate(afix): + % if jx != -1: + % if beta == 1: + csub[${j}] = load_c(&c[i + ${j}*ldc]); + % else: + csub[${j}] = ${beta}*load_c(&c[i + ${j}*ldc]); + % endif + % endif +% endfor +% endif + +## Iterate through the used rows of B % for kx in bix: bv = load_b(&b[i + ${kx}*ldb]); % for j, jx in enumerate(A[:, kx]): - % if jx != 0 and kx == afix[j]: + % if preload and beta != 0 and jx != 0: + csub[${j}] += ${jx}*bv; + % elif jx != 0 and kx == afix[j]: csub[${j}] = ${jx}*bv; % elif jx != 0: csub[${j}] += ${jx}*bv; % endif ## - % if kx == alix[j] and beta == 0: + % if preload and kx == alix[j]: + store_c(&c[i + ${j}*ldc], csub[${j}]); + % elif kx == alix[j] and beta == 0: store_c(&c[i + ${j}*ldc], csub[${j}]); % elif kx == alix[j] and beta == 1: store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + csub[${j}]); diff --git a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako b/gimmik/kernels/hip/cstream-ksplit-preload-c.mako deleted file mode 100644 index b922666..0000000 --- a/gimmik/kernels/hip/cstream-ksplit-preload-c.mako +++ /dev/null @@ -1,108 +0,0 @@ -<%inherit file='base'/> - -<% -kparts = partition(A, ksplit, by='cols') -cchunks = chunk(range(m), csz) -loaded = set() -%> - -__global__ __launch_bounds__(${blockx*ksplit}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - int i = blockDim.x*blockIdx.x + threadIdx.x; - - ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; - __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; - -## Iterate over the row-partitions of C -% for cchunk in cchunks: - ## Iterate over the row-partitions of B - % for bid, kbx in enumerate(kparts): - if (i < n && threadIdx.y == ${bid}) - { - ## Evaluate our partial dot products - % for j in cchunk: - ## Load in any missing parts of B - % for kx in kbx: - % if A[j, kx] != 0 and kx not in loaded: - bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> - % endif - % endfor - <% - nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] - has_dotp = A[j].any() - if nzixs: - first_l_idx, first_kx = nzixs[0] - dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]" - for l_idx, kx in nzixs[1:]: - dotex = f"{dotex} + {A[j, kx]}*bv[{l_idx}]" - else: - dotex = 'make_zero()' - %> - dotp = ${dotex}; - ## Save to a register - % if loop.index % ksplit == bid: - % if beta == 0: - cv[${loop.index // ksplit}] = dotp; - % elif beta == 1 and has_dotp: - cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]); - cv[${loop.index // ksplit}] += dotp; - % elif has_dotp: - cv[${loop.index // ksplit}] = ${beta}*load_c(&c[i + ${j}*ldc]); - cv[${loop.index // ksplit}] += dotp; - % endif - ## Save to shared memory - % else: - csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; - % endif - % endfor - } - % endfor - __syncthreads(); - ## Iterate over the column-partitions of B - % for bid, kbx in enumerate(kparts): - if (i < n && threadIdx.y == ${bid}) - { - ## Sum and output the final set of dot products - % for j in cchunk: - % if loop.index % ksplit == bid: - <% has_dotp = A[j].any() %> - <% - sum_expr = f"cv[{loop.index // ksplit}]" - for s_idx in range(ksplit - 1): - sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]" - %> - % if beta == 0: - dotp = ${sum_expr}; - store_c(&c[i + ${j}*ldc], dotp); - % elif beta == 1 and has_dotp: - dotp = ${sum_expr}; - store_c(&c[i + ${j}*ldc], dotp); - % elif beta != 1 and has_dotp: - dotp = ${sum_expr}; - store_c(&c[i + ${j}*ldc], dotp); - % elif beta != 1: - store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); - % endif - % endif - % endfor - } - % endfor - __syncthreads(); -% endfor -} diff --git a/gimmik/kernels/hip/cstream-ksplit.mako b/gimmik/kernels/hip/cstream-ksplit.mako index 5fc6711..12c59ba 100644 --- a/gimmik/kernels/hip/cstream-ksplit.mako +++ b/gimmik/kernels/hip/cstream-ksplit.mako @@ -4,6 +4,7 @@ kparts = partition(A, ksplit, by='cols') cchunks = chunk(range(m), csz) loaded = set() +preload = context.get('preload', False) %> __global__ __launch_bounds__(${blockx*ksplit}) void @@ -45,6 +46,7 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) % endfor <% nzixs = [(l_idx, kbx[l_idx]) for l_idx in A[j, kbx].nonzero()[0]] + has_dotp = A[j].any() if nzixs: first_l_idx, first_kx = nzixs[0] dotex = f"{A[j, first_kx]}*bv[{first_l_idx}]" @@ -56,7 +58,17 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) dotp = ${dotex}; ## Save to a register % if loop.index % ksplit == bid: + % if preload and beta == 0: cv[${loop.index // ksplit}] = dotp; + % elif preload and beta == 1 and has_dotp: + cv[${loop.index // ksplit}] = load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] += dotp; + % elif preload and has_dotp: + cv[${loop.index // ksplit}] = ${beta}*load_c(&c[i + ${j}*ldc]); + cv[${loop.index // ksplit}] += dotp; + % elif not preload: + cv[${loop.index // ksplit}] = dotp; + % endif ## Save to shared memory % else: csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; @@ -72,17 +84,31 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) ## Sum and output the final set of dot products % for j in cchunk: % if loop.index % ksplit == bid: + <% has_dotp = A[j].any() %> <% sum_expr = f"cv[{loop.index // ksplit}]" for s_idx in range(ksplit - 1): sum_expr = f"{sum_expr} + csub[{s_idx}][{loop.index}][threadIdx.x]" %> + % if preload and beta == 0: + dotp = ${sum_expr}; + store_c(&c[i + ${j}*ldc], dotp); + % elif preload and beta == 1 and has_dotp: + dotp = ${sum_expr}; + store_c(&c[i + ${j}*ldc], dotp); + % elif preload and beta != 1 and has_dotp: + dotp = ${sum_expr}; + store_c(&c[i + ${j}*ldc], dotp); + % elif preload and beta != 1: + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); + % elif beta == 0: dotp = ${sum_expr}; - % if beta == 0: store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1: + dotp = ${sum_expr}; store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp); % else: + dotp = ${sum_expr}; store_c(&c[i + ${j}*ldc], dotp + ${beta}*load_c(&c[i + ${j}*ldc])); % endif % endif diff --git a/gimmik/kernels/hip/cstream-preload-c.mako b/gimmik/kernels/hip/cstream-preload-c.mako deleted file mode 100644 index c9a83b0..0000000 --- a/gimmik/kernels/hip/cstream-preload-c.mako +++ /dev/null @@ -1,61 +0,0 @@ -<%inherit file='base'/> - -<% ksplit = 2 if m < 36 else 1 %> - -__global__ __launch_bounds__(${blockx}) void -% if n is None: -${kname}(int n, - const ${dtype}* __restrict__ b, int ldb, - ${dtype}* __restrict__ c, int ldc) -{ - % if width > 1: - n = (n + ${width} - 1) / ${width}; - ldb /= ${width}; - ldc /= ${width}; - % endif -% else: -${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) -{ - const int n = ${-(-n // width)}; - const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; - const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; -% endif - const int i = blockDim.x*blockIdx.x + threadIdx.x; - ${dtype} dotp; - - if (i < n) - { -% for j, jx in enumerate(A): - <% - nzixs = [kx for kx, val in enumerate(jx) if val != 0] - if nzixs: - first_kx = nzixs[0] - dotex = f"{jx[first_kx]}*b[i + {first_kx}*ldb]" - for kx in nzixs[1:]: - dotex = f"{dotex} + {jx[kx]}*b[i + {kx}*ldb]" - else: - dotex = 'make_zero()' - %> - % if nzixs: - % if beta == 0: - dotp = ${dotex}; - store_c(&c[i + ${j}*ldc], dotp); - % elif beta == 1: - dotp = load_c(&c[i + ${j}*ldc]); - dotp += ${dotex}; - store_c(&c[i + ${j}*ldc], dotp); - % else: - dotp = ${beta}*load_c(&c[i + ${j}*ldc]); - dotp += ${dotex}; - store_c(&c[i + ${j}*ldc], dotp); - % endif - % else: - % if beta == 0: - store_c(&c[i + ${j}*ldc], make_zero()); - % elif beta != 1: - store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); - % endif - % endif -% endfor - } -} diff --git a/gimmik/kernels/hip/cstream.mako b/gimmik/kernels/hip/cstream.mako index 4ea995e..2ee9574 100644 --- a/gimmik/kernels/hip/cstream.mako +++ b/gimmik/kernels/hip/cstream.mako @@ -1,6 +1,8 @@ <%inherit file='base'/> -<% ksplit = 2 if m < 36 else 1 %> +<% +preload = context.get('preload', False) +%> __global__ __launch_bounds__(${blockx}) void % if n is None: @@ -37,10 +39,23 @@ ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) dotex = 'make_zero()' %> dotp = ${dotex}; - % if not nzixs: - dotp = make_zero(); - % endif - % if beta == 0: + % if preload and nzixs: + % if beta == 0: + store_c(&c[i + ${j}*ldc], dotp); + % elif beta == 1: + dotp = load_c(&c[i + ${j}*ldc]) + dotp; + store_c(&c[i + ${j}*ldc], dotp); + % else: + dotp = ${beta}*load_c(&c[i + ${j}*ldc]) + dotp; + store_c(&c[i + ${j}*ldc], dotp); + % endif + % elif preload: + % if beta == 0: + store_c(&c[i + ${j}*ldc], make_zero()); + % elif beta != 1: + store_c(&c[i + ${j}*ldc], ${beta}*load_c(&c[i + ${j}*ldc])); + % endif + % elif beta == 0: store_c(&c[i + ${j}*ldc], dotp); % elif beta == 1 and nzixs: store_c(&c[i + ${j}*ldc], load_c(&c[i + ${j}*ldc]) + dotp); From e014e4d40c1745e2b803e0f85ea3ce92da1aee50 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Wed, 24 Jun 2026 23:21:08 -0500 Subject: [PATCH 12/15] Avoid HIP vector operator+= overloads --- gimmik/kernels/hip/base.mako | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gimmik/kernels/hip/base.mako b/gimmik/kernels/hip/base.mako index 95e622f..d67ee25 100644 --- a/gimmik/kernels/hip/base.mako +++ b/gimmik/kernels/hip/base.mako @@ -5,9 +5,6 @@ inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b) inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b) { return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); } -inline __device__ void operator+=(${dtype} &a, ${dtype} b) -{ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; } - inline __device__ ${dtype} make_zero() { return make_${dtype}(0, 0, 0, 0); } % elif dtype.endswith('2'): @@ -17,9 +14,6 @@ inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b) inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b) { return make_${dtype}(a*b.x, a*b.y); } -inline __device__ void operator+=(${dtype} &a, ${dtype} b) -{ a.x += b.x; a.y += b.y; } - inline __device__ ${dtype} make_zero() { return make_${dtype}(0, 0); } % else: From f6bc30882c7fe8c36da58d4763a2b32089b13a49 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Thu, 25 Jun 2026 02:02:37 -0500 Subject: [PATCH 13/15] Prune HIP tuned variants to 12 Reduce the HIP tuned kernel search space from 28 variants to 12 and order the remaining variants to try common winners earlier. --- gimmik/hip.py | 54 ++------------------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index 1fceb01..b55e3fe 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -24,35 +24,17 @@ def emit_preload(name, args, meta): blkx = self.basemeta['block'][0] - # B loading, C streaming kernel - yield from emit('cstream', {'blockx': blkx}, {}) - - # B streaming, C accumulation kernel - yield from emit('bstream', {'blockx': blkx}, {}) - - # Four-way m-split B streaming, C accumulation kernel - ms, bsz, blkx = 4, 24, 64 - args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} - meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize} - yield from emit('bstream-msplit', args, meta) - - # Two-way k-split B loading, C streaming kernel - ks, csz, blkx = 2, 24, 64 - args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} - meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} - yield from emit('cstream-ksplit', args, meta) - # Only emit tuned variants on architectures they have been validated for. base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None if base_arch not in {'gfx90a', 'gfx942'} or warp_size != 64: return # Tuned HIP variants - msplits, ksplits = [4, 8], [2, 4] + msplits, ksplits = [8, 4], [4, 2] bsz, csz, blkx = 8, 8, 64 widths = [1] if self.aligne is not None and self.aligne % 2 == 0: - widths.append(2) + widths.insert(0, 2) for width in widths: wargs = ({'dtype': f'{dtype}{width}', 'width': width} @@ -60,17 +42,6 @@ def emit_preload(name, args, meta): wmeta = {'width': width} if width > 1 else {} wpfx = f'w{width}-' if width > 1 else '' - # B loading, C streaming kernel - args = {'blockx': blkx} | wargs - meta = {'block': (blkx, 1, 1), - 'desc': f'cstream/{wpfx}x{blkx}'} | wmeta - yield from emit('cstream', args, meta) - - # B streaming, C accumulation kernel - meta = {'block': (blkx, 1, 1), - 'desc': f'bstream/{wpfx}x{blkx}'} | wmeta - yield from emit('bstream', args, meta) - for ms in msplits: # m-split B streaming, C accumulation kernel args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs @@ -81,27 +52,6 @@ def emit_preload(name, args, meta): } | wmeta yield from emit('bstream-msplit', args, meta) - for ks in ksplits: - # k-split B loading, C streaming kernel - args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} | wargs - shared = (ks - 1)*csz*blkx*dsize*width - meta = { - 'block': (blkx, ks, 1), 'shared': shared, - 'desc': f'cstream-ksplit/{wpfx}k{ks}-c{csz}-x{blkx}' - } | wmeta - yield from emit('cstream-ksplit', args, meta) - - # B loading, C preloading, C streaming kernel - args = {'blockx': blkx} | wargs - meta = {'block': (blkx, 1, 1), - 'desc': f'cstream-preload-c/{wpfx}x{blkx}'} | wmeta - yield from emit_preload('cstream', args, meta) - - # B streaming, C preloading, C accumulation kernel - meta = {'block': (blkx, 1, 1), - 'desc': f'bstream-preload-c/{wpfx}x{blkx}'} | wmeta - yield from emit_preload('bstream', args, meta) - for ms in msplits: # m-split B streaming, C preloading, C accumulation kernel args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} | wargs From a3aee45280334d46dbfb578a03a90494f64f2041 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Thu, 25 Jun 2026 02:19:05 -0500 Subject: [PATCH 14/15] Remove HIP variant arch gate --- gimmik/hip.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index b55e3fe..9365938 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -24,11 +24,6 @@ def emit_preload(name, args, meta): blkx = self.basemeta['block'][0] - # Only emit tuned variants on architectures they have been validated for. - base_arch = gcn_arch.split(':', 1)[0] if gcn_arch else None - if base_arch not in {'gfx90a', 'gfx942'} or warp_size != 64: - return - # Tuned HIP variants msplits, ksplits = [8, 4], [4, 2] bsz, csz, blkx = 8, 8, 64 From 2c7af9b3dbbe022d1cffe5e4845ead079d398320 Mon Sep 17 00:00:00 2001 From: tomjen12 Date: Thu, 25 Jun 2026 11:18:27 +0000 Subject: [PATCH 15/15] Restore MI355 HIP baseline variants --- gimmik/hip.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/gimmik/hip.py b/gimmik/hip.py index 9365938..142a799 100644 --- a/gimmik/hip.py +++ b/gimmik/hip.py @@ -22,7 +22,21 @@ def emit(name, args, meta): def emit_preload(name, args, meta): yield from emit(name, args | {'preload': True}, meta) - blkx = self.basemeta['block'][0] + ms, bsz, blkx = 4, 24, 64 + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} + meta = { + 'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize, + 'desc': f'bstream-msplit/m{ms}-b{bsz}-x{blkx}' + } + yield from emit('bstream-msplit', args, meta) + + ks, csz, blkx = 2, 24, 64 + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} + meta = { + 'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize, + 'desc': f'cstream-ksplit/k{ks}-c{csz}-x{blkx}' + } + yield from emit('cstream-ksplit', args, meta) # Tuned HIP variants msplits, ksplits = [8, 4], [4, 2]