Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
fe50a9e
refactor(stark): extract Merkle commitment into a commitment module
diegokingston Jun 19, 2026
5cd227a
perf(stark): row-pair the trace commitment (one Merkle path per query)
diegokingston Jun 19, 2026
f26a13a
refactor(stark): prover cleanup — par helpers, ROWS_PER_LEAF, error p…
diegokingston Jun 19, 2026
256e69c
docs(stark): fix commitment.rs leaf-layout docs after trace pairing
diegokingston Jun 19, 2026
f34a6f6
refactor(prover): dedup commit pipeline (commit_plain + spill_tree) (B)
diegokingston Jun 19, 2026
b4784e6
fix(prover): row-pair the preprocessed-table commitments (CI fix)
diegokingston Jun 22, 2026
c6b9832
test(prover): regenerate SUB_DECODE_COMMITMENT_BLOWUP_2 for row-pair …
diegokingston Jun 22, 2026
acd06e6
test(prover): TEMP print actual decode commitment to regenerate const…
diegokingston Jun 22, 2026
c1ba23f
test(prover): set SUB_DECODE_COMMITMENT_BLOWUP_2 to CI-computed row-p…
diegokingston Jun 22, 2026
a1afb67
mplement changes in GPU
ColoCarletti Jun 26, 2026
7446a9b
refactor
ColoCarletti Jun 26, 2026
fb46406
Merge origin/main into refactor/commitment-module
MauroToscano Jun 26, 2026
ade38c1
Fix CUDA LDE clippy lint
MauroToscano Jun 26, 2026
316dda9
fix(cuda): align review cleanup with row-pair commits (#723)
MauroToscano Jun 26, 2026
38b5858
Merge remote-tracking branch 'origin/refactor/commitment-module' into…
diegokingston Jun 29, 2026
8ddadf4
fix(stark): silence dead_code on par_for_each_mut (debug-checks-only …
diegokingston Jun 29, 2026
9655852
style(stark): cargo fmt after merge resolution
diegokingston Jun 29, 2026
04e1fae
test(cuda): focused GPU row-pair commitment prove+verify test
diegokingston Jun 29, 2026
05bb11a
fix(cuda): drop stale R2 parts-LDE asserts (#700 fused path), silence…
diegokingston Jun 29, 2026
1d8fa2e
Merge branch 'main' into perf/commitment-row-pair
MauroToscano Jun 29, 2026
0847a19
Merge branch 'main' into perf/commitment-row-pair
diegokingston Jun 29, 2026
275eadd
review(stark): address PR #735 review — coverage, dead code, cleanups…
MauroToscano Jun 29, 2026
bd1ebbb
review(stark): tidy test layout + remove AGENTS.md (follow-up to #740…
MauroToscano Jun 29, 2026
1a166e0
Merge origin/main into perf/commitment-row-pair
MauroToscano Jun 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 74 additions & 15 deletions crypto/math-cuda/kernels/keccak.cu
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ extern "C" __global__ void keccak256_leaves_base_batched(
uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_rows) return;

// Bit-reverse the row index so we read columns at `br` but write the
// hashed leaf at `tid` — matching the CPU `commit_columns_bit_reversed`.
// Bit-reverse the row index so we read columns at `br` but write the hashed
// leaf at `tid` — matching the CPU per-row `commit_bit_reversed(.., 1)`.
uint64_t br = __brevll(tid) >> (64 - log_num_rows);

uint64_t st[25];
Expand All @@ -181,6 +181,51 @@ extern "C" __global__ void keccak256_leaves_base_batched(
finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
}

// ---------------------------------------------------------------------------
// Goldilocks BASE-FIELD row-pair leaf hashing.
//
// Leaf `leaf_idx` hashes TWO consecutive bit-reversed rows
// br_0 = reverse_index(2*leaf_idx), br_1 = reverse_index(2*leaf_idx + 1)
// each written column-by-column in canonical BE (same per-row byte layout as
// `keccak256_leaves_base_batched`), in (br_0 row: col 0..K-1) then (br_1 row:
// col 0..K-1) order. `num_leaves = num_rows / 2`; writes 32 bytes to
// `hashed_leaves_out[leaf_idx * 32 ..]`. Matches the CPU
// `keccak_leaves_row_pair_bit_reversed` (rows_per_leaf = 2) — the base-field
// analog of `keccak_comp_poly_leaves_ext3`.
// ---------------------------------------------------------------------------
extern "C" __global__ void keccak256_leaves_base_row_pair_batched(
const uint64_t *columns_base_ptr,
uint64_t col_stride,
uint64_t num_cols,
uint64_t num_rows,
uint64_t log_num_rows,
uint8_t *hashed_leaves_out) {
uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
uint64_t num_leaves = num_rows >> 1;
if (tid >= num_leaves) return;

uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows);
uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows);

uint64_t st[25];
#pragma unroll
for (int i = 0; i < 25; ++i) st[i] = 0;

uint32_t rate_pos = 0;
// First row (br_0): col 0..K-1.
for (uint64_t c = 0; c < num_cols; ++c) {
uint64_t v = columns_base_ptr[c * col_stride + br_0];
absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v)));
}
// Second row (br_1): col 0..K-1.
for (uint64_t c = 0; c < num_cols; ++c) {
uint64_t v = columns_base_ptr[c * col_stride + br_1];
absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v)));
}

finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
}

// ---------------------------------------------------------------------------
// Goldilocks EXT3 leaf hashing (3 base-field components per ext3 element).
//
Expand Down Expand Up @@ -349,35 +394,49 @@ extern "C" __global__ void keccak_merkle_level(
}

// ---------------------------------------------------------------------------
// Row-major base leaf hashing.
// Row-major ROW-PAIR leaf hashing.
//
// Row-major analog of `keccak256_leaves_base_row_pair_batched` (which reads a
// column-major slab): each leaf hashes TWO consecutive bit-reversed rows.
// Leaf `tid` hashes row `reverse_index(2*tid)` followed by row
// `reverse_index(2*tid + 1)`, each as `m` canonical big-endian lanes read from
// the contiguous row-major buffer (`data + br * m`). `num_leaves = num_rows/2`;
// writes 32 bytes to `hashed_leaves_out[tid*32 ..]`.
//
// Input layout: data[row * m + col] for `num_rows` rows and `m` columns.
// For leaf `tid`, reads the bit-reversed row `br(tid)` — a contiguous slice
// of `m` elements starting at data[br * m]. Coalesced when multiple threads
// in the same warp process consecutive `tid` values (they read non-overlapping
// rows, each a contiguous block of m u64s in order).
// `m` is the row stride in u64s: base trace = num columns; ext3 trace = 3 *
// num columns (an ext3 element's components c0,c1,c2 are consecutive, matching
// the CPU `write_bytes_be`). Byte layout therefore equals the CPU
// `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and the verifier's
// `verify_opening_pair` (queried row ‖ its symmetric counterpart, one leaf).
// ---------------------------------------------------------------------------
extern "C" __global__ void keccak256_leaves_base_row_major(
extern "C" __global__ void keccak256_leaves_base_row_major_row_pair(
const uint64_t *data,
uint64_t m,
uint64_t num_rows,
uint64_t log_num_rows,
uint8_t *hashed_leaves_out)
{
uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_rows) return;
uint64_t br = __brevll(tid) >> (64 - log_num_rows);
const uint64_t *row = data + br * m;
uint64_t num_leaves = num_rows >> 1;
if (tid >= num_leaves) return;

uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows);
uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows);
const uint64_t *row_0 = data + br_0 * m;
const uint64_t *row_1 = data + br_1 * m;

uint64_t st[25];
#pragma unroll
for (int i = 0; i < 25; ++i) st[i] = 0;

uint32_t rate_pos = 0;
// First row (br_0): cols 0..m-1.
for (uint64_t c = 0; c < m; ++c) {
uint64_t canon = goldilocks::canonical(row[c]);
uint64_t lane = bswap64(canon);
absorb_lane(st, rate_pos, lane);
absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(row_0[c])));
}
// Second row (br_1): cols 0..m-1.
for (uint64_t c = 0; c < m; ++c) {
absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(row_1[c])));
}
finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
}
9 changes: 6 additions & 3 deletions crypto/math-cuda/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,9 @@ pub struct Backend {
pub matrix_transpose_strided: CudaFunction,

// keccak.ptx
pub keccak256_leaves_base_row_major: CudaFunction,
pub keccak256_leaves_base_row_major_row_pair: CudaFunction,
pub keccak256_leaves_base_batched: CudaFunction,
pub keccak256_leaves_base_row_pair_batched: CudaFunction,
pub keccak256_leaves_ext3_batched: CudaFunction,
pub keccak_comp_poly_leaves_ext3: CudaFunction,
pub keccak_fri_leaves_ext3: CudaFunction,
Expand Down Expand Up @@ -247,9 +248,11 @@ impl Backend {
ntt_dit_level_row_major: ntt.load_function("ntt_dit_level_row_major")?,
pointwise_mul_row_major: ntt.load_function("pointwise_mul_row_major")?,
matrix_transpose_strided: ntt.load_function("matrix_transpose_strided")?,
keccak256_leaves_base_row_major: keccak
.load_function("keccak256_leaves_base_row_major")?,
keccak256_leaves_base_row_major_row_pair: keccak
.load_function("keccak256_leaves_base_row_major_row_pair")?,
keccak256_leaves_base_batched: keccak.load_function("keccak256_leaves_base_batched")?,
keccak256_leaves_base_row_pair_batched: keccak
.load_function("keccak256_leaves_base_row_pair_batched")?,
keccak256_leaves_ext3_batched: keccak.load_function("keccak256_leaves_ext3_batched")?,
keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?,
keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?,
Expand Down
Loading
Loading