diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu index 9937d7c6e..557b8dd43 100644 --- a/crypto/math-cuda/kernels/keccak.cu +++ b/crypto/math-cuda/kernels/keccak.cu @@ -159,8 +159,8 @@ extern "C" __global__ void keccak256_leaves_base_batched( uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; if (tid >= num_rows) return; - // Bit-reverse the row index so we read columns at `br` but write the - // hashed leaf at `tid` — matching the CPU `commit_columns_bit_reversed`. + // Bit-reverse the row index so we read columns at `br` but write the hashed + // leaf at `tid` — matching the CPU per-row `commit_bit_reversed(.., 1)`. uint64_t br = __brevll(tid) >> (64 - log_num_rows); uint64_t st[25]; @@ -181,6 +181,51 @@ extern "C" __global__ void keccak256_leaves_base_batched( finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); } +// --------------------------------------------------------------------------- +// Goldilocks BASE-FIELD row-pair leaf hashing. +// +// Leaf `leaf_idx` hashes TWO consecutive bit-reversed rows +// br_0 = reverse_index(2*leaf_idx), br_1 = reverse_index(2*leaf_idx + 1) +// each written column-by-column in canonical BE (same per-row byte layout as +// `keccak256_leaves_base_batched`), in (br_0 row: col 0..K-1) then (br_1 row: +// col 0..K-1) order. `num_leaves = num_rows / 2`; writes 32 bytes to +// `hashed_leaves_out[leaf_idx * 32 ..]`. Matches the CPU +// `keccak_leaves_row_pair_bit_reversed` (rows_per_leaf = 2) — the base-field +// analog of `keccak_comp_poly_leaves_ext3`. +// --------------------------------------------------------------------------- +extern "C" __global__ void keccak256_leaves_base_row_pair_batched( + const uint64_t *columns_base_ptr, + uint64_t col_stride, + uint64_t num_cols, + uint64_t num_rows, + uint64_t log_num_rows, + uint8_t *hashed_leaves_out) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint64_t num_leaves = num_rows >> 1; + if (tid >= num_leaves) return; + + uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows); + uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows); + + uint64_t st[25]; + #pragma unroll + for (int i = 0; i < 25; ++i) st[i] = 0; + + uint32_t rate_pos = 0; + // First row (br_0): col 0..K-1. + for (uint64_t c = 0; c < num_cols; ++c) { + uint64_t v = columns_base_ptr[c * col_stride + br_0]; + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v))); + } + // Second row (br_1): col 0..K-1. + for (uint64_t c = 0; c < num_cols; ++c) { + uint64_t v = columns_base_ptr[c * col_stride + br_1]; + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v))); + } + + finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); +} + // --------------------------------------------------------------------------- // Goldilocks EXT3 leaf hashing (3 base-field components per ext3 element). // @@ -349,15 +394,22 @@ extern "C" __global__ void keccak_merkle_level( } // --------------------------------------------------------------------------- -// Row-major base leaf hashing. +// Row-major ROW-PAIR leaf hashing. +// +// Row-major analog of `keccak256_leaves_base_row_pair_batched` (which reads a +// column-major slab): each leaf hashes TWO consecutive bit-reversed rows. +// Leaf `tid` hashes row `reverse_index(2*tid)` followed by row +// `reverse_index(2*tid + 1)`, each as `m` canonical big-endian lanes read from +// the contiguous row-major buffer (`data + br * m`). `num_leaves = num_rows/2`; +// writes 32 bytes to `hashed_leaves_out[tid*32 ..]`. // -// Input layout: data[row * m + col] for `num_rows` rows and `m` columns. -// For leaf `tid`, reads the bit-reversed row `br(tid)` — a contiguous slice -// of `m` elements starting at data[br * m]. Coalesced when multiple threads -// in the same warp process consecutive `tid` values (they read non-overlapping -// rows, each a contiguous block of m u64s in order). +// `m` is the row stride in u64s: base trace = num columns; ext3 trace = 3 * +// num columns (an ext3 element's components c0,c1,c2 are consecutive, matching +// the CPU `write_bytes_be`). Byte layout therefore equals the CPU +// `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and the verifier's +// `verify_opening_pair` (queried row ‖ its symmetric counterpart, one leaf). // --------------------------------------------------------------------------- -extern "C" __global__ void keccak256_leaves_base_row_major( +extern "C" __global__ void keccak256_leaves_base_row_major_row_pair( const uint64_t *data, uint64_t m, uint64_t num_rows, @@ -365,19 +417,26 @@ extern "C" __global__ void keccak256_leaves_base_row_major( uint8_t *hashed_leaves_out) { uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= num_rows) return; - uint64_t br = __brevll(tid) >> (64 - log_num_rows); - const uint64_t *row = data + br * m; + uint64_t num_leaves = num_rows >> 1; + if (tid >= num_leaves) return; + + uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows); + uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows); + const uint64_t *row_0 = data + br_0 * m; + const uint64_t *row_1 = data + br_1 * m; uint64_t st[25]; #pragma unroll for (int i = 0; i < 25; ++i) st[i] = 0; uint32_t rate_pos = 0; + // First row (br_0): cols 0..m-1. for (uint64_t c = 0; c < m; ++c) { - uint64_t canon = goldilocks::canonical(row[c]); - uint64_t lane = bswap64(canon); - absorb_lane(st, rate_pos, lane); + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(row_0[c]))); + } + // Second row (br_1): cols 0..m-1. + for (uint64_t c = 0; c < m; ++c) { + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(row_1[c]))); } finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); } diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index e9db7657e..4270e5da8 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -147,8 +147,9 @@ pub struct Backend { pub matrix_transpose_strided: CudaFunction, // keccak.ptx - pub keccak256_leaves_base_row_major: CudaFunction, + pub keccak256_leaves_base_row_major_row_pair: CudaFunction, pub keccak256_leaves_base_batched: CudaFunction, + pub keccak256_leaves_base_row_pair_batched: CudaFunction, pub keccak256_leaves_ext3_batched: CudaFunction, pub keccak_comp_poly_leaves_ext3: CudaFunction, pub keccak_fri_leaves_ext3: CudaFunction, @@ -247,9 +248,11 @@ impl Backend { ntt_dit_level_row_major: ntt.load_function("ntt_dit_level_row_major")?, pointwise_mul_row_major: ntt.load_function("pointwise_mul_row_major")?, matrix_transpose_strided: ntt.load_function("matrix_transpose_strided")?, - keccak256_leaves_base_row_major: keccak - .load_function("keccak256_leaves_base_row_major")?, + keccak256_leaves_base_row_major_row_pair: keccak + .load_function("keccak256_leaves_base_row_major_row_pair")?, keccak256_leaves_base_batched: keccak.load_function("keccak256_leaves_base_batched")?, + keccak256_leaves_base_row_pair_batched: keccak + .load_function("keccak256_leaves_base_row_pair_batched")?, keccak256_leaves_ext3_batched: keccak.load_function("keccak256_leaves_ext3_batched")?, keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?, keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?, diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 164267684..b08a9394a 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -16,7 +16,10 @@ use cudarc::driver::{CudaSlice, CudaStream, LaunchConfig, PushKernelArg}; use crate::Result; use crate::device::{Backend, backend}; -use crate::merkle::{keccak_launch_cfg, launch_keccak_base, launch_keccak_ext3}; +use crate::merkle::{ + keccak_launch_cfg, launch_keccak_base, launch_keccak_base_row_pair, launch_keccak_ext3, + launch_keccak_ext3_row_pair, +}; use crate::ntt::run_ntt_body; /// Goldilocks `TWO_ADICITY = 32` puts the theoretical domain ceiling at @@ -34,26 +37,26 @@ fn assert_u32_domain(n: usize, what: &str) { /// Output shape requested from the fused LDE + Keccak entry points. #[derive(Copy, Clone, PartialEq, Eq)] enum KeccakCommit { - /// Only the `lde_size` keccak-256 leaves; no inner-tree build. Caller - /// receives `lde_size * 32` bytes. + /// Only the keccak-256 leaves; no inner-tree build. Caller receives + /// `num_leaves * 32` bytes. LeavesOnly, /// Full Merkle tree: leaves at the tail + inner nodes built on-device. - /// Caller receives `(2*lde_size - 1) * 32` bytes. + /// Caller receives `(2*num_leaves - 1) * 32` bytes. FullTree, } impl KeccakCommit { - fn total_nodes_bytes(self, lde_size: usize) -> usize { + fn total_nodes_bytes(self, num_leaves: usize) -> usize { match self { - KeccakCommit::LeavesOnly => lde_size * 32, - KeccakCommit::FullTree => (2 * lde_size - 1) * 32, + KeccakCommit::LeavesOnly => num_leaves * 32, + KeccakCommit::FullTree => (2 * num_leaves - 1) * 32, } } - fn leaves_offset_bytes(self, lde_size: usize) -> usize { + fn leaves_offset_bytes(self, num_leaves: usize) -> usize { match self { KeccakCommit::LeavesOnly => 0, - KeccakCommit::FullTree => (lde_size - 1) * 32, + KeccakCommit::FullTree => (num_leaves - 1) * 32, } } } @@ -304,7 +307,12 @@ fn run_row_major_ntt_body( Ok(()) } -fn launch_keccak_base_row_major( +/// Row-major ROW-PAIR leaf hashing: leaf `i` hashes the two consecutive +/// bit-reversed rows `reverse_index(2i)`, `reverse_index(2i+1)` (each `m` lanes, +/// read contiguously from the row-major `buf`), producing `num_rows / 2` leaves. +/// Row-major analog of [`launch_keccak_base_row_pair`]; matches the CPU +/// `commit_bit_reversed(.., 2)` and the verifier's `verify_opening_pair`. +fn launch_keccak_base_row_major_row_pair( stream: &CudaStream, be: &Backend, buf: &CudaSlice, @@ -313,20 +321,21 @@ fn launch_keccak_base_row_major( log_num_rows: u64, leaves_out: &mut cudarc::driver::CudaViewMut<'_, u8>, ) -> Result<()> { - // The keccak kernel is register-heavy (Keccak state `uint64_t st[25]`), so it - // must launch with the keccak-tuned block dim (128). `for_num_elems` uses 1024 - // threads/block, which exceeds the per-block register budget and fails the - // launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES — silently dropping the whole - // R1 GPU path to the CPU fallback (no device handle for rounds 2-4). - // - // The kernel derives the bit-reversed row as `__brevll(tid) >> (64 - log_num_rows)`; - // a 64-bit shift is UB, so reject `num_rows < 2` (`log_num_rows == 0`), matching - // the `debug_assert!` guard in `launch_keccak_base`. - debug_assert!(num_rows >= 2, "row-major keccak requires num_rows >= 2"); - let cfg = keccak_launch_cfg(num_rows); + // Register-heavy Keccak kernel: launch with the keccak-tuned block dim (128, + // via `keccak_launch_cfg`); a larger block exceeds the per-block register + // budget and fails the launch (CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES). The kernel + // derives rows as `__brevll(2*tid + k) >> (64 - log_num_rows)`; a 64-bit shift + // is UB at `log_num_rows == 0`, so require `num_rows >= 2` (also the minimum + // for a single row pair). + debug_assert!( + num_rows >= 2, + "row-major row-pair keccak requires num_rows >= 2" + ); + // One thread per leaf (= one bit-reversed row pair). + let cfg = keccak_launch_cfg(num_rows >> 1); unsafe { stream - .launch_builder(&be.keccak256_leaves_base_row_major) + .launch_builder(&be.keccak256_leaves_base_row_major_row_pair) .arg(buf) .arg(&m) .arg(&num_rows) @@ -399,7 +408,12 @@ fn coset_lde_row_major_inner( let lde_size = n * blowup_factor; assert_u32_domain(lde_size, what); - let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(lde_size); + // Row-pair trace commit: one Merkle leaf per bit-reversed row pair (rows 2i, + // 2i+1), matching the CPU `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and the + // verifier's `verify_opening_pair`. `lde_size` is a power of two >= 2, so it + // is always even. + let num_leaves = lde_size / 2; + let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(num_leaves); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; let n_u64 = n as u64; @@ -445,12 +459,14 @@ fn coset_lde_row_major_inner( cols_u64, )?; - // Keccak + Merkle on-device. Each leaf reads `total_cols` consecutive u64s. + // Keccak + Merkle on-device. Each row-pair leaf reads two bit-reversed rows + // of `total_cols` consecutive u64s (`lde_u64` is the bit-reverse modulus; the + // kernel emits `lde_size / 2` leaves). let mut nodes_dev = unsafe { stream.alloc::(nodes_bytes) }?; - let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(lde_size); + let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(num_leaves); { - let mut leaves_view = nodes_dev.slice_mut(leaves_offset..leaves_offset + lde_size * 32); - launch_keccak_base_row_major( + let mut leaves_view = nodes_dev.slice_mut(leaves_offset..leaves_offset + num_leaves * 32); + launch_keccak_base_row_major_row_pair( stream.as_ref(), be, &buf, @@ -460,7 +476,7 @@ fn coset_lde_row_major_inner( &mut leaves_view, )?; } - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; // D2H the row-major LDE first (before the handle transpose). Release the // staging lock before the Merkle nodes transfer to minimise lock contention. @@ -927,12 +943,12 @@ pub fn coset_lde_batch_base_into( Ok(()) } -/// Fused LDE + Keccak-256 leaf hashing. Caller receives the `lde_size * 32` -/// bytes of leaf hashes in `hashed_leaves_out` (one 32-byte digest per output -/// row, in natural row order; leaves are computed reading columns at -/// bit-reversed rows, matching `commit_columns_bit_reversed` on the CPU -/// side). Thin wrapper over `coset_lde_batch_base_into_with_merkle_tree_inner` -/// with `LeavesOnly` — no inner-tree build, no device handle. +/// Fused LDE + row-pair Keccak-256 leaf hashing. Caller receives +/// `(lde_size / 2) * 32` bytes of leaf hashes in `hashed_leaves_out` (one +/// 32-byte digest per bit-reversed row pair, in natural leaf order, matching +/// `commit_bit_reversed(.., 2)` on the CPU side). Thin wrapper over +/// `coset_lde_batch_base_into_with_merkle_tree_inner` with `LeavesOnly` — no +/// inner-tree build, no device handle. pub fn coset_lde_batch_base_into_with_leaf_hash( columns: &[&[u64]], blowup_factor: usize, @@ -948,13 +964,15 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + 2, ) .map(|_| ()) } /// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full -/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back -/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`). +/// row-pair Merkle tree on device and returns the `2*(lde_size/2) - 1` node +/// buffer back to the caller in `merkle_nodes_out` (byte length +/// `(2*(lde_size/2) - 1) * 32`). /// /// The leaf hashes are never exposed to the caller — they stay on device and /// feed straight into the pair-hash tree kernel, avoiding the @@ -975,10 +993,12 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + 2, ) .map(|_| ()) } +#[allow(clippy::too_many_arguments)] fn coset_lde_batch_base_into_with_merkle_tree_inner( columns: &[&[u64]], blowup_factor: usize, @@ -987,6 +1007,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1), + // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit. + rows_per_leaf: usize, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -1010,7 +1033,13 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( for o in outputs.iter() { assert_eq!(o.len(), lde_size); } - let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); + assert!( + rows_per_leaf == 1 || rows_per_leaf == 2, + "rows_per_leaf must be 1 or 2" + ); + assert_eq!(lde_size % rows_per_leaf, 0); + let num_leaves = lde_size / rows_per_leaf; + let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves); assert_eq!(nodes_out.len(), nodes_dev_bytes); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -1093,28 +1122,39 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( )?; // Allocate the device output buffer. In `LeavesOnly` mode this is just - // `lde_size * 32` bytes (the leaves themselves); in `FullTree` mode it's - // `(2*lde_size - 1) * 32` bytes (leaves in the tail + inner nodes filled + // `num_leaves * 32` bytes (the leaves themselves); in `FullTree` mode it's + // `(2*num_leaves - 1) * 32` bytes (leaves in the tail + inner nodes filled // below). `alloc` (not `alloc_zeros`) is safe because every byte is // written before any reader sees it: the keccak kernel fills the // leaves slab, the inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; - let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size); + let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); { let mut leaves_view = - nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32); - launch_keccak_base( - stream.as_ref(), - &buf, - col_stride_u64, - m as u64, - lde_u64, - &mut leaves_view, - )?; + nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32); + if rows_per_leaf == 2 { + launch_keccak_base_row_pair( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } else { + launch_keccak_base( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } } if commit == KeccakCommit::FullTree { - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; } // D2H the LDE and the tree/leaves nodes via pinned staging. @@ -1140,8 +1180,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( } } -/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused -/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over +/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused LDE + +/// row-pair Keccak-256 leaf hashing over ext3 columns. Thin wrapper over /// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`. pub fn coset_lde_batch_ext3_into_with_leaf_hash( columns: &[&[u64]], @@ -1160,13 +1200,14 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + 2, ) .map(|_| ()) } /// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`. /// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE -/// evaluations and the full `2*lde_size - 1` node buffer. +/// evaluations and the full `2*(lde_size/2) - 1` row-pair node buffer. pub fn coset_lde_batch_ext3_into_with_merkle_tree( columns: &[&[u64]], n: usize, @@ -1184,6 +1225,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + 2, ) .map(|_| ()) } @@ -1198,6 +1240,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1), + // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit. + rows_per_leaf: usize, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -1223,7 +1268,13 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( for o in outputs.iter() { assert_eq!(o.len(), 3 * lde_size); } - let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); + assert!( + rows_per_leaf == 1 || rows_per_leaf == 2, + "rows_per_leaf must be 1 or 2" + ); + assert_eq!(lde_size % rows_per_leaf, 0); + let num_leaves = lde_size / rows_per_leaf; + let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves); assert_eq!(nodes_out.len(), nodes_dev_bytes); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -1300,26 +1351,37 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( mb_u32, )?; - // Allocate device output buffer (LeavesOnly → lde_size*32; FullTree → - // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the + // Allocate device output buffer (LeavesOnly -> num_leaves*32; FullTree -> + // (2*num_leaves - 1)*32). Leaf kernel writes to the leaves slab; the // inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; - let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size); + let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); { let mut leaves_view = - nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32); - launch_keccak_ext3( - stream.as_ref(), - &buf, - col_stride_u64, - m as u64, - lde_u64, - &mut leaves_view, - )?; + nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32); + if rows_per_leaf == 2 { + launch_keccak_ext3_row_pair( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } else { + launch_keccak_ext3( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } } if commit == KeccakCommit::FullTree { - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; } // D2H LDE (mb * lde_size u64) and tree/leaves nodes. diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 932e81325..27f38ce0a 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -3,7 +3,7 @@ //! Matches `FieldElementVectorBackend::hash_data` in //! `crypto/crypto/src/merkle_tree/backends/field_element_vector.rs`, combined //! with the `reverse_index` row read pattern used in -//! `commit_columns_bit_reversed` at `crypto/stark/src/prover.rs`. +//! `commit_bit_reversed` at `crypto/stark/src/commitment.rs`. //! //! Caller supplies base-field column slabs already laid out as //! `[col * col_stride + row]` (the same layout `coset_lde_batch_base_into` @@ -25,15 +25,27 @@ use crate::lde::pack_ext3_to_pinned_slabs; /// Run GPU Keccak-256 leaf hashing on a base-field column buffer. /// /// `columns` must hold `num_cols * col_stride` u64s with column `c`'s data -/// at `[c*col_stride .. c*col_stride + num_rows]`. Returns `num_rows * 32` -/// hash bytes in natural (non-bit-reversed) row order. +/// at `[c*col_stride .. c*col_stride + num_rows]`. `rows_per_leaf` selects the +/// leaf layout: `1` = one leaf per bit-reversed row (`num_rows` leaves), `2` = +/// one leaf per bit-reversed row pair `2i`,`2i+1` (`num_rows/2` leaves, the +/// trace-commit layout). Returns `(num_rows / rows_per_leaf) * 32` hash bytes. pub fn keccak_leaves_base( columns: &[u64], col_stride: usize, num_cols: usize, num_rows: usize, + rows_per_leaf: usize, ) -> Result> { assert!(num_rows.is_power_of_two()); + assert!(rows_per_leaf == 1 || rows_per_leaf == 2); + assert!( + num_rows >= rows_per_leaf, + "num_rows must be at least rows_per_leaf" + ); + assert!( + num_rows >= 2, + "num_rows must be at least 2 for bit-reversed GPU leaf hashing" + ); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -45,8 +57,13 @@ pub fn keccak_leaves_base( let be = backend()?; let stream = be.next_stream(); let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::(num_rows * 32)?; - launch_keccak_base( + let mut out_dev = stream.alloc_zeros::((num_rows / rows_per_leaf) * 32)?; + let launch = if rows_per_leaf == 2 { + launch_keccak_base_row_pair + } else { + launch_keccak_base + }; + launch( stream.as_ref(), &cols_dev, col_stride as u64, @@ -60,14 +77,25 @@ pub fn keccak_leaves_base( } /// Ext3 variant. Columns interleaved as three base slabs per ext3 column. -/// `columns.len() >= num_cols * 3 * col_stride`. +/// `columns.len() >= num_cols * 3 * col_stride`. `rows_per_leaf` as in +/// [`keccak_leaves_base`]. pub fn keccak_leaves_ext3( columns: &[u64], col_stride: usize, num_cols: usize, num_rows: usize, + rows_per_leaf: usize, ) -> Result> { assert!(num_rows.is_power_of_two()); + assert!(rows_per_leaf == 1 || rows_per_leaf == 2); + assert!( + num_rows >= rows_per_leaf, + "num_rows must be at least rows_per_leaf" + ); + assert!( + num_rows >= 2, + "num_rows must be at least 2 for bit-reversed GPU leaf hashing" + ); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -80,8 +108,13 @@ pub fn keccak_leaves_ext3( let be = backend()?; let stream = be.next_stream(); let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::(num_rows * 32)?; - launch_keccak_ext3( + let mut out_dev = stream.alloc_zeros::((num_rows / rows_per_leaf) * 32)?; + let launch = if rows_per_leaf == 2 { + launch_keccak_ext3_row_pair + } else { + launch_keccak_ext3 + }; + launch( stream.as_ref(), &cols_dev, col_stride as u64, @@ -170,6 +203,72 @@ pub(crate) fn launch_keccak_base( Ok(()) } +/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1` (one Merkle path per FRI query). Writes `num_rows/2` leaves of 32 +/// bytes into `out_dev`. Base-field analog of the comp-poly ext3 path; matches +/// the CPU `keccak_leaves_row_pair_bit_reversed`. +pub(crate) fn launch_keccak_base_row_pair( + stream: &CudaStream, + cols_dev: &CudaSlice, + col_stride: u64, + num_cols: u64, + num_rows: u64, + out_dev: &mut CudaViewMut<'_, u8>, +) -> Result<()> { + debug_assert!( + num_rows >= 2, + "keccak row-pair leaf kernel: num_rows must be >= 2" + ); + let be = backend()?; + let log_num_rows = num_rows.trailing_zeros() as u64; + // One thread per leaf (= row pair). + let cfg = keccak_launch_cfg(num_rows >> 1); + unsafe { + stream + .launch_builder(&be.keccak256_leaves_base_row_pair_batched) + .arg(cols_dev) + .arg(&col_stride) + .arg(&num_cols) + .arg(&num_rows) + .arg(&log_num_rows) + .arg(out_dev) + .launch(cfg)?; + } + Ok(()) +} + +/// Row-pair ext3 leaf hashing for the aux trace: reuses the comp-poly kernel +/// (`keccak_comp_poly_leaves_ext3`), which hashes bit-reversed rows `2i`, `2i+1` +/// across all ext3 columns. Writes `num_rows/2` leaves of 32 bytes. +pub(crate) fn launch_keccak_ext3_row_pair( + stream: &CudaStream, + cols_dev: &CudaSlice, + col_stride: u64, + num_cols: u64, + num_rows: u64, + out_dev: &mut CudaViewMut<'_, u8>, +) -> Result<()> { + debug_assert!( + num_rows >= 2, + "keccak row-pair leaf kernel: num_rows must be >= 2" + ); + let be = backend()?; + let log_num_rows = num_rows.trailing_zeros() as u64; + let cfg = keccak_launch_cfg(num_rows >> 1); + unsafe { + stream + .launch_builder(&be.keccak_comp_poly_leaves_ext3) + .arg(cols_dev) + .arg(&col_stride) + .arg(&num_cols) + .arg(&num_rows) + .arg(&log_num_rows) + .arg(out_dev) + .launch(cfg)?; + } + Ok(()) +} + /// Given `hashed_leaves` of length `leaves_len * 32`, build the full Merkle /// tree on device and return the complete node buffer `(2*leaves_len - 1) * /// 32` bytes in the standard layout: diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs index 29e33b6fe..51b826dd1 100644 --- a/crypto/math-cuda/tests/comp_poly_tree.rs +++ b/crypto/math-cuda/tests/comp_poly_tree.rs @@ -1,6 +1,6 @@ //! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree` //! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU -//! pipeline produced by `commit_composition_polynomial`. +//! row-pair commitment layout used by `commit_bit_reversed(.., 2)`. use math::field::element::FieldElement; use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; @@ -83,7 +83,7 @@ fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { out } -/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves. +/// CPU: `commit_bit_reversed(.., 2)`-style tree root over num_rows/2 leaves. fn cpu_tree_nodes(parts: &[Vec]) -> Vec<[u8; 32]> { let num_rows = parts[0].len(); let num_parts = parts.len(); diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs index d614e233d..61a861f32 100644 --- a/crypto/math-cuda/tests/keccak_leaves.rs +++ b/crypto/math-cuda/tests/keccak_leaves.rs @@ -38,7 +38,7 @@ fn keccak_leaves_base_matches_cpu() { flat[c * n + r] = *e.value(); } } - let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 1).unwrap(); assert_eq!(gpu.len(), n * 32); for i in 0..n { assert_eq!( @@ -84,7 +84,7 @@ fn keccak_leaves_ext3_matches_cpu() { flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); } } - let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 1).unwrap(); assert_eq!(gpu.len(), n * 32); for i in 0..n { assert_eq!( @@ -97,6 +97,87 @@ fn keccak_leaves_ext3_matches_cpu() { } } +#[test] +fn keccak_leaves_base_row_pair_matches_cpu() { + // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`. + // GPU `keccak_leaves_base(.., rows_per_leaf=2)` must match the CPU prover + // helper `keccak_leaves_row_pair_bit_reversed` over base columns. + for log_n in [4u32, 6, 8, 10, 12] { + for num_cols in [1usize, 5, 17, 41] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(500 + log_n as u64 + num_cols as u64); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| Fp::from_raw(rng.r#gen::())).collect()) + .collect(); + + let cpu = keccak_leaves_row_pair_bit_reversed(&columns); + assert_eq!(cpu.len(), n / 2); + + let mut flat = vec![0u64; num_cols * n]; + for (c, col) in columns.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + flat[c * n + r] = *e.value(); + } + } + let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 2).unwrap(); + assert_eq!(gpu.len(), (n / 2) * 32); + for i in 0..n / 2 { + assert_eq!( + &gpu[i * 32..(i + 1) * 32], + &cpu[i][..], + "base row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})" + ); + } + } + } +} + +#[test] +fn keccak_leaves_ext3_row_pair_matches_cpu() { + for log_n in [4u32, 6, 8, 10] { + for num_cols in [1usize, 3, 11, 20] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(600 + log_n as u64 + num_cols as u64); + let columns: Vec> = (0..num_cols) + .map(|_| { + (0..n) + .map(|_| { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) + }) + .collect() + }) + .collect(); + + let cpu = keccak_leaves_row_pair_bit_reversed(&columns); + assert_eq!(cpu.len(), n / 2); + + // De-interleaved 3-slab layout per ext3 column (same as the 1-row + // ext3 leaf path): [col*3+k] each a contiguous slab of n u64s. + let mut flat = vec![0u64; num_cols * 3 * n]; + for (c, col) in columns.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + flat[(c * 3) * n + r] = *e.value()[0].value(); + flat[(c * 3 + 1) * n + r] = *e.value()[1].value(); + flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); + } + } + let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 2).unwrap(); + assert_eq!(gpu.len(), (n / 2) * 32); + for i in 0..n / 2 { + assert_eq!( + &gpu[i * 32..(i + 1) * 32], + &cpu[i][..], + "ext3 row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})" + ); + } + } + } +} + #[test] fn keccak_comp_poly_leaves_matches_cpu() { // Built tree's leaves live at byte offset `(num_leaves - 1) * 32` and diff --git a/crypto/math-cuda/tests/merkle_root_parity.rs b/crypto/math-cuda/tests/merkle_root_parity.rs index 72e2aaea4..0cbe016b6 100644 --- a/crypto/math-cuda/tests/merkle_root_parity.rs +++ b/crypto/math-cuda/tests/merkle_root_parity.rs @@ -55,7 +55,9 @@ fn gpu_merkle_root(columns: &[Vec], blowup: usize, weights: &[u64]) -> [u8; } } - let gpu_leaves = math_cuda::merkle::keccak_leaves_base(&flat, n_lde, num_cols, n_lde) + // Per-row leaves (rows_per_leaf = 1): this parity test compares the generic + // keccak-leaves + Merkle primitives against a per-row CPU reference. + let gpu_leaves = math_cuda::merkle::keccak_leaves_base(&flat, n_lde, num_cols, n_lde, 1) .expect("GPU keccak leaves"); let nodes = math_cuda::merkle::build_merkle_tree_on_device(&gpu_leaves).expect("GPU Merkle tree"); @@ -190,7 +192,7 @@ fn gpu_ext3_merkle_root(columns: &[Vec], blowup: usize, weights: &[u64]) -> } let gpu_leaves = - math_cuda::merkle::keccak_leaves_ext3(&flat_for_keccak, lde_size, num_cols, lde_size) + math_cuda::merkle::keccak_leaves_ext3(&flat_for_keccak, lde_size, num_cols, lde_size, 1) .expect("GPU ext3 keccak leaves"); let nodes = math_cuda::merkle::build_merkle_tree_on_device(&gpu_leaves).expect("GPU Merkle tree"); diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs new file mode 100644 index 000000000..d4a6dbdbe --- /dev/null +++ b/crypto/stark/src/commitment.rs @@ -0,0 +1,155 @@ +//! Merkle-tree commitment to bit-reversed, column-major LDE evaluations. +//! +//! This is the commitment layer the prover uses for the main/aux trace LDEs and +//! the composition-polynomial parts. It is decoupled from `IsStarkProver`: the +//! prover only orchestrates *when* to commit; the *how* (leaf layout, bit-reverse +//! permutation, Keccak hashing, tree build) lives here. +//! +//! ## Leaf layout +//! +//! For each leaf `i` we hash `rows_per_leaf` consecutive (bit-reversed) rows, +//! big-endian-concatenated column-by-column: +//! +//! ```text +//! leaf(i) = keccak( col_0[br(R·i)]‖col_1[br(R·i)]‖… ‖ col_0[br(R·i+1)]‖… ‖ … ) +//! where R = rows_per_leaf and br(j) = reverse_index(j, num_rows) +//! ``` +//! +//! - `rows_per_leaf == 2` (`ROWS_PER_LEAF`): a row pair per leaf (leaf `i` hashes +//! rows `2i` and `2i+1`). Used by BOTH the main/aux trace LDE and the +//! composition-polynomial parts: a FRI query opens a value and its symmetric +//! counterpart — exactly this pair — so one Merkle path authenticates both. +//! - `rows_per_leaf == 1`: one row per leaf. No longer used by the prover; kept +//! only so the GPU parity tests can compare against the per-row code path. +//! +//! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept +//! exactly as before. + +use math::fft::bit_reversing::reverse_index; +use math::field::element::FieldElement; +use math::field::traits::IsField; +use math::traits::{AsBytes, ByteConversion}; + +#[cfg(feature = "parallel")] +use rayon::prelude::{IntoParallelIterator, ParallelIterator}; + +use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; + +/// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the +/// trace AND composition-polynomial commitments: the row-pair leaf the FRI +/// openings rely on (leaf `i` hashes rows `2i` and `2i+1`, so one Merkle path +/// authenticates both a value and its symmetric counterpart). +pub const ROWS_PER_LEAF: usize = 2; + +/// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment, +/// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf. +/// +/// Returns one `Commitment` per leaf (`columns[0].len() / rows_per_leaf` leaves), +/// or an empty `Vec` when there is nothing to hash. See the module docs for the +/// exact leaf byte layout. This is the single code path behind both the per-row +/// ([`keccak_leaves_bit_reversed`]) and per-row-pair +/// ([`keccak_leaves_row_pair_bit_reversed`]) commitments. +pub fn keccak_leaves_bit_reversed_grouped( + columns: &[Vec>], + rows_per_leaf: usize, +) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + if columns.is_empty() || columns[0].is_empty() { + return Vec::new(); + } + + let num_rows = columns[0].len(); + let byte_len = as ByteConversion>::BYTE_LEN; + + debug_assert!( + num_rows.is_power_of_two(), + "num_rows must be a power of two for reverse_index" + ); + debug_assert!( + rows_per_leaf >= 1 && num_rows.is_multiple_of(rows_per_leaf), + "num_rows must be a multiple of rows_per_leaf" + ); + + let num_leaves = num_rows / rows_per_leaf; + let total_bytes = rows_per_leaf * columns.len() * byte_len; + + // Leaf `i`: the `rows_per_leaf` bit-reversed rows starting at `R·i`, each row + // written column-by-column in big-endian, then hashed once. + let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment { + let mut offset = 0; + for k in 0..rows_per_leaf { + let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64); + for col in columns { + col[br].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + } + BatchedMerkleTreeBackend::::hash_bytes(buf) + }; + + // Per-thread buffer reuse (map_init) avoids millions of small allocations. + #[cfg(feature = "parallel")] + let result: Vec = (0..num_leaves) + .into_par_iter() + .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i)) + .collect(); + + #[cfg(not(feature = "parallel"))] + let result: Vec = { + let mut buf = vec![0u8; total_bytes]; + (0..num_leaves).map(|i| hash_leaf(&mut buf, i)).collect() + }; + + result +} + +/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Thin wrapper +/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`. +/// +/// The prover no longer commits per-row (trace and composition both use the +/// row-pair layout, `ROWS_PER_LEAF`); this stays a named public function only so +/// the GPU parity tests in dependent crates can compare the per-row code path. +pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + keccak_leaves_bit_reversed_grouped(columns, 1) +} + +/// Per-row-pair Keccak-256 leaf hashes (leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1`). Used for the composition-polynomial-parts commitment. Thin wrapper +/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 2`. +pub fn keccak_leaves_row_pair_bit_reversed(parts: &[Vec>]) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + keccak_leaves_bit_reversed_grouped(parts, 2) +} + +/// Builds the Merkle tree committing to `columns`' bit-reversed, column-major LDE +/// evaluations, grouping `rows_per_leaf` rows per leaf, and returns the tree and +/// its root. `None` when there is nothing to commit. +/// +/// Replaces the prover's former `commit_columns_bit_reversed` (`rows_per_leaf = 1`) +/// and `commit_composition_polynomial` (`rows_per_leaf = 2`). +pub fn commit_bit_reversed( + columns: &[Vec>], + rows_per_leaf: usize, +) -> Option<(BatchedMerkleTree, Commitment)> +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + if columns.is_empty() || columns[0].is_empty() { + return None; + } + let hashed_leaves = keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; + let root = tree.root; + Some((tree, root)) +} diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 29e9b94e6..920bf937e 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -269,29 +269,6 @@ fn restore_columns_on_err(columns: &mut [Vec>], n: u } } -/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves -/// and return the node `Vec` (length-initialised, contents undefined) together -/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if -/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows -/// `usize`). The caller builds the `&mut [u8]` byte view of length -/// `total_nodes * 32` and must overwrite every byte via the GPU D2H. -fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> { - if lde_size < 2 { - return None; - } - let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?; - let _byte_len = total_nodes.checked_mul(32)?; - let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes); - // SAFETY: every byte will be overwritten via the GPU D2H before the - // contents are read. The caller computes the byte-length view from the - // returned `nodes` Vec using `total_nodes.checked_mul(32)`. - #[allow(clippy::uninit_vec)] - unsafe { - nodes.set_len(total_nodes) - }; - Some((nodes, total_nodes)) -} - /// Try to GPU-batch all columns in one pass. /// /// Engaged for Goldilocks-base and ext3 tables whose LDE size is above the @@ -303,6 +280,7 @@ fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> { /// Returns `Some(())` if the batch was handled on GPU and `columns` now holds /// the LDE evaluations, or if there were no columns to expand. Returns `None` /// to let the caller run the per-column CPU fallback. +#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))] pub(crate) fn try_expand_columns_batched( columns: &mut [Vec>], blowup_factor: usize, @@ -598,6 +576,7 @@ where /// transform uses only base-field twiddles and coset weights, which act /// componentwise on ext3, so the per-component result equals the ext3 LDE the /// CPU path computes. +#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))] fn try_expand_columns_batched_ext3( columns: &mut [Vec>], blowup_factor: usize, @@ -757,8 +736,8 @@ where /// host-side ext3 LDE eval Vecs produced by /// [`try_evaluate_parts_on_lde_gpu_keep`] (or the CPU path). Uses the same /// row-pair leaf pattern as the CPU -/// `commit_composition_polynomial`: each leaf hashes 2 consecutive -/// bit-reversed rows. +/// `commit_bit_reversed` (composition-polynomial commit path): each leaf hashes +/// 2 consecutive bit-reversed rows. /// /// Returns `None` to fall through to the CPU path when the type or size /// conditions don't hold; returns `None` on a math-cuda `Err` so the caller diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs index 16ff95082..aa5cc5436 100644 --- a/crypto/stark/src/instruments.rs +++ b/crypto/stark/src/instruments.rs @@ -33,7 +33,7 @@ pub struct TableSubOps { pub constraints: Duration, /// decompose_and_extend_d2 pub comp_decompose: Duration, - /// commit_composition_polynomial + /// commit_bit_reversed (composition-polynomial commit step) pub comp_commit: Duration, /// Round 3: barycentric OOD evaluation pub ood: Duration, @@ -52,11 +52,11 @@ pub struct TableSubOps { pub struct Round1SubOps { /// Main trace: expand_columns_to_lde (LDE/FFT) pub main_lde: Duration, - /// Main trace: commit_columns_bit_reversed (Merkle) + /// Main trace: commit_bit_reversed (Merkle) pub main_merkle: Duration, /// Aux trace: expand_columns_to_lde (LDE/FFT) pub aux_lde: Duration, - /// Aux trace: commit_columns_bit_reversed (Merkle) + /// Aux trace: commit_bit_reversed (Merkle) pub aux_merkle: Duration, } diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index e9f6a1cda..87236c5f9 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -5,6 +5,7 @@ compile_error!("the `disk-spill` feature requires memmap2, which does not compil #[cfg(feature = "debug-checks")] pub mod bus_debug; +pub mod commitment; pub mod constraints; pub mod context; pub mod debug; diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs index a20a452b6..cee693e3f 100644 --- a/crypto/stark/src/par.rs +++ b/crypto/stark/src/par.rs @@ -37,3 +37,58 @@ where (a(), b()) } } + +/// Map `f(i)` over `range` and collect into a `Vec`, preserving index order. +/// Parallel when `feature = "parallel"`, sequential otherwise. Rayon's +/// `collect()` is index-ordered, so the result is identical either way. +pub(crate) fn par_map_collect( + range: std::ops::Range, + f: impl Fn(usize) -> R + Sync + Send, +) -> Vec { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + range.into_par_iter().map(f).collect() + } + #[cfg(not(feature = "parallel"))] + { + range.map(f).collect() + } +} + +/// Run `f(&mut item)` for each element of `slice`. Parallel when +/// `feature = "parallel"`, sequential otherwise (ordering is irrelevant). +// Only called from the `debug-checks`-gated column-LDE reconstruct path +// (production LDE is row-major); keep it available without warning otherwise. +#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))] +pub(crate) fn par_for_each_mut(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + slice.par_iter_mut().for_each(f); + } + #[cfg(not(feature = "parallel"))] + { + slice.iter_mut().for_each(f); + } +} + +/// Run `f(&mut item)` for each element of `slice`, short-circuiting on the +/// first `Err`. Parallel when `feature = "parallel"`, sequential otherwise. +// Only called from `disk-spill`-gated paths; keep it available without warning +// when that feature is off. +#[cfg_attr(not(feature = "disk-spill"), allow(dead_code))] +pub(crate) fn par_try_for_each_mut( + slice: &mut [T], + f: impl Fn(&mut T) -> Result<(), E> + Sync + Send, +) -> Result<(), E> { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + slice.par_iter_mut().try_for_each(f) + } + #[cfg(not(feature = "parallel"))] + { + slice.iter_mut().try_for_each(f) + } +} diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs index 1751d60fe..851c0b37a 100644 --- a/crypto/stark/src/proof/stark.rs +++ b/crypto/stark/src/proof/stark.rs @@ -10,9 +10,14 @@ use crate::{ #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[serde(bound = "")] +/// Opening of a bit-reversed, row-paired commitment at one FRI query. +/// +/// The queried row and its symmetric counterpart (LDE positions `2·iota`, +/// `2·iota+1`) are committed together as a single leaf at position `iota`, so one +/// Merkle `proof` authenticates both `evaluations` (the row) and +/// `evaluations_sym` (its symmetric). Same layout used for trace and composition. pub struct PolynomialOpenings { pub proof: Proof, - pub proof_sym: Proof, pub evaluations: Vec>, pub evaluations_sym: Vec>, } diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 30554c15e..2ce1cb855 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -12,7 +12,7 @@ use math::fft::two_half_fft::TwoHalfTwiddles; use log::info; use math::field::traits::{IsField, IsSubFieldOf}; use math::spill_safe::SpillSafe; -use math::traits::{AsBytes, ByteConversion}; +use math::traits::AsBytes; use math::{ field::{element::FieldElement, traits::IsFFTField}, polynomial::Polynomial, @@ -44,6 +44,8 @@ use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof}; use super::trace::TraceTable; use super::traits::AIR; +pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed}; + /// A triple of (AIR, TraceTable, PublicInputs) for proving. type AirTracePair<'a, Field, FieldExtension, PI> = ( &'a dyn AIR, @@ -86,6 +88,17 @@ pub enum ProvingError { /// out of disk space, fd exhaustion, or mmap failure. #[cfg(feature = "disk-spill")] DiskSpill(String), + /// An internal FFT/LDE computation failed (e.g. domain size exceeds the + /// field's two-adicity, or a degenerate coset offset). Distinct from + /// `WrongParameter` because the cause is internal prover machinery, not a + /// caller-supplied parameter. Carries the underlying `FFTError`'s message. + Fft(String), +} + +impl From for ProvingError { + fn from(e: FFTError) -> Self { + ProvingError::Fft(format!("{e}")) + } } /// Commitment artifacts for one trace table (main or auxiliary). Used for both @@ -432,7 +445,7 @@ where /// A container for the results of the third round of the STARK Prove protocol. pub(crate) struct Round3 { - /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge. + /// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge. trace_ood_evaluations: Table, /// Evaluations of the composition polynomial parts at the out-of-domain challenge. composition_poly_parts_ood_evaluation: Vec>, @@ -475,128 +488,6 @@ where } } -/// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one -/// leaf per row, where each row is read at `reverse_index(row_idx)` and the -/// columns are concatenated as big-endian bytes before hashing. -/// -/// Returns `Vec` with the same length as `columns[0]`. Exposed -/// (instead of being a closure inside `commit_columns_bit_reversed`) so -/// parity tests in dependent crates can compare against the same code path -/// the prover uses. -pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec -where - E: IsField, - FieldElement: AsBytes + Sync + Send + ByteConversion, -{ - if columns.is_empty() || columns[0].is_empty() { - return Vec::new(); - } - - let num_rows = columns[0].len(); - let num_cols = columns.len(); - let byte_len = as ByteConversion>::BYTE_LEN; - - debug_assert!( - num_rows.is_power_of_two(), - "num_rows must be a power of two for reverse_index" - ); - - let total_bytes = num_cols * byte_len; - - let hash_leaf = |buf: &mut [u8], row_idx: usize| -> Commitment { - let br_idx = reverse_index(row_idx, num_rows as u64); - for col_idx in 0..num_cols { - columns[col_idx][br_idx] - .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]); - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }; - - #[cfg(feature = "parallel")] - let iter = (0..num_rows).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..num_rows; - - // Per-thread buffer reuse: map_init allocates one buffer per Rayon thread, - // eliminating millions of small heap allocations under parallel contention. - #[cfg(feature = "parallel")] - let result: Vec = iter - .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i)) - .collect(); - - #[cfg(not(feature = "parallel"))] - let result: Vec = { - let mut buf = vec![0u8; total_bytes]; - iter.map(|i| hash_leaf(&mut buf, i)).collect() - }; - - result -} - -/// Compute Keccak-256 leaf hashes for `commit_composition_polynomial`: one -/// leaf per row-pair, where leaf `i` hashes the BE concatenation of -/// `parts[..][br_0] ++ parts[..][br_1]` with -/// `br_k = reverse_index(2*i + k, num_rows)`. -/// -/// Returns `Vec` of length `parts[0].len() / 2`. -pub fn keccak_leaves_row_pair_bit_reversed(parts: &[Vec>]) -> Vec -where - E: IsField, - FieldElement: AsBytes + Sync + Send + ByteConversion, -{ - let num_parts = parts.len(); - if num_parts == 0 { - return Vec::new(); - } - let num_rows = parts[0].len(); - if num_rows == 0 { - return Vec::new(); - } - - let num_leaves = num_rows / 2; - debug_assert!( - num_rows.is_power_of_two(), - "num_rows must be a power of two for reverse_index" - ); - - let byte_len = as ByteConversion>::BYTE_LEN; - - let total_bytes = 2 * num_parts * byte_len; - - let hash_leaf_pair = |buf: &mut [u8], leaf_idx: usize| -> Commitment { - let br_0 = reverse_index(2 * leaf_idx, num_rows as u64); - let br_1 = reverse_index(2 * leaf_idx + 1, num_rows as u64); - let mut offset = 0; - for part in parts.iter() { - part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]); - offset += byte_len; - } - for part in parts.iter() { - part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]); - offset += byte_len; - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }; - - #[cfg(feature = "parallel")] - let iter = (0..num_leaves).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..num_leaves; - - #[cfg(feature = "parallel")] - let result: Vec = iter - .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf_pair(buf, i)) - .collect(); - - #[cfg(not(feature = "parallel"))] - let result: Vec = { - let mut buf = vec![0u8; total_bytes]; - iter.map(|i| hash_leaf_pair(&mut buf, i)).collect() - }; - - result -} - /// The functionality of a STARK prover providing methods to run the STARK Prove protocol /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html /// The default implementation is complete and is compatible with Stone prover @@ -615,34 +506,11 @@ pub trait IsStarkProver< FieldElement: math::traits::ByteConversion, FieldElement: math::traits::ByteConversion, { - /// Builds a Merkle tree commitment from column-major LDE evaluations with - /// bit-reverse permutation, without cloning the full evaluation matrix. - /// - /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...` - /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle - /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow, - /// but avoids allocating the cloned and transposed matrices entirely. - fn commit_columns_bit_reversed( - columns: &[Vec>], - ) -> Option<(BatchedMerkleTree, Commitment)> - where - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - E: IsField, - { - if columns.is_empty() || columns[0].is_empty() { - return None; - } - let hashed_leaves = keccak_leaves_bit_reversed(columns); - let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; - let root = tree.root; - Some((tree, root)) - } - - /// Row-major counterpart of [`commit_columns_bit_reversed`]: commit a - /// row-major flat buffer (`num_rows * num_cols`) by hashing each leaf from - /// the row at `reverse_index(row_idx)`. The leaf bytes are identical to the - /// column-major path (same row values), so the Merkle root is identical — - /// only the read pattern changes (contiguous row slice, no column gather). + /// Commit a row-major flat buffer (`num_rows * num_cols`) by hashing pairs + /// of consecutive bit-reversed rows into each Merkle leaf (`ROWS_PER_LEAF = 2`). + /// The byte layout per leaf matches `keccak_leaves_bit_reversed_grouped(columns, 2)`: + /// leaf i = hash( row[br(2i)] ++ row[br(2i+1)] ), read as contiguous slices from + /// the row-major buffer — no transpose needed. fn commit_rows_bit_reversed( data: &[FieldElement], num_cols: usize, @@ -654,10 +522,10 @@ pub trait IsStarkProver< Self::commit_rows_bit_reversed_subset(data, num_cols, 0, num_cols) } - /// Subset variant of [`commit_rows_bit_reversed`]: hash only columns in the - /// contiguous range `[col_start..col_end)` of each row. Used for - /// preprocessed traces where precomputed cols and multiplicity cols commit - /// to separate Merkle trees from the same row-major buffer. + /// Subset variant of [`commit_rows_bit_reversed`]: hash pairs of bit-reversed rows + /// from the column range `[col_start..col_end)`. Used for preprocessed traces where + /// precomputed cols and multiplicity cols commit to separate Merkle trees from the + /// same row-major buffer, both using the row-pair (`ROWS_PER_LEAF = 2`) leaf layout. fn commit_rows_bit_reversed_subset( data: &[FieldElement], num_cols: usize, @@ -679,44 +547,45 @@ pub trait IsStarkProver< if num_rows == 0 { return None; } - let subset_cols = col_end - col_start; - let byte_len = as ByteConversion>::BYTE_LEN; - let row_bytes = subset_cols * byte_len; - debug_assert!( num_rows.is_power_of_two(), "num_rows must be a power of two for reverse_index" ); + // Local alias for the canonical constant, used several times below. + const ROWS_PER_LEAF: usize = crate::commitment::ROWS_PER_LEAF; + let num_leaves = num_rows / ROWS_PER_LEAF; + let subset_cols = col_end - col_start; + let byte_len = as ByteConversion>::BYTE_LEN; + let leaf_bytes = ROWS_PER_LEAF * subset_cols * byte_len; + + let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment { + let mut offset = 0; + for k in 0..ROWS_PER_LEAF { + let br_idx = reverse_index(ROWS_PER_LEAF * leaf_idx + k, num_rows as u64); + let row_start = br_idx * num_cols; + let row = &data[row_start + col_start..row_start + col_end]; + for elem in row.iter() { + elem.write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + } + BatchedMerkleTreeBackend::::hash_bytes(buf) + }; + #[cfg(feature = "parallel")] - let hashed_leaves: Vec = (0..num_rows) + let hashed_leaves: Vec = (0..num_leaves) .into_par_iter() .map_init( - || vec![0u8; row_bytes], - |buf, row_idx| { - let br_idx = reverse_index(row_idx, num_rows as u64); - let row_start = br_idx * num_cols; - let row = &data[row_start + col_start..row_start + col_end]; - for (i, elem) in row.iter().enumerate() { - elem.write_bytes_be(&mut buf[i * byte_len..(i + 1) * byte_len]); - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }, + || vec![0u8; leaf_bytes], + |buf, leaf_idx| hash_leaf(buf, leaf_idx), ) .collect(); #[cfg(not(feature = "parallel"))] let hashed_leaves: Vec = { - let mut buf = vec![0u8; row_bytes]; - (0..num_rows) - .map(|row_idx| { - let br_idx = reverse_index(row_idx, num_rows as u64); - let row_start = br_idx * num_cols; - let row = &data[row_start + col_start..row_start + col_end]; - for (i, elem) in row.iter().enumerate() { - elem.write_bytes_be(&mut buf[i * byte_len..(i + 1) * byte_len]); - } - BatchedMerkleTreeBackend::::hash_bytes(&buf) - }) + let mut buf = vec![0u8; leaf_bytes]; + (0..num_leaves) + .map(|leaf_idx| hash_leaf(&mut buf, leaf_idx)) .collect() }; @@ -747,7 +616,8 @@ pub trait IsStarkProver< let twiddles = LdeTwiddles::new(&domain); let evals = Self::compute_lde_from_columns_cached::(&precomputed, &domain, &twiddles); - let (_, commitment) = Self::commit_columns_bit_reversed(&evals)?; + let (_, commitment) = + crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?; Some(commitment) } @@ -773,23 +643,16 @@ pub trait IsStarkProver< return Vec::new(); } - #[cfg(not(feature = "parallel"))] - let columns_iter = columns.iter(); - #[cfg(feature = "parallel")] - let columns_iter = columns.par_iter(); - - columns_iter - .map(|col| { - Polynomial::coset_lde_full::( - col, - domain.blowup_factor, - &twiddles.coset_weights, - &twiddles.inv, - &twiddles.fwd, - ) - }) - .collect::>>, _>>() + crate::par::par_map_collect(0..columns.len(), |i| { + Polynomial::coset_lde_full::( + &columns[i], + domain.blowup_factor, + &twiddles.coset_weights, + &twiddles.inv, + &twiddles.fwd, + ) .expect("coset LDE computation") + }) } /// Expand each column in-place from N evaluations to N×blowup LDE evaluations. @@ -828,11 +691,7 @@ pub trait IsStarkProver< return; } - #[cfg(feature = "parallel")] - let iter = columns.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let iter = columns.iter_mut(); - iter.for_each(|buf| { + crate::par::par_for_each_mut(columns, |buf| { Polynomial::coset_lde_full_expand::( buf, domain.blowup_factor, @@ -944,10 +803,7 @@ pub trait IsStarkProver< let (mut tree, root) = Self::commit_rows_bit_reversed(&main_data, total_cols) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - tree.spill_nodes_to_disk() - .map_err(|e| ProvingError::DiskSpill(format!("main Merkle tree: {e}")))?; - } + Self::spill_tree(&mut tree, storage_mode, "main Merkle tree")?; TableCommit::plain(tree, root) } Some((expected_precomputed_root, num_precomputed)) => { @@ -972,13 +828,13 @@ pub trait IsStarkProver< return Err(ProvingError::PrecomputedCommitmentMismatch); } #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - precomputed_tree.spill_nodes_to_disk().map_err(|e| { - ProvingError::DiskSpill(format!("precomputed Merkle tree: {e}")) - })?; - mult_tree - .spill_nodes_to_disk() - .map_err(|e| ProvingError::DiskSpill(format!("mult Merkle tree: {e}")))?; + { + Self::spill_tree( + &mut precomputed_tree, + storage_mode, + "precomputed Merkle tree", + )?; + Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?; } TableCommit::preprocessed( mult_tree, @@ -999,6 +855,26 @@ pub trait IsStarkProver< Ok((commit, (main_data, total_cols))) } + /// Spill a committed Merkle tree to disk when `storage_mode` is `Disk`, + /// tagging any I/O error with `label`. No-op otherwise. Shared by every commit + /// site (main / preprocessed split / aux). + #[cfg(feature = "disk-spill")] + fn spill_tree( + tree: &mut BatchedMerkleTree, + storage_mode: StorageMode, + label: &str, + ) -> Result<(), ProvingError> + where + C: IsField, + FieldElement: AsBytes + Sync + Send, + { + if storage_mode == StorageMode::Disk { + tree.spill_nodes_to_disk() + .map_err(|e| ProvingError::DiskSpill(format!("{label}: {e}")))?; + } + Ok(()) + } + /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments. /// /// Only used by `run_debug_checks` — Phase D consumes the cached LDE @@ -1124,30 +1000,6 @@ pub trait IsStarkProver< } } - /// Returns the Merkle tree and the commitment to the evaluations of the parts of the - /// composition polynomial. - fn commit_composition_polynomial( - lde_composition_poly_parts_evaluations: &[Vec>], - ) -> Option<(BatchedMerkleTree, Commitment)> - where - FieldElement: AsBytes + Sync + Send, - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - { - let num_parts = lde_composition_poly_parts_evaluations.len(); - if num_parts == 0 { - return None; - } - let num_rows = lde_composition_poly_parts_evaluations[0].len(); - if num_rows == 0 { - return None; - } - let hashed_leaves = - keccak_leaves_row_pair_bit_reversed(lde_composition_poly_parts_evaluations); - let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; - let root = tree.root; - Some((tree, root)) - } - /// Algebraically decompose H(x) = H₀(x²) + x·H₁(x²) on the LDE coset, then /// extend each half to the full LDE domain. This replaces the expensive /// iFFT(2N) + break_in_parts + FFT(2N)×2 pipeline with: @@ -1294,11 +1146,10 @@ pub trait IsStarkProver< } else { // Fallback for any future AIR with d > 2. let composition_poly = - Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset) - .unwrap(); + Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)?; let composition_poly_parts = composition_poly.break_in_parts(number_of_parts); - let cpu_eval = || -> Vec>> { + let cpu_eval = || -> Result>>, ProvingError> { composition_poly_parts .iter() .map(|part| { @@ -1308,7 +1159,7 @@ pub trait IsStarkProver< domain.interpolation_domain_size, &domain.coset_offset, ) - .unwrap() + .map_err(ProvingError::from) }) .collect() }; @@ -1333,11 +1184,11 @@ pub trait IsStarkProver< gpu_composition_parts = Some(handle); evals } - None => cpu_eval(), + None => cpu_eval()?, } } #[cfg(not(feature = "cuda"))] - cpu_eval() + cpu_eval()? }; #[cfg(feature = "instruments")] let fft_dur = t_sub.elapsed(); @@ -1359,8 +1210,11 @@ pub trait IsStarkProver< let root = tree.root; (tree, root) } - None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) - .ok_or(ProvingError::EmptyCommitment)?, + None => crate::commitment::commit_bit_reversed( + &lde_composition_poly_parts_evaluations, + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?, }; #[cfg(feature = "instruments")] let merkle_dur = t_sub.elapsed(); @@ -1725,12 +1579,7 @@ pub trait IsStarkProver< }) .collect(); - #[cfg(feature = "parallel")] - let iter = (0..lde_size).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..lde_size; - - iter.map(|i| { + crate::par::par_map_collect(0..lde_size, |i| { let mut result = FieldElement::::zero(); // H terms @@ -1756,7 +1605,6 @@ pub trait IsStarkProver< result }) - .collect() } /// Computes values and validity proofs of the evaluations of the composition polynomial parts @@ -1773,7 +1621,7 @@ pub trait IsStarkProver< { let proof = composition_poly_merkle_tree .get_proof_by_pos(index) - .unwrap(); + .expect("FRI query index in bounds"); let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations .iter() @@ -1786,8 +1634,7 @@ pub trait IsStarkProver< .collect(); PolynomialOpenings { - proof: proof.clone(), - proof_sym: proof, + proof, evaluations: lde_composition_poly_parts_evaluation .clone() .into_iter() @@ -1817,13 +1664,15 @@ pub trait IsStarkProver< G: Fn(usize) -> Vec>, { let domain_size = domain.lde_roots_of_unity_coset.len() as u64; - let index = challenge * 2; - let index_sym = challenge * 2 + 1; + // Rows `2·challenge` and `2·challenge+1` are committed together as the + // single leaf at position `challenge`; one Merkle path authenticates both + // the queried row and its symmetric counterpart. PolynomialOpenings { - proof: tree.get_proof_by_pos(index).unwrap(), - proof_sym: tree.get_proof_by_pos(index_sym).unwrap(), - evaluations: gather(reverse_index(index, domain_size)), - evaluations_sym: gather(reverse_index(index_sym, domain_size)), + proof: tree + .get_proof_by_pos(challenge) + .expect("FRI query index in bounds"), + evaluations: gather(reverse_index(challenge * 2, domain_size)), + evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)), } } @@ -1889,7 +1738,7 @@ pub trait IsStarkProver< openings } - // TODO: propagate errors instead of unwrap() in commit_columns, reconstruct_round1, and expand_columns_to_lde + // TODO: propagate errors instead of unwrap() in commit_main_trace, reconstruct_round1, and expand_columns_to_lde /// Generates STARK proofs for one or more AIRs with a shared transcript. /// /// # Multi-Table Proving with LogUp @@ -1988,11 +1837,7 @@ pub trait IsStarkProver< // Spill main traces to mmap before Round 1 LDE. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { - #[cfg(feature = "parallel")] - let spill_iter = air_trace_pairs.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let mut spill_iter = air_trace_pairs.iter_mut(); - spill_iter.try_for_each(|(_, trace, _)| { + crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(_, trace, _)| { trace .main_table .spill_to_disk() @@ -2029,13 +1874,8 @@ pub trait IsStarkProver< let chunk_end = (chunk_start + k).min(num_airs); let chunk_range = chunk_start..chunk_end; - #[cfg(feature = "parallel")] - let iter = chunk_range.into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = chunk_range; - - let chunk_results: Vec> = iter - .map(|idx| { + let chunk_results: Vec> = + crate::par::par_map_collect(chunk_range, |idx| { let (air, trace, _) = &air_trace_pairs[idx]; let domain = &domains[idx]; let twiddles = &twiddle_caches[idx]; @@ -2051,8 +1891,7 @@ pub trait IsStarkProver< #[cfg(feature = "disk-spill")] storage_mode, ) - }) - .collect(); + }); // Sequential: append roots to shared transcript (Fiat-Shamir ordering) for result in chunk_results { @@ -2124,17 +1963,13 @@ pub trait IsStarkProver< // Spill all aux trace tables to mmap before any Round 1 aux LDE work. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { - #[cfg(feature = "parallel")] - let spill_iter = air_trace_pairs.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let mut spill_iter = air_trace_pairs.iter_mut(); - spill_iter.try_for_each(|(air, trace, _)| { + crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(air, trace, _)| { if air.has_aux_trace() { trace .spill_aux_to_disk() .map_err(|e| ProvingError::DiskSpill(format!("aux trace: {e}")))?; } - Ok(()) + Ok::<(), ProvingError>(()) })?; } @@ -2180,14 +2015,9 @@ pub trait IsStarkProver< let chunk_end = (chunk_start + k).min(num_airs); let chunk_range = chunk_start..chunk_end; - #[cfg(feature = "parallel")] - let iter = chunk_range.into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = chunk_range; - #[allow(clippy::type_complexity)] - let chunk_aux: Vec, ProvingError>> = iter - .map(|idx| { + let chunk_aux: Vec, ProvingError>> = + crate::par::par_map_collect(chunk_range, |idx| { let (air, trace, _) = &air_trace_pairs[idx]; let domain = &domains[idx]; let twiddles = &twiddle_caches[idx]; @@ -2200,7 +2030,11 @@ pub trait IsStarkProver< #[cfg(feature = "cuda")] { let (trace_slice, num_cols) = trace.aux_data_row_major(); - let n = if num_cols > 0 { trace_slice.len() / num_cols } else { 0 }; + let n = if num_cols > 0 { + trace_slice.len() / num_cols + } else { + 0 + }; #[cfg(feature = "instruments")] let t_sub = Instant::now(); if let Some((tree, handle, aux_data)) = @@ -2209,7 +2043,10 @@ pub trait IsStarkProver< FieldExtension, BatchedMerkleTreeBackend, >( - trace_slice, n, num_cols, domain.blowup_factor, + trace_slice, + n, + num_cols, + domain.blowup_factor, &twiddles.coset_weights, ) { @@ -2258,33 +2095,26 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let t_sub = Instant::now(); #[allow(unused_mut)] - let (mut tree, root) = Self::commit_rows_bit_reversed(&aux_data, total_cols) - .ok_or(ProvingError::EmptyCommitment)?; + let (mut tree, root) = + Self::commit_rows_bit_reversed(&aux_data, total_cols) + .ok_or(ProvingError::EmptyCommitment)?; + #[cfg(feature = "disk-spill")] + Self::spill_tree(&mut tree, storage_mode, "aux Merkle tree")?; + let commit = TableCommit::plain(tree, root); #[cfg(feature = "instruments")] crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); - #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - tree.spill_nodes_to_disk().map_err(|e| { - ProvingError::DiskSpill(format!("aux Merkle tree: {e}")) - })?; - } #[cfg(feature = "cuda")] - return Ok(( - Some(TableCommit::plain(tree, root)), - (aux_data, total_cols), - None, - )); + return Ok((Some(commit), (aux_data, total_cols), None)); #[cfg(not(feature = "cuda"))] - Ok((Some(TableCommit::plain(tree, root)), (aux_data, total_cols))) + Ok((Some(commit), (aux_data, total_cols))) } else { #[cfg(feature = "cuda")] return Ok((None, (Vec::new(), 0), None)); #[cfg(not(feature = "cuda"))] Ok((None, (Vec::new(), 0))) } - }) - .collect(); + }); // Sequential: append aux roots to forked transcripts. for (j, result) in chunk_aux.into_iter().enumerate() { @@ -2500,7 +2330,7 @@ pub trait IsStarkProver< // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations /// Executes rounds 2-4 and generates a STARK proof for the trace `main_trace` with public inputs `pub_inputs`. - /// Warning: the transcript must be safely initializated before passing it to this method. + /// Warning: the transcript must be safely initialized before passing it to this method. fn prove_rounds_2_to_4( air: &dyn AIR, pub_inputs: &PI, @@ -2514,7 +2344,7 @@ pub trait IsStarkProver< FieldElement: AsBytes, PI: Send + Sync + Clone, { - info!("Started proof generation..."); + log::debug!("Started proof generation..."); // =================================== // ==========| Round 2 |========== @@ -2628,7 +2458,7 @@ pub trait IsStarkProver< }); } - info!("End proof generation"); + log::debug!("End proof generation"); Ok(StarkProof { // [t] diff --git a/crypto/stark/src/tests/commitment_tests.rs b/crypto/stark/src/tests/commitment_tests.rs new file mode 100644 index 000000000..f1684112b --- /dev/null +++ b/crypto/stark/src/tests/commitment_tests.rs @@ -0,0 +1,96 @@ +//! Unit tests for the Merkle commitment layer (`crate::commitment`): they pin +//! the bit-reversed, row-grouped leaf byte layout that the GPU kernels and the +//! verifier's `verify_opening_pair` must match. Previously this layout was only +//! covered transitively through full prove→verify. + +use crate::commitment::{ + ROWS_PER_LEAF, commit_bit_reversed, keccak_leaves_bit_reversed, + keccak_leaves_bit_reversed_grouped, keccak_leaves_row_pair_bit_reversed, +}; +use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; +use math::fft::bit_reversing::reverse_index; +use math::field::{element::FieldElement, goldilocks::GoldilocksField}; +use math::traits::ByteConversion; + +type F = GoldilocksField; +type Felt = FieldElement; + +/// 3 columns × 8 rows of distinct, nonzero values. +fn sample_columns() -> Vec> { + (0..3u64) + .map(|c| (0..8u64).map(|r| Felt::from(100 * c + r + 1)).collect()) + .collect() +} + +/// Independent reference for one leaf, written straight from the module-doc +/// layout (`rows_per_leaf` consecutive bit-reversed rows, column-major within +/// each row, big-endian), hashed once with the same backend the prover uses. +/// Structurally separate from the production `map_init` loop, so a transposed +/// row/column order or a wrong bit-reversal in production fails this check. +fn expected_leaf(columns: &[Vec], rows_per_leaf: usize, leaf_idx: usize) -> Commitment { + let num_rows = columns[0].len(); + let byte_len = ::BYTE_LEN; + let mut buf = vec![0u8; rows_per_leaf * columns.len() * byte_len]; + let mut offset = 0; + for k in 0..rows_per_leaf { + let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64); + for col in columns { + col[br].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + } + BatchedMerkleTreeBackend::::hash_bytes(&buf) +} + +#[test] +fn grouped_leaves_match_documented_layout_for_r1_and_r2() { + let columns = sample_columns(); + let num_rows = columns[0].len(); + for &rows_per_leaf in &[1usize, 2usize] { + let leaves = keccak_leaves_bit_reversed_grouped(&columns, rows_per_leaf); + assert_eq!( + leaves.len(), + num_rows / rows_per_leaf, + "leaf count for rows_per_leaf={rows_per_leaf}" + ); + for (i, leaf) in leaves.iter().enumerate() { + assert_eq!( + *leaf, + expected_leaf(&columns, rows_per_leaf, i), + "leaf {i} for rows_per_leaf={rows_per_leaf}" + ); + } + } +} + +#[test] +fn wrappers_agree_with_grouped() { + let columns = sample_columns(); + assert_eq!( + keccak_leaves_bit_reversed(&columns), + keccak_leaves_bit_reversed_grouped(&columns, 1) + ); + assert_eq!( + keccak_leaves_row_pair_bit_reversed(&columns), + keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF) + ); +} + +#[test] +fn commit_root_matches_tree_built_over_leaves() { + let columns = sample_columns(); + let leaves = keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(leaves).unwrap(); + let (_, root) = commit_bit_reversed(&columns, ROWS_PER_LEAF).unwrap(); + assert_eq!(root, tree.root); +} + +#[test] +fn empty_and_zero_row_inputs_short_circuit() { + let empty: Vec> = vec![]; + assert!(keccak_leaves_bit_reversed_grouped(&empty, ROWS_PER_LEAF).is_empty()); + assert!(commit_bit_reversed(&empty, ROWS_PER_LEAF).is_none()); + let zero_rows: Vec> = vec![vec![]]; + assert!(keccak_leaves_bit_reversed_grouped(&zero_rows, ROWS_PER_LEAF).is_empty()); + assert!(commit_bit_reversed(&zero_rows, ROWS_PER_LEAF).is_none()); +} diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs index 8c0897ac1..7a3884832 100644 --- a/crypto/stark/src/tests/mod.rs +++ b/crypto/stark/src/tests/mod.rs @@ -2,12 +2,14 @@ pub mod air_tests; #[cfg(feature = "debug-checks")] pub mod bus_debug_tests; pub mod bus_tests; +pub mod commitment_tests; pub mod domain_cache_stats; pub mod fri_tests; pub mod grinding_tests; pub mod proof_options_tests; pub mod prove_verify_roundtrip_tests; pub mod prover_tests; +pub mod row_pair_opening_tests; pub mod small_trace_tests; #[cfg(feature = "disk-spill")] pub mod table_disk_spill_tests; diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs index ab3589702..cb7fb5c44 100644 --- a/crypto/stark/src/tests/prover_tests.rs +++ b/crypto/stark/src/tests/prover_tests.rs @@ -564,7 +564,7 @@ fn test_deep_poly_direct_2n_matches_interpolate_fft_extend() { } #[test] -fn commit_rows_bit_reversed_matches_commit_columns_bit_reversed() { +fn commit_rows_bit_reversed_matches_commit_bit_reversed() { type F = GoldilocksField; type FE = FieldElement; @@ -588,8 +588,12 @@ fn commit_rows_bit_reversed_matches_commit_columns_bit_reversed() { } } - let (_, root_col) = Prover::::commit_columns_bit_reversed(&columns) - .expect("column-major commit must succeed"); + // Both commits are row-pair (ROWS_PER_LEAF=2): the column-major + // `commitment` path and the row-major prover path must produce the + // same Merkle root (identical leaf bytes, only the read pattern differs). + let (_, root_col) = + crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF) + .expect("column-major commit must succeed"); let (_, root_row) = Prover::::commit_rows_bit_reversed(&row_major, num_cols) .expect("row-major commit must succeed"); diff --git a/crypto/stark/src/tests/row_pair_opening_tests.rs b/crypto/stark/src/tests/row_pair_opening_tests.rs new file mode 100644 index 000000000..93423f49f --- /dev/null +++ b/crypto/stark/src/tests/row_pair_opening_tests.rs @@ -0,0 +1,73 @@ +//! Negative tests for the row-pair trace opening verification +//! (`verifier::verify_opening_pair`). The row pair `(2·iota, 2·iota+1)` is +//! committed as a single Merkle leaf, so one `proof` authenticates both +//! `evaluations` and `evaluations_sym`. Removing the old separate `proof_sym` +//! opening deleted the "symmetric opening mismatch" rejection class; these +//! tests restore it — an implementation that ignored `evaluations_sym` or the +//! authentication path would otherwise pass every other test. + +use crate::tests::trace_test_helpers::make_valid_simple_proof; +use crate::verifier::{IsStarkVerifier, Verifier}; +use crypto::fiat_shamir::default_transcript::DefaultTranscript; +use math::field::{element::FieldElement, goldilocks::GoldilocksField}; + +type Felt = FieldElement; + +/// Tampering the value at the symmetric LDE position must break verification: +/// the committed leaf hashed `evaluations ‖ evaluations_sym`, so a perturbed +/// `evaluations_sym` no longer reconstructs the committed leaf. +#[test_log::test] +fn test_verify_rejects_tampered_main_trace_evaluations_sym() { + let (air, mut proof) = make_valid_simple_proof(); + + let opening = proof + .deep_poly_openings + .first_mut() + .expect("test precondition: a valid proof has at least one deep poly opening"); + assert!( + !opening.main_trace_polys.evaluations_sym.is_empty(), + "test precondition: the main-trace opening has at least one symmetric evaluation", + ); + // Perturb (not resize) the first symmetric evaluation. + opening.main_trace_polys.evaluations_sym[0] = + &opening.main_trace_polys.evaluations_sym[0] + Felt::one(); + + assert!( + !Verifier::verify( + &proof, + &air, + &mut DefaultTranscript::::new(&[]) + ), + "Verifier must reject a tampered symmetric trace evaluation" + ); +} + +/// The row-pair Merkle authentication path itself must be checked. Corrupting a +/// node in `main_trace_polys.proof.merkle_path` is caught ONLY by +/// `verify_opening_pair` (the deep-composition reconstruction does not touch the +/// auth path), so this proves the single row-pair path is actually authenticated +/// against the committed root rather than ignored. +#[test_log::test] +fn test_verify_rejects_tampered_main_trace_merkle_path() { + let (air, mut proof) = make_valid_simple_proof(); + + let opening = proof + .deep_poly_openings + .first_mut() + .expect("test precondition: a valid proof has at least one deep poly opening"); + let path = &mut opening.main_trace_polys.proof.merkle_path; + assert!( + !path.is_empty(), + "test precondition: the row-pair trace tree has a non-trivial authentication path", + ); + path[0][0] ^= 0x01; + + assert!( + !Verifier::verify( + &proof, + &air, + &mut DefaultTranscript::::new(&[]) + ), + "Verifier must reject a corrupted main-trace Merkle authentication path" + ); +} diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs index 8373ae9d6..96e04858d 100644 --- a/crypto/stark/src/tests/small_trace_tests.rs +++ b/crypto/stark/src/tests/small_trace_tests.rs @@ -11,37 +11,13 @@ use crate::{ }, proof::options::ProofOptions, prover::{IsStarkProver, Prover}, + tests::trace_test_helpers::make_valid_simple_proof, traits::AIR, verifier::{IsStarkVerifier, Verifier}, }; type Felt = FieldElement; -fn make_valid_simple_proof() -> ( - SimpleAdditionAIR, - crate::proof::stark::StarkProof< - GoldilocksField, - GoldilocksField, - SimpleAdditionPublicInputs, - >, -) { - let mut trace = simple_addition_trace::(2); - let proof_options = ProofOptions::default_test_options(); - let pub_inputs = SimpleAdditionPublicInputs { - a: Felt::from(1u64), - b: Felt::from(2u64), - }; - let air = SimpleAdditionAIR::::new(&proof_options); - let proof = Prover::prove( - &air, - &mut trace, - &pub_inputs, - &mut DefaultTranscript::::new(&[]), - ) - .unwrap(); - (air, proof) -} - /// Test STARK prove/verify with a single-row trace. /// This exercises the FRI protocol with 0 FRI layers (trace_length=1, number_layers=0). #[test_log::test] diff --git a/crypto/stark/src/tests/trace_test_helpers.rs b/crypto/stark/src/tests/trace_test_helpers.rs index e62d0d3ec..4ef6455b3 100644 --- a/crypto/stark/src/tests/trace_test_helpers.rs +++ b/crypto/stark/src/tests/trace_test_helpers.rs @@ -1,8 +1,16 @@ +use crate::examples::simple_addition::{ + SimpleAdditionAIR, SimpleAdditionPublicInputs, simple_addition_trace, +}; +use crate::proof::options::ProofOptions; +use crate::prover::{IsStarkProver, Prover}; use crate::table::Table; use crate::trace::{TraceTable, compute_frame_evaluation_points}; +use crate::traits::AIR; +use crypto::fiat_shamir::default_transcript::DefaultTranscript; use itertools::Itertools; use math::field::{ element::FieldElement, + goldilocks::GoldilocksField, traits::{IsField, IsSubFieldOf}, }; use math::polynomial::Polynomial; @@ -10,6 +18,34 @@ use math::polynomial::Polynomial; #[cfg(feature = "parallel")] use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; +/// Builds a valid 2-row `SimpleAddition` proof. Shared base for the +/// proof-tamper / rejection tests in `small_trace_tests` and +/// `row_pair_opening_tests`. +pub fn make_valid_simple_proof() -> ( + SimpleAdditionAIR, + crate::proof::stark::StarkProof< + GoldilocksField, + GoldilocksField, + SimpleAdditionPublicInputs, + >, +) { + let mut trace = simple_addition_trace::(2); + let proof_options = ProofOptions::default_test_options(); + let pub_inputs = SimpleAdditionPublicInputs { + a: FieldElement::from(1u64), + b: FieldElement::from(2u64), + }; + let air = SimpleAdditionAIR::::new(&proof_options); + let proof = Prover::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .unwrap(); + (air, proof) +} + /// Reference Horner-based trace-evaluation used as an oracle by the prover /// tests (`tests::prover_tests`). The production prover uses the LDE-based /// barycentric `get_trace_evaluations_from_lde`; the two are diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index da4a53f6e..72b77947a 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -645,22 +645,6 @@ where Table::new(table_data, table_width) } -pub fn columns2rows(columns: Vec>) -> Vec> -where - F: Clone, -{ - let num_rows = columns[0].len(); - let num_cols = columns.len(); - - (0..num_rows) - .map(|row_index| { - (0..num_cols) - .map(|col_index| columns[col_index][row_index].clone()) - .collect() - }) - .collect() -} - pub(crate) fn compute_frame_evaluation_points( x: &FieldElement, frame_offsets: &[usize], diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 68819c76b..03119f617 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -301,25 +301,11 @@ pub trait IsStarkVerifier< domain.lde_coset_element(reverse_index(raw, domain.lde_length as u64)) } - /// Verifies the validity of the opening proof. - fn verify_opening( - proof: &Proof, - root: &Commitment, - index: usize, - value: &[FieldElement], - ) -> bool - where - FieldElement: AsBytes + Sync + Send, - FieldElement: AsBytes + Sync + Send, - E: IsField, - Field: IsSubFieldOf, - { - proof.verify::>(root, index, &value.to_owned()) - } - - /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings - /// of a `PolynomialOpenings` against the given `root` at iota positions - /// `iota*2` and `iota*2 + 1`. + /// Verify a row-paired `PolynomialOpenings` against `root`. The row pair + /// (`2·iota`, `2·iota+1`) is committed as the single leaf at position `iota`, + /// so one Merkle path authenticates both rows: reconstruct the leaf from + /// `evaluations ‖ evaluations_sym` and verify once. (Same as the composition + /// opening check.) fn verify_opening_pair( opening: &PolynomialOpenings, root: &Commitment, @@ -331,13 +317,11 @@ pub trait IsStarkVerifier< E: IsField, Field: IsSubFieldOf, { - Self::verify_opening::(&opening.proof, root, iota * 2, &opening.evaluations) - && Self::verify_opening::( - &opening.proof_sym, - root, - iota * 2 + 1, - &opening.evaluations_sym, - ) + let mut value = opening.evaluations.clone(); + value.extend_from_slice(&opening.evaluations_sym); + opening + .proof + .verify::>(root, iota, &value) } /// Verify opening Open(tⱼ(D_LDE), 𝜐) and Open(tⱼ(D_LDE), -𝜐) for all trace polynomials tⱼ, diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs index a33fd3dad..aa5d1caa4 100644 --- a/prover/src/instruments.rs +++ b/prover/src/instruments.rs @@ -181,7 +181,7 @@ pub fn print_report( let mut sub_ops: Vec<(&str, Duration)> = vec![ ("R2 evaluate", total_constraints), ("R2 decompose_and_extend_d2", total_comp_decompose), - ("R2 commit_composition_poly", total_comp_commit), + ("R2 commit_bit_reversed (comp-poly)", total_comp_commit), ("R3 OOD evaluation", total_ood), ("R4 deep_composition_poly_evals", total_deep_comp), ("R4 interpolate+evaluate_fft", total_deep_extend), diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs index 468e2a5b2..c4871765f 100644 --- a/prover/src/tables/bitwise.rs +++ b/prover/src/tables/bitwise.rs @@ -25,13 +25,13 @@ //! All lookups are provided as receivers with negative multiplicity, //! meaning other tables send to this table. -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; #[cfg(feature = "parallel")] use rayon::prelude::*; @@ -195,19 +195,19 @@ pub const fn is_preprocessed() -> bool { fn static_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xfb, 0x46, 0xff, 0x1c, 0xed, 0x4c, 0x97, 0xfb, 0xb2, 0x17, 0x55, 0x24, 0x08, 0x04, - 0x15, 0xee, 0xbe, 0xa6, 0xee, 0x86, 0x69, 0xaf, 0x3a, 0x4f, 0x9e, 0x2a, 0x44, 0x81, - 0xf9, 0xb0, 0xf3, 0xff, + 0xfa, 0x3e, 0xcf, 0x80, 0xfd, 0x95, 0xe5, 0x09, 0x74, 0xd4, 0x55, 0x23, 0xf6, 0x42, + 0xb6, 0x4b, 0x05, 0xc4, 0xf9, 0x66, 0xc2, 0x4d, 0xff, 0xda, 0x31, 0x47, 0xab, 0x7b, + 0x0c, 0x6d, 0xc4, 0xcf, ]), 4 => Some([ - 0xb5, 0xc4, 0xc0, 0x80, 0x03, 0x5b, 0xb6, 0x12, 0x78, 0x8c, 0x4d, 0xd4, 0x9e, 0x3d, - 0xc4, 0xe2, 0xef, 0x95, 0xf0, 0xbf, 0xe8, 0x1d, 0x98, 0xec, 0x7f, 0x58, 0x3a, 0x47, - 0x18, 0x03, 0x7e, 0xa5, + 0xff, 0x76, 0x8e, 0x85, 0x4b, 0xdc, 0x32, 0x61, 0x96, 0x16, 0x15, 0x19, 0x73, 0x70, + 0xf0, 0x64, 0x81, 0xfd, 0x4f, 0x5c, 0xbd, 0x9c, 0x30, 0x26, 0xd5, 0xc0, 0x81, 0xf3, + 0xce, 0x38, 0x50, 0x3e, ]), 8 => Some([ - 0x8a, 0x18, 0x70, 0x51, 0x34, 0x1a, 0x65, 0xaa, 0x79, 0x17, 0x07, 0x9a, 0xf3, 0x0b, - 0xcb, 0xd0, 0x7c, 0xe3, 0x2a, 0xce, 0x89, 0x9a, 0xfd, 0xc8, 0x0d, 0x6b, 0x48, 0x43, - 0x83, 0x5d, 0x18, 0xb8, + 0x0e, 0x1b, 0xc1, 0x0d, 0xae, 0x64, 0xe7, 0xca, 0xe0, 0x2a, 0x3b, 0xab, 0xd7, 0xd2, + 0xbb, 0x80, 0xd5, 0x24, 0x5a, 0xce, 0x25, 0xb6, 0x84, 0x77, 0x9c, 0xb5, 0xeb, 0x67, + 0x61, 0x82, 0x78, 0x3d, ]), _ => None, } @@ -283,7 +283,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { let coset_offset = FE::from(options.coset_offset); #[cfg(feature = "parallel")] - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .par_iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -292,7 +292,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { .collect(); #[cfg(not(feature = "parallel"))] - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -300,25 +300,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { }) .collect(); - // Step 4: Bit-reverse permute (parallel) - #[cfg(feature = "parallel")] - lde_columns.par_iter_mut().for_each(|col| { - in_place_bit_reverse_permute(col); - }); - - #[cfg(not(feature = "parallel"))] - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Step 5: Convert columns to rows for Merkle tree - let lde_rows = columns2rows(lde_columns); - - // Step 6: Build Merkle tree over LDE (N * blowup leaves) - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for bitwise LDE"); - - tree.root + root } /// Returns the preprocessed commitment for the bitwise table. diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs index 7bc3c9106..509f86991 100644 --- a/prover/src/tables/decode.rs +++ b/prover/src/tables/decode.rs @@ -34,13 +34,13 @@ use executor::elf::Elf; use executor::vm::instruction::decoding::{Instruction, InstructionError}; use executor::vm::memory::U64HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable}; @@ -285,7 +285,7 @@ pub fn compute_precomputed_commitment( // Step 4: Evaluate polynomials on LDE domain (N * blowup_factor points) let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -293,19 +293,9 @@ pub fn compute_precomputed_commitment( }) .collect(); - // Step 5: Bit-reverse permute (same as prover) - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Step 6: Convert columns to rows for Merkle tree - let lde_rows = columns2rows(lde_columns); - - // Step 7: Build Merkle tree over LDE (N * blowup leaves) - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for decode LDE"); - - tree.root + root } // ========================================================================= diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs index 3575c8ba1..f9f0d1cc4 100644 --- a/prover/src/tables/keccak_rc.rs +++ b/prover/src/tables/keccak_rc.rs @@ -8,13 +8,13 @@ //! committed via a static lookup table (with recompute as fallback for //! `ProofOptions` not covered by the static table). -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use executor::vm::instruction::execution::KECCAK_RC; @@ -97,19 +97,19 @@ pub const fn generate_row(round: usize) -> [u64; NUM_PRECOMPUTED_COLS] { fn static_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xe8, 0x06, 0x8b, 0xb2, 0xbd, 0x3d, 0x80, 0xf3, 0x92, 0x95, 0x31, 0x1a, 0xfd, 0x55, - 0xba, 0x12, 0x3f, 0x76, 0xeb, 0x44, 0x32, 0x57, 0x9d, 0xb7, 0x7f, 0x1e, 0x63, 0xb4, - 0x98, 0xb5, 0xb0, 0xb7, + 0xab, 0x7a, 0xad, 0xf5, 0xbf, 0xa2, 0xd5, 0x5c, 0x29, 0x83, 0x83, 0xe6, 0x2e, 0x47, + 0xa0, 0xa5, 0x22, 0xf9, 0x57, 0x89, 0x5a, 0x5c, 0xbb, 0x1f, 0x34, 0xbc, 0x21, 0x72, + 0xa9, 0x2c, 0x85, 0xe3, ]), 4 => Some([ - 0xa9, 0xfb, 0xc9, 0x15, 0x1c, 0x22, 0x75, 0xe7, 0x56, 0xeb, 0x6d, 0xf9, 0xfe, 0x83, - 0x2a, 0xb1, 0xa7, 0x1a, 0x20, 0x71, 0x9b, 0x0c, 0xff, 0x6b, 0x3f, 0x57, 0xc6, 0x84, - 0x3e, 0xbf, 0xc8, 0xaa, + 0xfb, 0x42, 0x58, 0x76, 0xf4, 0x30, 0x98, 0x04, 0xef, 0x8c, 0x4e, 0x65, 0xf7, 0x1a, + 0x29, 0x03, 0xd2, 0xc6, 0x12, 0x0d, 0x18, 0xe1, 0x28, 0x6e, 0x70, 0xeb, 0xa8, 0x11, + 0x3c, 0x3e, 0xe1, 0xdd, ]), 8 => Some([ - 0x5c, 0x30, 0xf6, 0xa0, 0xcf, 0x78, 0x43, 0x15, 0x5b, 0x5d, 0x18, 0x34, 0x44, 0xba, - 0x81, 0x9a, 0x64, 0x05, 0x5c, 0x79, 0x26, 0x18, 0x09, 0x24, 0x6b, 0xa2, 0x3f, 0x5f, - 0x77, 0x09, 0xd5, 0xfc, + 0x9f, 0x91, 0xaf, 0xb6, 0x5b, 0x75, 0x1e, 0xfb, 0x73, 0x93, 0x2c, 0xc4, 0xa8, 0xe1, + 0xb5, 0x21, 0x91, 0x5d, 0x6a, 0x19, 0x2e, 0x1d, 0xa8, 0x80, 0x21, 0x1f, 0x36, 0x76, + 0x9b, 0x8e, 0x3d, 0xb6, ]), _ => None, } @@ -144,7 +144,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { // Evaluate on LDE domain let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -152,17 +152,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { }) .collect(); - // Bit-reverse permute - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Build Merkle tree - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for keccak_rc LDE"); - - tree.root + root } /// Returns the preprocessed commitment for the keccak_rc table. diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs index 174225ffa..2d1059bcc 100644 --- a/prover/src/tables/page.rs +++ b/prover/src/tables/page.rs @@ -32,13 +32,13 @@ use std::collections::HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable}; @@ -250,19 +250,19 @@ pub fn generate_page_trace( pub(crate) fn static_zero_page_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xf9, 0x80, 0x0e, 0x45, 0x72, 0x5a, 0x8e, 0x8e, 0x5e, 0xd7, 0x5b, 0x60, 0xce, 0xd0, - 0x8e, 0xa3, 0x27, 0x3b, 0x8a, 0xb5, 0x98, 0xc0, 0xe3, 0x16, 0xf6, 0x86, 0x75, 0x39, - 0x4c, 0xe5, 0x88, 0x5e, + 0x7d, 0x74, 0x85, 0xf0, 0x2b, 0x74, 0xe0, 0x3f, 0x14, 0x99, 0xb3, 0xa0, 0x5f, 0x1d, + 0x6e, 0xf2, 0x21, 0xff, 0xaf, 0x24, 0x7e, 0x30, 0xb0, 0xda, 0x48, 0x79, 0xe1, 0x43, + 0xee, 0xea, 0x6a, 0x0f, ]), 4 => Some([ - 0x0f, 0xb5, 0x0c, 0xa8, 0x3b, 0x69, 0x4f, 0x91, 0x60, 0xbf, 0x0d, 0x0d, 0xd3, 0x33, - 0x25, 0x38, 0x11, 0xbb, 0xf8, 0xfd, 0x54, 0xbd, 0x06, 0x7d, 0xd1, 0xeb, 0xa3, 0x58, - 0xe8, 0x37, 0x45, 0x56, + 0x5c, 0xcc, 0x5b, 0xb1, 0xe8, 0x11, 0x91, 0x81, 0xbd, 0xdd, 0x39, 0x40, 0x77, 0x87, + 0xdc, 0x98, 0x06, 0x06, 0x8c, 0x63, 0xcd, 0xfd, 0xf1, 0xda, 0x4a, 0x55, 0x31, 0x4d, + 0x6a, 0x16, 0x18, 0xd0, ]), 8 => Some([ - 0x4a, 0xfb, 0xc9, 0x6d, 0x46, 0x29, 0xa3, 0xc2, 0x36, 0x14, 0xd8, 0x24, 0x3e, 0xef, - 0x97, 0x3f, 0xe1, 0xda, 0x2b, 0xf7, 0x87, 0xb6, 0x54, 0xe1, 0xc6, 0x46, 0xc0, 0x85, - 0x96, 0x7f, 0x7f, 0x48, + 0xf0, 0xc0, 0x69, 0xed, 0xf8, 0x59, 0xd6, 0x56, 0x15, 0x3c, 0x2f, 0x93, 0x65, 0xd6, + 0xe9, 0xe9, 0x8e, 0xd1, 0x83, 0x94, 0xf9, 0x75, 0x59, 0xd1, 0xec, 0x16, 0xe1, 0x37, + 0xd5, 0x32, 0xd6, 0xd9, ]), _ => None, } @@ -315,7 +315,7 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -323,14 +323,9 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption }) .collect(); - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for page LDE"); - tree.root + root } /// Returns the zero-init PAGE preprocessed commitment. diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs index 09485595a..46c675b65 100644 --- a/prover/src/tables/register.rs +++ b/prover/src/tables/register.rs @@ -20,13 +20,13 @@ use std::collections::HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; #[cfg(test)] use executor::vm::registers::Registers; @@ -336,7 +336,7 @@ fn commit_register_columns(options: &ProofOptions, columns: Vec>) -> Com let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -344,14 +344,9 @@ fn commit_register_columns(options: &ProofOptions, columns: Vec>) -> Com }) .collect(); - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for register LDE"); - tree.root + root } /// Returns the preprocessed commitment for the REGISTER table. diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs index 43e6991cf..a761ac929 100644 --- a/prover/src/tests/decode_tests.rs +++ b/prover/src/tests/decode_tests.rs @@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() { /// AIR or FFT pipeline changes, this drifts and the test fails — /// regenerate via the `print_decode_commitment_for_sub` helper below. const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [ - 0x60, 0x66, 0x0b, 0x18, 0x0d, 0x41, 0x08, 0xb3, 0x3a, 0x03, 0x99, 0x03, 0x8c, 0x9d, 0x12, 0x57, - 0x68, 0x8d, 0xed, 0x13, 0x60, 0xeb, 0x1d, 0x2b, 0xa8, 0xea, 0x1c, 0x76, 0xc9, 0xdd, 0x25, 0xaf, + 0xe9, 0x71, 0x68, 0xd6, 0x2e, 0xb1, 0xf6, 0x56, 0x61, 0x9d, 0x04, 0x6e, 0x65, 0xed, 0x63, 0x4a, + 0x27, 0xa3, 0x4d, 0xcb, 0x6c, 0x02, 0x11, 0xd7, 0x65, 0xc9, 0xc9, 0xfd, 0x59, 0x34, 0x41, 0x5f, ]; #[test] diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs index cf9bc742c..8033828bf 100644 --- a/prover/tests/cuda_path_integration.rs +++ b/prover/tests/cuda_path_integration.rs @@ -47,7 +47,10 @@ fn gpu_path_fires_end_to_end() { "R2 GPU composition LDE did not fire (neither two-halves d2 nor parts>2 path)" ); - // R2 comp-poly Merkle tree build, paired with the parts LDE above. + // R2 comp-poly Merkle tree build. Dispatched unconditionally (independent of + // the parts-count branch above), so it fires for the common degree-2 case + // too; a silent CPU fallback would still verify, so this counter is what + // guards the GPU comp-poly-tree dispatch. assert!( gpu_comp_poly_tree_calls() > 0, "R2 GPU comp-poly tree did not fire" @@ -72,3 +75,24 @@ fn gpu_path_fires_end_to_end() { let ok = verify(&proof, &elf).expect("verify"); assert!(ok, "GPU-produced proof failed verification"); } + +/// Focused validation of the GPU row-pair trace commitment: proves a large +/// trace with the GPU path and verifies the resulting proof. Independent of the +/// per-round counter assertions in `gpu_path_fires_end_to_end` (the R2 parts-LDE +/// assertion bit-rotted on main and cuts off before the verify). A wrong GPU +/// trace-commit leaf layout (1-row vs the new row-pair) would fail verification. +#[test] +#[ignore = "requires GPU; run with --ignored --nocapture"] +fn gpu_proof_verifies_row_pair_commitment() { + let elf = asm_elf_bytes("fib_iterative_1M"); + reset_all_gpu_call_counters(); + let proof = prove(&elf).expect("prove"); + assert!( + gpu_lde_calls() > 0, + "GPU LDE path did not fire (silent CPU fallback would not test the GPU commit)" + ); + assert!( + verify(&proof, &elf).expect("verify"), + "GPU-produced proof (row-pair commitment) failed verification" + ); +}