From faffc88aefedf2aa1a8ba18717f2027953ee2403 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 29 Jun 2026 17:45:01 -0300 Subject: [PATCH 1/5] Revert unrelated FxHashMap op-dedup change (out of scope for #715) The FxHasher/FxHashMap op-dedup micro-optimization is unrelated to the row-major GPU LDE rework and was only applied to 4 of 6 dedup tables. Revert the table maps to std HashMap and drop the hasher; it can land as its own focused PR. --- prover/src/tables/branch.rs | 8 +++--- prover/src/tables/dvrm.rs | 8 ++++-- prover/src/tables/lt.rs | 8 +++--- prover/src/tables/mul.rs | 8 ++++-- prover/src/tables/types.rs | 55 ------------------------------------- 5 files changed, 18 insertions(+), 69 deletions(-) diff --git a/prover/src/tables/branch.rs b/prover/src/tables/branch.rs index 6ecd42e78..9443a81a1 100644 --- a/prover/src/tables/branch.rs +++ b/prover/src/tables/branch.rs @@ -33,9 +33,9 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing} use stark::table::TableView; use stark::trace::TraceTable; -use super::types::{ - BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op, -}; +use std::collections::HashMap; + +use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op}; // ========================================================================= // Column indices for BRANCH table @@ -161,7 +161,7 @@ pub fn generate_branch_trace( operations: &[BranchOperation], ) -> TraceTable { // Deduplicate operations: (pc, offset, register, jalr) -> multiplicity - let mut op_map: FxHashMap = FxHashMap::default(); + let mut op_map: HashMap = HashMap::new(); for op in operations { *op_map.entry(op.clone()).or_insert(0) += 1; } diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs index 2dcaae453..3da78dff5 100644 --- a/prover/src/tables/dvrm.rs +++ b/prover/src/tables/dvrm.rs @@ -36,9 +36,11 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing} use stark::table::TableView; use stark::trace::TraceTable; +use std::collections::HashMap; + use super::types::{ - BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, NEG_INV_2_16, NEG_INV_2_32, - NEG_INV_2_48, NEG_INV_2_64, SHIFT_16, VmTable, alu_op, + BusId, FE, GoldilocksExtension, GoldilocksField, NEG_INV_2_16, NEG_INV_2_32, NEG_INV_2_48, + NEG_INV_2_64, SHIFT_16, VmTable, alu_op, }; // ========================================================================= @@ -286,7 +288,7 @@ pub fn generate_dvrm_trace( operations: &[(DvrmOperation, bool)], ) -> TraceTable { // Deduplicate: (n, d, signed) -> (mu_q, mu_r) - let mut op_map: FxHashMap = FxHashMap::default(); + let mut op_map: HashMap = HashMap::new(); for (op, wants_remainder) in operations { let entry = op_map.entry(op.clone()).or_default(); diff --git a/prover/src/tables/lt.rs b/prover/src/tables/lt.rs index be0b60773..02ed029bd 100644 --- a/prover/src/tables/lt.rs +++ b/prover/src/tables/lt.rs @@ -33,9 +33,9 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing} use stark::table::TableView; use stark::trace::TraceTable; -use super::types::{ - BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op, -}; +use std::collections::HashMap; + +use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op}; // ========================================================================= // Column indices for LT table @@ -164,7 +164,7 @@ pub fn generate_lt_trace( operations: &[LtOperation], ) -> TraceTable { // Deduplicate operations: (lhs, rhs, signed) -> multiplicity - let mut op_map: FxHashMap = FxHashMap::default(); + let mut op_map: HashMap = HashMap::new(); for op in operations { *op_map.entry(op.clone()).or_insert(0) += 1; } diff --git a/prover/src/tables/mul.rs b/prover/src/tables/mul.rs index 197a0d334..33679211c 100644 --- a/prover/src/tables/mul.rs +++ b/prover/src/tables/mul.rs @@ -37,9 +37,11 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing} use stark::table::TableView; use stark::trace::TraceTable; +use std::collections::HashMap; + use super::types::{ - BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, INV_2_32, INV_2_64, INV_2_96, - INV_2_128, NEG_INV_2_16, NEG_INV_2_32, NEG_INV_2_48, NEG_INV_2_64, NEG_INV_2_80, NEG_INV_2_96, + BusId, FE, GoldilocksExtension, GoldilocksField, INV_2_32, INV_2_64, INV_2_96, INV_2_128, + NEG_INV_2_16, NEG_INV_2_32, NEG_INV_2_48, NEG_INV_2_64, NEG_INV_2_80, NEG_INV_2_96, NEG_INV_2_112, NEG_INV_2_128, SHIFT_16, VmTable, alu_op, }; @@ -294,7 +296,7 @@ pub fn generate_mul_trace( operations: &[(MulOperation, bool)], ) -> TraceTable { // Deduplicate: (lhs, lhs_signed, rhs, rhs_signed) -> (mu_lo, mu_hi) - let mut op_map: FxHashMap = FxHashMap::default(); + let mut op_map: HashMap = HashMap::new(); for (op, wants_hi) in operations { let entry = op_map.entry(op.clone()).or_default(); diff --git a/prover/src/tables/types.rs b/prover/src/tables/types.rs index 2cd5db0f0..d6091d0fd 100644 --- a/prover/src/tables/types.rs +++ b/prover/src/tables/types.rs @@ -968,61 +968,6 @@ impl DecodeEntry { } } -// ========================================================================= -// Fast hashing for op-dedup multiplicity maps -// ========================================================================= - -/// Fast non-cryptographic hash for the op-dedup hot path. Skipping SipHash is -/// safe here: the maps are per-chunk (bounded ≤ `max_rows`), keyed by the -/// prover's own trace, and collisions only cost probes, never soundness. -#[derive(Default)] -pub struct FxHasher(u64); - -impl FxHasher { - const SEED: u64 = 0x51_7c_c1_b7_27_22_0a_95; - - #[inline] - fn add(&mut self, word: u64) { - self.0 = (self.0.rotate_left(5) ^ word).wrapping_mul(Self::SEED); - } -} - -impl std::hash::Hasher for FxHasher { - #[inline] - fn write(&mut self, bytes: &[u8]) { - for &b in bytes { - self.add(b as u64); - } - } - #[inline] - fn write_u8(&mut self, i: u8) { - self.add(i as u64); - } - #[inline] - fn write_u16(&mut self, i: u16) { - self.add(i as u64); - } - #[inline] - fn write_u32(&mut self, i: u32) { - self.add(i as u64); - } - #[inline] - fn write_u64(&mut self, i: u64) { - self.add(i); - } - #[inline] - fn write_usize(&mut self, i: usize) { - self.add(i as u64); - } - #[inline] - fn finish(&self) -> u64 { - self.0 - } -} - -/// `HashMap` keyed with [`FxHasher`]. -pub type FxHashMap = std::collections::HashMap>; - /// The fully sign-extended 64-bit immediate for an instruction (0 when none). fn imm_from_instruction(instruction: Instruction) -> u64 { match instruction { From d90958997bbe8459fb7df77eb2c01958db9296b2 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 29 Jun 2026 17:45:01 -0300 Subject: [PATCH 2/5] Remove redundant gpu_and_cpu_proofs_both_verify test The GPU full path is covered by the normal prove/verify suite built with --features cuda (plus gpu_path_fires_end_to_end), the CPU path by the non-cuda suite, and GPU/CPU equivalence by the merkle/barycentric parity tests. Its force-CPU leg also never ran on CPU: gpu_lde_threshold() only re-read the env var under cfg(test), but from the prover integration crate stark compiles without cfg(test), so the OnceLock cached the first value. Simplify gpu_lde_threshold() to a single cached impl now that the per-call re-read has no consumer. --- crypto/stark/src/gpu_lde.rs | 18 +++--------------- prover/tests/cuda_path_integration.rs | 22 ---------------------- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 60e8dacc6..29e9b94e6 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -41,25 +41,13 @@ use crate::trace::LDETraceTable; const DEFAULT_GPU_LDE_THRESHOLD: usize = 1 << 19; fn gpu_lde_threshold() -> usize { - // In test builds re-read the env var on every call so tests can switch - // between GPU and CPU paths in the same process (OnceLock can't be reset). - #[cfg(test)] - { + static CACHED: OnceLock = OnceLock::new(); + *CACHED.get_or_init(|| { std::env::var("LAMBDA_VM_GPU_LDE_THRESHOLD") .ok() .and_then(|s| s.parse().ok()) .unwrap_or(DEFAULT_GPU_LDE_THRESHOLD) - } - #[cfg(not(test))] - { - static CACHED: OnceLock = OnceLock::new(); - *CACHED.get_or_init(|| { - std::env::var("LAMBDA_VM_GPU_LDE_THRESHOLD") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(DEFAULT_GPU_LDE_THRESHOLD) - }) - } + }) } /// Incremented by the `try_expand_*` functions per base-field column handed to diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs index 6de671cf1..0f7c1f3c7 100644 --- a/prover/tests/cuda_path_integration.rs +++ b/prover/tests/cuda_path_integration.rs @@ -66,25 +66,3 @@ fn gpu_path_fires_end_to_end() { let ok = verify(&proof, &elf).expect("verify"); assert!(ok, "GPU-produced proof failed verification"); } - -#[test] -#[ignore = "requires GPU; run with --ignored --nocapture"] -fn gpu_and_cpu_proofs_both_verify() { - let elf = asm_elf_bytes("fib_iterative_1M"); - - let proof_gpu = prove(&elf).expect("GPU prove"); - assert!( - verify(&proof_gpu, &elf).expect("GPU verify"), - "GPU proof failed" - ); - - // Force CPU path by pushing the GPU threshold above any real table size. - // SAFETY: no other thread reads this env var during the test. - unsafe { std::env::set_var("LAMBDA_VM_GPU_LDE_THRESHOLD", "999999999") }; - let proof_cpu = prove(&elf).expect("CPU prove"); - unsafe { std::env::remove_var("LAMBDA_VM_GPU_LDE_THRESHOLD") }; - assert!( - verify(&proof_cpu, &elf).expect("CPU verify"), - "CPU proof failed" - ); -} From 8c4ebf5bae52435ab73a18e3395e7eede1c8a438 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 29 Jun 2026 17:45:01 -0300 Subject: [PATCH 3/5] Fix stale docs and remove dead code keccak.cu: move keccak256_leaves_base_row_major out of keccak_merkle_level's doc block so the child-pair->parent doc rejoins its kernel. prover.rs: delete columns_to_row_major, which has no callers after the row-major GPU path stopped materializing GPU-expanded columns. --- crypto/math-cuda/kernels/keccak.cu | 62 +++++++++++++++--------------- crypto/stark/src/prover.rs | 26 ------------- 2 files changed, 31 insertions(+), 57 deletions(-) diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu index 86c0e519e..9937d7c6e 100644 --- a/crypto/math-cuda/kernels/keccak.cu +++ b/crypto/math-cuda/kernels/keccak.cu @@ -317,6 +317,37 @@ extern "C" __global__ void keccak_fri_leaves_ext3( // children: nodes[parent_begin + n_pairs .. parent_begin + 3 * n_pairs] // parents: nodes[parent_begin .. parent_begin + n_pairs] // +// Each thread hashes one child pair → one parent. Keccak-256 of the +// concatenation of two 32-byte siblings, identical to +// `FieldElementVectorBackend::hash_new_parent` on host. +// --------------------------------------------------------------------------- +extern "C" __global__ void keccak_merkle_level( + uint8_t *nodes, + uint64_t parent_begin, // node index (counted in 32-byte nodes) + uint64_t n_pairs) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n_pairs) return; + + uint64_t st[25]; + #pragma unroll + for (int i = 0; i < 25; ++i) st[i] = 0; + + uint32_t rate_pos = 0; + // `nodes` comes from cuMemAlloc (256-byte aligned); each 32-byte node + // sits at a 32-byte-aligned offset, so the u64 cast is safe. + const uint64_t *left = reinterpret_cast( + nodes + (parent_begin + n_pairs + 2 * tid) * 32); + #pragma unroll + for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, left[i]); + + const uint64_t *right = reinterpret_cast( + nodes + (parent_begin + n_pairs + 2 * tid + 1) * 32); + #pragma unroll + for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, right[i]); + + finalize_keccak256(st, rate_pos, nodes + (parent_begin + tid) * 32); +} + // --------------------------------------------------------------------------- // Row-major base leaf hashing. // @@ -350,34 +381,3 @@ extern "C" __global__ void keccak256_leaves_base_row_major( } finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); } - -// Each thread hashes one child pair → one parent. Keccak-256 of the -// concatenation of two 32-byte siblings, identical to -// `FieldElementVectorBackend::hash_new_parent` on host. -// --------------------------------------------------------------------------- -extern "C" __global__ void keccak_merkle_level( - uint8_t *nodes, - uint64_t parent_begin, // node index (counted in 32-byte nodes) - uint64_t n_pairs) { - uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= n_pairs) return; - - uint64_t st[25]; - #pragma unroll - for (int i = 0; i < 25; ++i) st[i] = 0; - - uint32_t rate_pos = 0; - // `nodes` comes from cuMemAlloc (256-byte aligned); each 32-byte node - // sits at a 32-byte-aligned offset, so the u64 cast is safe. - const uint64_t *left = reinterpret_cast( - nodes + (parent_begin + n_pairs + 2 * tid) * 32); - #pragma unroll - for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, left[i]); - - const uint64_t *right = reinterpret_cast( - nodes + (parent_begin + n_pairs + 2 * tid + 1) * 32); - #pragma unroll - for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, right[i]); - - finalize_keccak256(st, rate_pos, nodes + (parent_begin + tid) * 32); -} diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 9b716ff13..30554c15e 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -475,32 +475,6 @@ where } } -/// Interleave column-major data into a flat row-major buffer + its column -/// count. Used only by the cuda fast path to materialize the GPU-expanded -/// columns in the row-major layout the table expects (CPU paths read the -/// already-row-major trace directly, with no transpose). -#[cfg(feature = "cuda")] -fn columns_to_row_major( - columns: &[Vec>], -) -> (Vec>, usize) { - let num_cols = columns.len(); - let n = if num_cols > 0 { columns[0].len() } else { 0 }; - // All columns must be the same length; otherwise `col[row]` below indexes - // out of bounds. The producers (CPU/GPU LDE) always emit uniform columns — - // this guards against a future regression cheaply (debug builds only). - debug_assert!( - columns.iter().all(|c| c.len() == n), - "columns_to_row_major requires all columns to have equal length" - ); - let mut data = Vec::with_capacity(n * num_cols); - for row in 0..n { - for col in columns { - data.push(col[row].clone()); - } - } - (data, num_cols) -} - /// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one /// leaf per row, where each row is read at `reverse_index(row_idx)` and the /// columns are concatenated as big-endian bytes before hashing. From 91383bf1a11d27889ccd260c9c6a1e7c34acbb1d Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 29 Jun 2026 17:45:01 -0300 Subject: [PATCH 4/5] Consolidate row-major LDE pipeline; guard keccak num_rows Extract coset_lde_row_major_inner shared by the base and ext3 _keep entry points (they differed only by m vs m*3 and the handle type), removing ~110 lines of drift-prone duplication. Add debug_assert!(num_rows >= 2) to launch_keccak_base_row_major: the kernel shifts by (64 - log_num_rows), UB at num_rows==1, matching the guard in launch_keccak_base. --- crypto/math-cuda/src/lde.rs | 181 ++++++++++++++---------------------- 1 file changed, 69 insertions(+), 112 deletions(-) diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 8c319f764..164267684 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -318,6 +318,11 @@ fn launch_keccak_base_row_major( // threads/block, which exceeds the per-block register budget and fails the // launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES — silently dropping the whole // R1 GPU path to the CPU fallback (no device handle for rounds 2-4). + // + // The kernel derives the bit-reversed row as `__brevll(tid) >> (64 - log_num_rows)`; + // a 64-bit shift is UB, so reject `num_rows < 2` (`log_num_rows == 0`), matching + // the `debug_assert!` guard in `launch_keccak_base`. + debug_assert!(num_rows >= 2, "row-major keccak requires num_rows >= 2"); let cfg = keccak_launch_cfg(num_rows); unsafe { stream @@ -368,48 +373,53 @@ fn launch_row_to_col_major( Ok(dst) } -/// Row-major LDE + Keccak + Merkle, all on-device. +/// Shared row-major LDE + Keccak + Merkle pipeline for the base and ext3 paths. +/// +/// `total_cols` is the number of base-field columns in the row-major layout: +/// `m` for base, `m * 3` for ext3. Because `Fp3 = [u64; 3]`, the three ext3 +/// components are just three adjacent base-field columns, so the same row-major +/// NTT and Keccak kernels process all of them simultaneously — no de-interleave. /// -/// Input: `row_major` is a flat `n * m` slice in row-major order. -/// Returns (merkle_nodes, GpuLdeBase handle, row-major LDE Vec). /// Single H2D, row-major NTT, single D2H — no CPU-side extract or transpose. -/// The returned handle is column-major (as required by downstream GPU kernels): -/// after D2H, `buf` is transposed on-device to column-major for the handle. -pub fn coset_lde_row_major_with_merkle_tree_keep( +/// Returns (merkle_nodes, column-major device buffer, row-major LDE Vec). The +/// buffer is transposed to column-major (as required by the downstream GPU +/// kernels DEEP/barycentric); callers wrap it in the appropriate LDE handle. +fn coset_lde_row_major_inner( row_major: &[u64], n: usize, - m: usize, + total_cols: usize, blowup_factor: usize, weights: &[u64], -) -> Result<(Vec, GpuLdeBase, Vec)> { - assert_eq!(row_major.len(), n * m); + what: &str, +) -> Result<(Vec, CudaSlice, Vec)> { + assert_eq!(row_major.len(), n * total_cols); assert!(n.is_power_of_two()); assert_eq!(weights.len(), n); assert!(blowup_factor.is_power_of_two()); let lde_size = n * blowup_factor; - assert_u32_domain(lde_size, "coset_lde_row_major lde_size"); + assert_u32_domain(lde_size, what); let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(lde_size); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; let n_u64 = n as u64; let lde_u64 = lde_size as u64; - let m_u64 = m as u64; + let cols_u64 = total_cols as u64; let be = backend()?; let stream = be.next_stream(); - // H2D into a zeroed lde_size*m buffer; only the first n*m rows carry data, - // the remainder are already zero (zero-padding for LDE expansion). - let mut buf = stream.alloc_zeros::(lde_size * m)?; - stream.memcpy_htod(row_major, &mut buf.slice_mut(0..n * m))?; + // H2D into a zeroed lde_size*total_cols buffer; only the first n*total_cols + // rows carry data, the remainder are already zero (zero-padding for LDE). + let mut buf = stream.alloc_zeros::(lde_size * total_cols)?; + stream.memcpy_htod(row_major, &mut buf.slice_mut(0..n * total_cols))?; let inv_tw = be.inv_twiddles_for(log_n)?; let fwd_tw = be.fwd_twiddles_for(log_lde)?; let weights_dev = stream.clone_htod(weights)?; // iNTT: bit-reverse rows → per-level DIT. - launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, n_u64, log_n, m_u64)?; + launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, n_u64, log_n, cols_u64)?; run_row_major_ntt_body( stream.as_ref(), be, @@ -417,14 +427,14 @@ pub fn coset_lde_row_major_with_merkle_tree_keep( inv_tw.as_ref(), n_u64, log_n, - m_u64, + cols_u64, )?; - // Coset weights: one weight per row, broadcast across all m columns. - launch_pointwise_mul_row_major(stream.as_ref(), be, &mut buf, &weights_dev, n_u64, m_u64)?; + // Coset weights: one weight per row, broadcast across all columns. + launch_pointwise_mul_row_major(stream.as_ref(), be, &mut buf, &weights_dev, n_u64, cols_u64)?; // Forward NTT at lde_size. - launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, lde_u64, log_lde, m_u64)?; + launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, lde_u64, log_lde, cols_u64)?; run_row_major_ntt_body( stream.as_ref(), be, @@ -432,10 +442,10 @@ pub fn coset_lde_row_major_with_merkle_tree_keep( fwd_tw.as_ref(), lde_u64, log_lde, - m_u64, + cols_u64, )?; - // Keccak + Merkle on-device. + // Keccak + Merkle on-device. Each leaf reads `total_cols` consecutive u64s. let mut nodes_dev = unsafe { stream.alloc::(nodes_bytes) }?; let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(lde_size); { @@ -444,7 +454,7 @@ pub fn coset_lde_row_major_with_merkle_tree_keep( stream.as_ref(), be, &buf, - m_u64, + cols_u64, lde_u64, log_lde, &mut leaves_view, @@ -457,11 +467,11 @@ pub fn coset_lde_row_major_with_merkle_tree_keep( let lde_out = { let staging_slot = be.pinned_staging(); let mut staging = staging_slot.lock().unwrap(); - staging.ensure_capacity(lde_size * m, &be.ctx)?; - let pinned = unsafe { staging.as_mut_slice(lde_size * m) }; + staging.ensure_capacity(lde_size * total_cols, &be.ctx)?; + let pinned = unsafe { staging.as_mut_slice(lde_size * total_cols) }; stream.memcpy_dtoh(&buf, pinned)?; stream.synchronize()?; - let out = pinned[..lde_size * m].to_vec(); + let out = pinned[..lde_size * total_cols].to_vec(); drop(staging); out }; @@ -471,16 +481,39 @@ pub fn coset_lde_row_major_with_merkle_tree_keep( // Transpose row-major buf → column-major for the handle. Downstream kernels // (DEEP, barycentric) expect buf[c * lde_size + r] (column-major). - let col_major_dev = launch_row_to_col_major(&stream, be, &buf, lde_size, m, lde_u64)?; + let col_major_dev = launch_row_to_col_major(&stream, be, &buf, lde_size, total_cols, lde_u64)?; // Synchronize before returning: the handle crosses stream boundaries — downstream // consumers call be.next_stream() and read handle.buf on a different stream. // Without this, a barycentric or DEEP kernel can start before the transpose finishes. stream.synchronize()?; + Ok((nodes_out, col_major_dev, lde_out)) +} + +/// Row-major LDE + Keccak + Merkle, all on-device. +/// +/// Input: `row_major` is a flat `n * m` slice in row-major order. +/// Returns (merkle_nodes, GpuLdeBase handle, row-major LDE Vec). +/// The returned handle is column-major (as required by downstream GPU kernels). +pub fn coset_lde_row_major_with_merkle_tree_keep( + row_major: &[u64], + n: usize, + m: usize, + blowup_factor: usize, + weights: &[u64], +) -> Result<(Vec, GpuLdeBase, Vec)> { + let (nodes_out, col_major_dev, lde_out) = coset_lde_row_major_inner( + row_major, + n, + m, + blowup_factor, + weights, + "coset_lde_row_major lde_size", + )?; let handle = GpuLdeBase { buf: Arc::new(col_major_dev), m, - lde_size, + lde_size: n * blowup_factor, }; Ok((nodes_out, handle, lde_out)) } @@ -502,94 +535,18 @@ pub fn coset_lde_ext3_row_major_with_merkle_tree_keep( blowup_factor: usize, weights: &[u64], ) -> Result<(Vec, GpuLdeExt3, Vec)> { - let m3 = m * 3; - assert_eq!(row_major.len(), n * m3); - assert!(n.is_power_of_two()); - assert_eq!(weights.len(), n); - assert!(blowup_factor.is_power_of_two()); - let lde_size = n * blowup_factor; - assert_u32_domain(lde_size, "coset_lde_ext3_row_major lde_size"); - - let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(lde_size); - let log_n = n.trailing_zeros() as u64; - let log_lde = lde_size.trailing_zeros() as u64; - let n_u64 = n as u64; - let lde_u64 = lde_size as u64; - let m3_u64 = m3 as u64; - - let be = backend()?; - let stream = be.next_stream(); - - let mut buf = stream.alloc_zeros::(lde_size * m3)?; - stream.memcpy_htod(row_major, &mut buf.slice_mut(0..n * m3))?; - - let inv_tw = be.inv_twiddles_for(log_n)?; - let fwd_tw = be.fwd_twiddles_for(log_lde)?; - let weights_dev = stream.clone_htod(weights)?; - - // iNTT + coset weights + forward NTT — same row-major kernels as base-field - // but with m3 = m*3 (all 3 components processed simultaneously). - launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, n_u64, log_n, m3_u64)?; - run_row_major_ntt_body( - stream.as_ref(), - be, - &mut buf, - inv_tw.as_ref(), - n_u64, - log_n, - m3_u64, - )?; - launch_pointwise_mul_row_major(stream.as_ref(), be, &mut buf, &weights_dev, n_u64, m3_u64)?; - launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, lde_u64, log_lde, m3_u64)?; - run_row_major_ntt_body( - stream.as_ref(), - be, - &mut buf, - fwd_tw.as_ref(), - lde_u64, - log_lde, - m3_u64, + let (nodes_out, col_major_dev, lde_out) = coset_lde_row_major_inner( + row_major, + n, + m * 3, + blowup_factor, + weights, + "coset_lde_ext3_row_major lde_size", )?; - - // Keccak: same row-major kernel — each leaf reads m3 consecutive u64s (= m ext3 elements). - let mut nodes_dev = unsafe { stream.alloc::(nodes_bytes) }?; - let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(lde_size); - { - let mut leaves_view = nodes_dev.slice_mut(leaves_offset..leaves_offset + lde_size * 32); - launch_keccak_base_row_major( - stream.as_ref(), - be, - &buf, - m3_u64, - lde_u64, - log_lde, - &mut leaves_view, - )?; - } - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; - - let lde_out = { - let staging_slot = be.pinned_staging(); - let mut staging = staging_slot.lock().unwrap(); - staging.ensure_capacity(lde_size * m3, &be.ctx)?; - let pinned = unsafe { staging.as_mut_slice(lde_size * m3) }; - stream.memcpy_dtoh(&buf, pinned)?; - stream.synchronize()?; - let out = pinned[..lde_size * m3].to_vec(); - drop(staging); - out - }; - - let mut nodes_out = vec![0u8; nodes_bytes]; - d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, &mut nodes_out)?; - - let col_major_dev = launch_row_to_col_major(&stream, be, &buf, lde_size, m3, lde_u64)?; - stream.synchronize()?; - let handle = GpuLdeExt3 { buf: Arc::new(col_major_dev), m, - lde_size, + lde_size: n * blowup_factor, }; Ok((nodes_out, handle, lde_out)) } From b0a38f9ec19727fc63e5affdb908d13889c52542 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 29 Jun 2026 18:02:06 -0300 Subject: [PATCH 5/5] Fix stale R2 composition-LDE assertion in gpu_path_fires_end_to_end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The assert checked gpu_parts_lde_calls() > 0 with a comment claiming branch/shift tables are degree-3 — both false: fib_iterative_1M tables all have number_of_parts <= 2, and the common degree-2 case fires the fused two-halves path (gpu_extend_halves_calls), counted separately from the parts>2 path (gpu_parts_lde_calls) since #700. Assert on the sum so either composition-LDE path satisfies it. Validated on RTX 5090 / CUDA 13.1: make test-math-cuda 78/78, make test-cuda-integration green, proof verifies. --- prover/tests/cuda_path_integration.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs index 0f7c1f3c7..cf9bc742c 100644 --- a/prover/tests/cuda_path_integration.rs +++ b/prover/tests/cuda_path_integration.rs @@ -12,7 +12,8 @@ use lambda_vm_prover::test_utils::asm_elf_bytes; use lambda_vm_prover::{prove, verify}; use stark::gpu_lde::{ gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls, - gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls, reset_all_gpu_call_counters, + gpu_extend_halves_calls, gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls, + reset_all_gpu_call_counters, }; #[test] @@ -36,10 +37,15 @@ fn gpu_path_fires_end_to_end() { // path. assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire"); - // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's - // `number_of_parts > 2`. The branch and shift tables have degree-3 - // transition constraints, so this triggers on any non-trivial prove. - assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire"); + // R2 GPU composition-poly LDE. Fires via one of two paths depending on the + // AIR's `number_of_parts`: the fused two-halves quotient decomposition for + // the common degree-2 case (`== 2`, counted by `gpu_extend_halves_calls`), + // or the batched parts LDE for `> 2` (counted by `gpu_parts_lde_calls`). + // fib_iterative_1M only exercises the degree-2 path, so assert on either. + assert!( + gpu_extend_halves_calls() + gpu_parts_lde_calls() > 0, + "R2 GPU composition LDE did not fire (neither two-halves d2 nor parts>2 path)" + ); // R2 comp-poly Merkle tree build, paired with the parts LDE above. assert!(