From fe50a9e327cb49c41efe68340439e2525e135484 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 19 Jun 2026 16:28:39 -0300 Subject: [PATCH 01/19] refactor(stark): extract Merkle commitment into a commitment module Unifies the two near-identical bit-reversed leaf hashers (per-row + per-row-pair) into one keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf), and the two near-identical commit fns (commit_columns_bit_reversed + commit_composition_polynomial) into commit_bit_reversed(columns, rows_per_leaf), in a new crypto/stark/src/commitment.rs. Removes them from the IsStarkProver trait (they used no self). prover.rs re-exports the named leaf hashers for the math-cuda GPU parity test. Byte-identical (stark 128/128). --- crypto/stark/src/commitment.rs | 147 ++++++++++++++++++++++++++ crypto/stark/src/lib.rs | 1 + crypto/stark/src/prover.rs | 183 ++------------------------------- 3 files changed, 156 insertions(+), 175 deletions(-) create mode 100644 crypto/stark/src/commitment.rs diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs new file mode 100644 index 000000000..979c0cb7f --- /dev/null +++ b/crypto/stark/src/commitment.rs @@ -0,0 +1,147 @@ +//! Merkle-tree commitment to bit-reversed, column-major LDE evaluations. +//! +//! This is the commitment layer the prover uses for the main/aux trace LDEs and +//! the composition-polynomial parts. It is decoupled from `IsStarkProver`: the +//! prover only orchestrates *when* to commit; the *how* (leaf layout, bit-reverse +//! permutation, Keccak hashing, tree build) lives here. +//! +//! ## Leaf layout +//! +//! For each leaf `i` we hash `rows_per_leaf` consecutive (bit-reversed) rows, +//! big-endian-concatenated column-by-column: +//! +//! ```text +//! leaf(i) = keccak( col_0[br(R·i)]‖col_1[br(R·i)]‖… ‖ col_0[br(R·i+1)]‖… ‖ … ) +//! where R = rows_per_leaf and br(j) = reverse_index(j, num_rows) +//! ``` +//! +//! - `rows_per_leaf == 1`: one row per leaf — main/aux trace LDE columns. +//! - `rows_per_leaf == 2`: a row pair per leaf — composition-polynomial parts +//! (Round 2/3), where leaf `i` hashes rows `2i` and `2i+1`. +//! +//! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept +//! exactly as before, so commitments are byte-identical to the previous +//! per-row / per-row-pair implementations. + +use math::fft::bit_reversing::reverse_index; +use math::field::element::FieldElement; +use math::field::traits::IsField; +use math::traits::{AsBytes, ByteConversion}; + +#[cfg(feature = "parallel")] +use rayon::prelude::{IntoParallelIterator, ParallelIterator}; + +use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; + +/// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment, +/// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf. +/// +/// Returns one `Commitment` per leaf (`columns[0].len() / rows_per_leaf` leaves), +/// or an empty `Vec` when there is nothing to hash. See the module docs for the +/// exact leaf byte layout. This is the single code path behind both the per-row +/// ([`keccak_leaves_bit_reversed`]) and per-row-pair +/// ([`keccak_leaves_row_pair_bit_reversed`]) commitments. +pub fn keccak_leaves_bit_reversed_grouped( + columns: &[Vec>], + rows_per_leaf: usize, +) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + if columns.is_empty() || columns[0].is_empty() { + return Vec::new(); + } + + let num_rows = columns[0].len(); + let byte_len = as ByteConversion>::BYTE_LEN; + + debug_assert!( + num_rows.is_power_of_two(), + "num_rows must be a power of two for reverse_index" + ); + debug_assert!( + rows_per_leaf >= 1 && num_rows % rows_per_leaf == 0, + "num_rows must be a multiple of rows_per_leaf" + ); + + let num_leaves = num_rows / rows_per_leaf; + let total_bytes = rows_per_leaf * columns.len() * byte_len; + + // Leaf `i`: the `rows_per_leaf` bit-reversed rows starting at `R·i`, each row + // written column-by-column in big-endian, then hashed once. + let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment { + let mut offset = 0; + for k in 0..rows_per_leaf { + let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64); + for col in columns { + col[br].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + } + BatchedMerkleTreeBackend::::hash_bytes(buf) + }; + + // Per-thread buffer reuse (map_init) avoids millions of small allocations. + #[cfg(feature = "parallel")] + let result: Vec = (0..num_leaves) + .into_par_iter() + .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i)) + .collect(); + + #[cfg(not(feature = "parallel"))] + let result: Vec = { + let mut buf = vec![0u8; total_bytes]; + (0..num_leaves).map(|i| hash_leaf(&mut buf, i)).collect() + }; + + result +} + +/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Used for the +/// main/aux trace LDE commitments. Thin wrapper over +/// [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`. +/// +/// Kept as a named public function because the GPU parity tests in dependent +/// crates compare against this exact code path. +pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + keccak_leaves_bit_reversed_grouped(columns, 1) +} + +/// Per-row-pair Keccak-256 leaf hashes (leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1`). Used for the composition-polynomial-parts commitment. Thin wrapper +/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 2`. +pub fn keccak_leaves_row_pair_bit_reversed(parts: &[Vec>]) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + keccak_leaves_bit_reversed_grouped(parts, 2) +} + +/// Builds the Merkle tree committing to `columns`' bit-reversed, column-major LDE +/// evaluations, grouping `rows_per_leaf` rows per leaf, and returns the tree and +/// its root. `None` when there is nothing to commit. +/// +/// Replaces the prover's former `commit_columns_bit_reversed` (`rows_per_leaf = 1`) +/// and `commit_composition_polynomial` (`rows_per_leaf = 2`). +pub fn commit_bit_reversed( + columns: &[Vec>], + rows_per_leaf: usize, +) -> Option<(BatchedMerkleTree, Commitment)> +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + if columns.is_empty() || columns[0].is_empty() { + return None; + } + let hashed_leaves = keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; + let root = tree.root; + Some((tree, root)) +} diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index e9f6a1cda..87236c5f9 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -5,6 +5,7 @@ compile_error!("the `disk-spill` feature requires memmap2, which does not compil #[cfg(feature = "debug-checks")] pub mod bus_debug; +pub mod commitment; pub mod constraints; pub mod context; pub mod debug; diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 46261103e..d6166f745 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -43,6 +43,8 @@ use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof}; use super::trace::TraceTable; use super::traits::AIR; +pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed}; + /// A triple of (AIR, TraceTable, PublicInputs) for proving. type AirTracePair<'a, Field, FieldExtension, PI> = ( &'a dyn AIR, @@ -389,128 +391,6 @@ where } } -/// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one -/// leaf per row, where each row is read at `reverse_index(row_idx)` and the -/// columns are concatenated as big-endian bytes before hashing. -/// -/// Returns `Vec` with the same length as `columns[0]`. Exposed -/// (instead of being a closure inside `commit_columns_bit_reversed`) so -/// parity tests in dependent crates can compare against the same code path -/// the prover uses. -pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec -where - E: IsField, - FieldElement: AsBytes + Sync + Send + ByteConversion, -{ - if columns.is_empty() || columns[0].is_empty() { - return Vec::new(); - } - - let num_rows = columns[0].len(); - let num_cols = columns.len(); - let byte_len = as ByteConversion>::BYTE_LEN; - - debug_assert!( - num_rows.is_power_of_two(), - "num_rows must be a power of two for reverse_index" - ); - - let total_bytes = num_cols * byte_len; - - let hash_leaf = |buf: &mut [u8], row_idx: usize| -> Commitment { - let br_idx = reverse_index(row_idx, num_rows as u64); - for col_idx in 0..num_cols { - columns[col_idx][br_idx] - .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]); - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }; - - #[cfg(feature = "parallel")] - let iter = (0..num_rows).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..num_rows; - - // Per-thread buffer reuse: map_init allocates one buffer per Rayon thread, - // eliminating millions of small heap allocations under parallel contention. - #[cfg(feature = "parallel")] - let result: Vec = iter - .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i)) - .collect(); - - #[cfg(not(feature = "parallel"))] - let result: Vec = { - let mut buf = vec![0u8; total_bytes]; - iter.map(|i| hash_leaf(&mut buf, i)).collect() - }; - - result -} - -/// Compute Keccak-256 leaf hashes for `commit_composition_polynomial`: one -/// leaf per row-pair, where leaf `i` hashes the BE concatenation of -/// `parts[..][br_0] ++ parts[..][br_1]` with -/// `br_k = reverse_index(2*i + k, num_rows)`. -/// -/// Returns `Vec` of length `parts[0].len() / 2`. -pub fn keccak_leaves_row_pair_bit_reversed(parts: &[Vec>]) -> Vec -where - E: IsField, - FieldElement: AsBytes + Sync + Send + ByteConversion, -{ - let num_parts = parts.len(); - if num_parts == 0 { - return Vec::new(); - } - let num_rows = parts[0].len(); - if num_rows == 0 { - return Vec::new(); - } - - let num_leaves = num_rows / 2; - debug_assert!( - num_rows.is_power_of_two(), - "num_rows must be a power of two for reverse_index" - ); - - let byte_len = as ByteConversion>::BYTE_LEN; - - let total_bytes = 2 * num_parts * byte_len; - - let hash_leaf_pair = |buf: &mut [u8], leaf_idx: usize| -> Commitment { - let br_0 = reverse_index(2 * leaf_idx, num_rows as u64); - let br_1 = reverse_index(2 * leaf_idx + 1, num_rows as u64); - let mut offset = 0; - for part in parts.iter() { - part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]); - offset += byte_len; - } - for part in parts.iter() { - part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]); - offset += byte_len; - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }; - - #[cfg(feature = "parallel")] - let iter = (0..num_leaves).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..num_leaves; - - #[cfg(feature = "parallel")] - let result: Vec = iter - .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf_pair(buf, i)) - .collect(); - - #[cfg(not(feature = "parallel"))] - let result: Vec = { - let mut buf = vec![0u8; total_bytes]; - iter.map(|i| hash_leaf_pair(&mut buf, i)).collect() - }; - - result -} - /// The functionality of a STARK prover providing methods to run the STARK Prove protocol /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html /// The default implementation is complete and is compatible with Stone prover @@ -529,29 +409,6 @@ pub trait IsStarkProver< FieldElement: math::traits::ByteConversion, FieldElement: math::traits::ByteConversion, { - /// Builds a Merkle tree commitment from column-major LDE evaluations with - /// bit-reverse permutation, without cloning the full evaluation matrix. - /// - /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...` - /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle - /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow, - /// but avoids allocating the cloned and transposed matrices entirely. - fn commit_columns_bit_reversed( - columns: &[Vec>], - ) -> Option<(BatchedMerkleTree, Commitment)> - where - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - E: IsField, - { - if columns.is_empty() || columns[0].is_empty() { - return None; - } - let hashed_leaves = keccak_leaves_bit_reversed(columns); - let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; - let root = tree.root; - Some((tree, root)) - } - /// Compute the LDE commitment for a subset of columns from a trace (for testing). /// /// This helper computes the same commitment the prover generates internally, @@ -574,7 +431,7 @@ pub trait IsStarkProver< let twiddles = LdeTwiddles::new(&domain); let evals = Self::compute_lde_from_columns_cached::(&precomputed, &domain, &twiddles); - let (_, commitment) = Self::commit_columns_bit_reversed(&evals)?; + let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 1)?; Some(commitment) } @@ -728,7 +585,7 @@ pub trait IsStarkProver< let commit = match precomputed { None => { #[allow(unused_mut)] - let (mut tree, root) = Self::commit_columns_bit_reversed(&columns) + let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { @@ -740,11 +597,11 @@ pub trait IsStarkProver< Some((expected_precomputed_root, num_cols)) => { #[allow(unused_mut)] let (mut precomputed_tree, precomputed_root) = - Self::commit_columns_bit_reversed(&columns[..num_cols]) + crate::commitment::commit_bit_reversed(&columns[..num_cols], 1) .ok_or(ProvingError::EmptyCommitment)?; #[allow(unused_mut)] let (mut mult_tree, mult_root) = - Self::commit_columns_bit_reversed(&columns[num_cols..]) + crate::commitment::commit_bit_reversed(&columns[num_cols..], 1) .ok_or(ProvingError::EmptyCommitment)?; if precomputed_root != expected_precomputed_root { return Err(ProvingError::PrecomputedCommitmentMismatch); @@ -866,30 +723,6 @@ pub trait IsStarkProver< } } - /// Returns the Merkle tree and the commitment to the evaluations of the parts of the - /// composition polynomial. - fn commit_composition_polynomial( - lde_composition_poly_parts_evaluations: &[Vec>], - ) -> Option<(BatchedMerkleTree, Commitment)> - where - FieldElement: AsBytes + Sync + Send, - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - { - let num_parts = lde_composition_poly_parts_evaluations.len(); - if num_parts == 0 { - return None; - } - let num_rows = lde_composition_poly_parts_evaluations[0].len(); - if num_rows == 0 { - return None; - } - let hashed_leaves = - keccak_leaves_row_pair_bit_reversed(lde_composition_poly_parts_evaluations); - let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; - let root = tree.root; - Some((tree, root)) - } - /// Algebraically decompose H(x) = H₀(x²) + x·H₁(x²) on the LDE coset, then /// extend each half to the full LDE domain. This replaces the expensive /// iFFT(2N) + break_in_parts + FFT(2N)×2 pipeline with: @@ -1097,7 +930,7 @@ pub trait IsStarkProver< let root = tree.root; (tree, root) } - None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) + None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, 2) .ok_or(ProvingError::EmptyCommitment)?, }; #[cfg(feature = "instruments")] @@ -1983,7 +1816,7 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let t_sub = Instant::now(); #[allow(unused_mut)] - let (mut tree, root) = Self::commit_columns_bit_reversed(&columns) + let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "instruments")] crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); From 5cd227ab642bf2cb0a4c09d3571b9041f21989dd Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 19 Jun 2026 16:55:36 -0300 Subject: [PATCH 02/19] perf(stark): row-pair the trace commitment (one Merkle path per query) Every FRI query opens a value and its symmetric counterpart (LDE positions 2*iota, 2*iota+1). The composition-poly commitment already grouped that pair into one leaf; the trace (main/aux/precomputed) committed one row per leaf and opened TWO leaves (proof + proof_sym) per query. Commit the trace with rows_per_leaf=2 too, so each query opens ONE leaf/path; drop the now-redundant proof_sym from PolynomialOpenings (also shrinks the composition opening, which stored the path twice). verify_opening_pair now reconstructs the paired leaf and verifies once (mirrors verify_composition_poly_opening); the dead verify_opening is removed. Halves trace Merkle authentication-path data per query (smaller proofs + less verifier hashing). stark 128/128 (prove+verify). NOTE: proof FORMAT change (not byte-identical). cuda follow-up: the GPU trace leaf+tree builders (gpu_lde::try_expand_leaf_and_tree_batched_keep/_ext3_keep + math-cuda kernels) still build 1-row leaves and must switch to the row-pair pattern (the GPU composition builder already pairs) or cuda proofs will fail verification. --- crypto/stark/src/commitment.rs | 2 +- crypto/stark/src/proof/stark.rs | 7 ++++++- crypto/stark/src/prover.rs | 32 +++++++++++++++-------------- crypto/stark/src/verifier.rs | 36 +++++++++------------------------ 4 files changed, 34 insertions(+), 43 deletions(-) diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs index 979c0cb7f..59d68aa7c 100644 --- a/crypto/stark/src/commitment.rs +++ b/crypto/stark/src/commitment.rs @@ -61,7 +61,7 @@ where "num_rows must be a power of two for reverse_index" ); debug_assert!( - rows_per_leaf >= 1 && num_rows % rows_per_leaf == 0, + rows_per_leaf >= 1 && num_rows.is_multiple_of(rows_per_leaf), "num_rows must be a multiple of rows_per_leaf" ); diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs index 1751d60fe..851c0b37a 100644 --- a/crypto/stark/src/proof/stark.rs +++ b/crypto/stark/src/proof/stark.rs @@ -10,9 +10,14 @@ use crate::{ #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[serde(bound = "")] +/// Opening of a bit-reversed, row-paired commitment at one FRI query. +/// +/// The queried row and its symmetric counterpart (LDE positions `2·iota`, +/// `2·iota+1`) are committed together as a single leaf at position `iota`, so one +/// Merkle `proof` authenticates both `evaluations` (the row) and +/// `evaluations_sym` (its symmetric). Same layout used for trace and composition. pub struct PolynomialOpenings { pub proof: Proof, - pub proof_sym: Proof, pub evaluations: Vec>, pub evaluations_sym: Vec>, } diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index d6166f745..533f49154 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -11,7 +11,7 @@ use math::fft::errors::FFTError; use log::info; use math::field::traits::{IsField, IsSubFieldOf}; use math::spill_safe::SpillSafe; -use math::traits::{AsBytes, ByteConversion}; +use math::traits::AsBytes; use math::{ field::{element::FieldElement, traits::IsFFTField}, polynomial::Polynomial, @@ -33,7 +33,10 @@ use crate::storage_mode::StorageMode; use crate::table::Table; use crate::trace::LDETraceTable; -use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; +use super::config::{BatchedMerkleTree, Commitment}; +// `BatchedMerkleTreeBackend` is only named by the GPU leaf+tree builders. +#[cfg(feature = "cuda")] +use super::config::BatchedMerkleTreeBackend; use super::constraints::evaluator::ConstraintEvaluator; use super::domain::{Domain, DomainConstants}; use super::fri::fri_decommit::FriDecommitment; @@ -431,7 +434,7 @@ pub trait IsStarkProver< let twiddles = LdeTwiddles::new(&domain); let evals = Self::compute_lde_from_columns_cached::(&precomputed, &domain, &twiddles); - let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 1)?; + let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 2)?; Some(commitment) } @@ -585,7 +588,7 @@ pub trait IsStarkProver< let commit = match precomputed { None => { #[allow(unused_mut)] - let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1) + let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { @@ -597,11 +600,11 @@ pub trait IsStarkProver< Some((expected_precomputed_root, num_cols)) => { #[allow(unused_mut)] let (mut precomputed_tree, precomputed_root) = - crate::commitment::commit_bit_reversed(&columns[..num_cols], 1) + crate::commitment::commit_bit_reversed(&columns[..num_cols], 2) .ok_or(ProvingError::EmptyCommitment)?; #[allow(unused_mut)] let (mut mult_tree, mult_root) = - crate::commitment::commit_bit_reversed(&columns[num_cols..], 1) + crate::commitment::commit_bit_reversed(&columns[num_cols..], 2) .ok_or(ProvingError::EmptyCommitment)?; if precomputed_root != expected_precomputed_root { return Err(ProvingError::PrecomputedCommitmentMismatch); @@ -1357,8 +1360,7 @@ pub trait IsStarkProver< .collect(); PolynomialOpenings { - proof: proof.clone(), - proof_sym: proof, + proof, evaluations: lde_composition_poly_parts_evaluation .clone() .into_iter() @@ -1388,13 +1390,13 @@ pub trait IsStarkProver< G: Fn(usize) -> Vec>, { let domain_size = domain.lde_roots_of_unity_coset.len() as u64; - let index = challenge * 2; - let index_sym = challenge * 2 + 1; + // Rows `2·challenge` and `2·challenge+1` are committed together as the + // single leaf at position `challenge`; one Merkle path authenticates both + // the queried row and its symmetric counterpart. PolynomialOpenings { - proof: tree.get_proof_by_pos(index).unwrap(), - proof_sym: tree.get_proof_by_pos(index_sym).unwrap(), - evaluations: gather(reverse_index(index, domain_size)), - evaluations_sym: gather(reverse_index(index_sym, domain_size)), + proof: tree.get_proof_by_pos(challenge).unwrap(), + evaluations: gather(reverse_index(challenge * 2, domain_size)), + evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)), } } @@ -1816,7 +1818,7 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let t_sub = Instant::now(); #[allow(unused_mut)] - let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1) + let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "instruments")] crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 8091c8b32..3ba3b7c26 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -301,25 +301,11 @@ pub trait IsStarkVerifier< domain.lde_coset_element(reverse_index(raw, domain.lde_length as u64)) } - /// Verifies the validity of the opening proof. - fn verify_opening( - proof: &Proof, - root: &Commitment, - index: usize, - value: &[FieldElement], - ) -> bool - where - FieldElement: AsBytes + Sync + Send, - FieldElement: AsBytes + Sync + Send, - E: IsField, - Field: IsSubFieldOf, - { - proof.verify::>(root, index, &value.to_owned()) - } - - /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings - /// of a `PolynomialOpenings` against the given `root` at iota positions - /// `iota*2` and `iota*2 + 1`. + /// Verify a row-paired `PolynomialOpenings` against `root`. The row pair + /// (`2·iota`, `2·iota+1`) is committed as the single leaf at position `iota`, + /// so one Merkle path authenticates both rows: reconstruct the leaf from + /// `evaluations ‖ evaluations_sym` and verify once. (Same as the composition + /// opening check.) fn verify_opening_pair( opening: &PolynomialOpenings, root: &Commitment, @@ -331,13 +317,11 @@ pub trait IsStarkVerifier< E: IsField, Field: IsSubFieldOf, { - Self::verify_opening::(&opening.proof, root, iota * 2, &opening.evaluations) - && Self::verify_opening::( - &opening.proof_sym, - root, - iota * 2 + 1, - &opening.evaluations_sym, - ) + let mut value = opening.evaluations.clone(); + value.extend_from_slice(&opening.evaluations_sym); + opening + .proof + .verify::>(root, iota, &value) } /// Verify opening Open(tⱼ(D_LDE), 𝜐) and Open(tⱼ(D_LDE), -𝜐) for all trace polynomials tⱼ, From f26a13a9fa826b5ac374a5d22793fdf840a61ff4 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 19 Jun 2026 17:31:36 -0300 Subject: [PATCH 03/19] =?UTF-8?q?refactor(stark):=20prover=20cleanup=20?= =?UTF-8?q?=E2=80=94=20par=20helpers,=20ROWS=5FPER=5FLEAF,=20error=20propa?= =?UTF-8?q?gation,=20doc/log=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crypto/stark/src/commitment.rs | 5 ++ crypto/stark/src/gpu_lde.rs | 4 +- crypto/stark/src/instruments.rs | 6 +- crypto/stark/src/par.rs | 52 ++++++++++++++ crypto/stark/src/prover.rs | 121 ++++++++++++-------------------- 5 files changed, 107 insertions(+), 81 deletions(-) diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs index 59d68aa7c..a55d050b2 100644 --- a/crypto/stark/src/commitment.rs +++ b/crypto/stark/src/commitment.rs @@ -33,6 +33,11 @@ use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; +/// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the +/// composition-polynomial commitment: the row-pair leaf packing the FRI openings +/// rely on (leaf `i` hashes rows `2i` and `2i+1`). +pub const ROWS_PER_LEAF: usize = 2; + /// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment, /// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf. /// diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 36756b40b..be6d5ccfe 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -733,8 +733,8 @@ where /// host-side ext3 LDE eval Vecs produced by /// [`try_evaluate_parts_on_lde_gpu_keep`] (or the CPU path). Uses the same /// row-pair leaf pattern as the CPU -/// `commit_composition_polynomial`: each leaf hashes 2 consecutive -/// bit-reversed rows. +/// `commit_bit_reversed` (composition-polynomial commit path): each leaf hashes +/// 2 consecutive bit-reversed rows. /// /// Returns `None` to fall through to the CPU path when the type or size /// conditions don't hold; returns `None` on a math-cuda `Err` so the caller diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs index 16ff95082..aa5cc5436 100644 --- a/crypto/stark/src/instruments.rs +++ b/crypto/stark/src/instruments.rs @@ -33,7 +33,7 @@ pub struct TableSubOps { pub constraints: Duration, /// decompose_and_extend_d2 pub comp_decompose: Duration, - /// commit_composition_polynomial + /// commit_bit_reversed (composition-polynomial commit step) pub comp_commit: Duration, /// Round 3: barycentric OOD evaluation pub ood: Duration, @@ -52,11 +52,11 @@ pub struct TableSubOps { pub struct Round1SubOps { /// Main trace: expand_columns_to_lde (LDE/FFT) pub main_lde: Duration, - /// Main trace: commit_columns_bit_reversed (Merkle) + /// Main trace: commit_bit_reversed (Merkle) pub main_merkle: Duration, /// Aux trace: expand_columns_to_lde (LDE/FFT) pub aux_lde: Duration, - /// Aux trace: commit_columns_bit_reversed (Merkle) + /// Aux trace: commit_bit_reversed (Merkle) pub aux_merkle: Duration, } diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs index a20a452b6..051651fde 100644 --- a/crypto/stark/src/par.rs +++ b/crypto/stark/src/par.rs @@ -37,3 +37,55 @@ where (a(), b()) } } + +/// Map `f(i)` over `range` and collect into a `Vec`, preserving index order. +/// Parallel when `feature = "parallel"`, sequential otherwise. Rayon's +/// `collect()` is index-ordered, so the result is identical either way. +pub(crate) fn par_map_collect( + range: std::ops::Range, + f: impl Fn(usize) -> R + Sync + Send, +) -> Vec { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + range.into_par_iter().map(f).collect() + } + #[cfg(not(feature = "parallel"))] + { + range.map(f).collect() + } +} + +/// Run `f(&mut item)` for each element of `slice`. Parallel when +/// `feature = "parallel"`, sequential otherwise (ordering is irrelevant). +pub(crate) fn par_for_each_mut(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + slice.par_iter_mut().for_each(f); + } + #[cfg(not(feature = "parallel"))] + { + slice.iter_mut().for_each(f); + } +} + +/// Run `f(&mut item)` for each element of `slice`, short-circuiting on the +/// first `Err`. Parallel when `feature = "parallel"`, sequential otherwise. +// Only called from `disk-spill`-gated paths; keep it available without warning +// when that feature is off. +#[cfg_attr(not(feature = "disk-spill"), allow(dead_code))] +pub(crate) fn par_try_for_each_mut( + slice: &mut [T], + f: impl Fn(&mut T) -> Result<(), E> + Sync + Send, +) -> Result<(), E> { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + slice.par_iter_mut().try_for_each(f) + } + #[cfg(not(feature = "parallel"))] + { + slice.iter_mut().try_for_each(f) + } +} diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 533f49154..7117fd708 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -92,6 +92,12 @@ pub enum ProvingError { DiskSpill(String), } +impl From for ProvingError { + fn from(e: FFTError) -> Self { + ProvingError::WrongParameter(format!("{e}")) + } +} + /// Commitment artifacts for one trace table (main or auxiliary). Used for both /// plain and preprocessed tables. Preprocessed tables additionally carry a /// separate Merkle tree over their precomputed columns, hence the optional @@ -351,7 +357,7 @@ where /// A container for the results of the third round of the STARK Prove protocol. pub(crate) struct Round3 { - /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge. + /// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge. trace_ood_evaluations: Table, /// Evaluations of the composition polynomial parts at the out-of-domain challenge. composition_poly_parts_ood_evaluation: Vec>, @@ -434,7 +440,7 @@ pub trait IsStarkProver< let twiddles = LdeTwiddles::new(&domain); let evals = Self::compute_lde_from_columns_cached::(&precomputed, &domain, &twiddles); - let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 2)?; + let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?; Some(commitment) } @@ -456,23 +462,16 @@ pub trait IsStarkProver< return Vec::new(); } - #[cfg(not(feature = "parallel"))] - let columns_iter = columns.iter(); - #[cfg(feature = "parallel")] - let columns_iter = columns.par_iter(); - - columns_iter - .map(|col| { - Polynomial::coset_lde_full::( - col, - domain.blowup_factor, - &twiddles.coset_weights, - &twiddles.inv, - &twiddles.fwd, - ) - }) - .collect::>>, _>>() + crate::par::par_map_collect(0..columns.len(), |i| { + Polynomial::coset_lde_full::( + &columns[i], + domain.blowup_factor, + &twiddles.coset_weights, + &twiddles.inv, + &twiddles.fwd, + ) .expect("coset LDE computation") + }) } /// Expand each column in-place from N evaluations to N×blowup LDE evaluations. @@ -507,11 +506,7 @@ pub trait IsStarkProver< return; } - #[cfg(feature = "parallel")] - let iter = columns.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let iter = columns.iter_mut(); - iter.for_each(|buf| { + crate::par::par_for_each_mut(columns, |buf| { Polynomial::coset_lde_full_expand::( buf, domain.blowup_factor, @@ -588,7 +583,7 @@ pub trait IsStarkProver< let commit = match precomputed { None => { #[allow(unused_mut)] - let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2) + let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { @@ -600,11 +595,11 @@ pub trait IsStarkProver< Some((expected_precomputed_root, num_cols)) => { #[allow(unused_mut)] let (mut precomputed_tree, precomputed_root) = - crate::commitment::commit_bit_reversed(&columns[..num_cols], 2) + crate::commitment::commit_bit_reversed(&columns[..num_cols], crate::commitment::ROWS_PER_LEAF) .ok_or(ProvingError::EmptyCommitment)?; #[allow(unused_mut)] let (mut mult_tree, mult_root) = - crate::commitment::commit_bit_reversed(&columns[num_cols..], 2) + crate::commitment::commit_bit_reversed(&columns[num_cols..], crate::commitment::ROWS_PER_LEAF) .ok_or(ProvingError::EmptyCommitment)?; if precomputed_root != expected_precomputed_root { return Err(ProvingError::PrecomputedCommitmentMismatch); @@ -868,11 +863,10 @@ pub trait IsStarkProver< } else { // Fallback for any future AIR with d > 2. let composition_poly = - Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset) - .unwrap(); + Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)?; let composition_poly_parts = composition_poly.break_in_parts(number_of_parts); - let cpu_eval = || -> Vec>> { + let cpu_eval = || -> Result>>, ProvingError> { composition_poly_parts .iter() .map(|part| { @@ -882,7 +876,7 @@ pub trait IsStarkProver< domain.interpolation_domain_size, &domain.coset_offset, ) - .unwrap() + .map_err(ProvingError::from) }) .collect() }; @@ -907,11 +901,11 @@ pub trait IsStarkProver< gpu_composition_parts = Some(handle); evals } - None => cpu_eval(), + None => cpu_eval()?, } } #[cfg(not(feature = "cuda"))] - cpu_eval() + cpu_eval()? }; #[cfg(feature = "instruments")] let fft_dur = t_sub.elapsed(); @@ -933,7 +927,7 @@ pub trait IsStarkProver< let root = tree.root; (tree, root) } - None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, 2) + None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, crate::commitment::ROWS_PER_LEAF) .ok_or(ProvingError::EmptyCommitment)?, }; #[cfg(feature = "instruments")] @@ -1299,12 +1293,7 @@ pub trait IsStarkProver< }) .collect(); - #[cfg(feature = "parallel")] - let iter = (0..lde_size).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..lde_size; - - iter.map(|i| { + crate::par::par_map_collect(0..lde_size, |i| { let mut result = FieldElement::::zero(); // H terms @@ -1330,7 +1319,6 @@ pub trait IsStarkProver< result }) - .collect() } /// Computes values and validity proofs of the evaluations of the composition polynomial parts @@ -1347,7 +1335,7 @@ pub trait IsStarkProver< { let proof = composition_poly_merkle_tree .get_proof_by_pos(index) - .unwrap(); + .expect("FRI query index in bounds"); let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations .iter() @@ -1394,7 +1382,8 @@ pub trait IsStarkProver< // single leaf at position `challenge`; one Merkle path authenticates both // the queried row and its symmetric counterpart. PolynomialOpenings { - proof: tree.get_proof_by_pos(challenge).unwrap(), + proof: tree.get_proof_by_pos(challenge) + .expect("FRI query index in bounds"), evaluations: gather(reverse_index(challenge * 2, domain_size)), evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)), } @@ -1462,7 +1451,7 @@ pub trait IsStarkProver< openings } - // TODO: propagate errors instead of unwrap() in commit_columns, reconstruct_round1, and expand_columns_to_lde + // TODO: propagate errors instead of unwrap() in commit_main_trace, reconstruct_round1, and expand_columns_to_lde /// Generates STARK proofs for one or more AIRs with a shared transcript. /// /// # Multi-Table Proving with LogUp @@ -1561,11 +1550,7 @@ pub trait IsStarkProver< // Spill main traces to mmap before Round 1 LDE. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { - #[cfg(feature = "parallel")] - let spill_iter = air_trace_pairs.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let mut spill_iter = air_trace_pairs.iter_mut(); - spill_iter.try_for_each(|(_, trace, _)| { + crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(_, trace, _)| { trace .main_table .spill_to_disk() @@ -1602,13 +1587,8 @@ pub trait IsStarkProver< let chunk_end = (chunk_start + k).min(num_airs); let chunk_range = chunk_start..chunk_end; - #[cfg(feature = "parallel")] - let iter = chunk_range.into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = chunk_range; - - let chunk_results: Vec> = iter - .map(|idx| { + let chunk_results: Vec> = + crate::par::par_map_collect(chunk_range, |idx| { let (air, trace, _) = &air_trace_pairs[idx]; let domain = &domains[idx]; let twiddles = &twiddle_caches[idx]; @@ -1624,8 +1604,7 @@ pub trait IsStarkProver< #[cfg(feature = "disk-spill")] storage_mode, ) - }) - .collect(); + }); // Sequential: append roots to shared transcript (Fiat-Shamir ordering) for result in chunk_results { @@ -1697,17 +1676,13 @@ pub trait IsStarkProver< // Spill all aux trace tables to mmap before any Round 1 aux LDE work. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { - #[cfg(feature = "parallel")] - let spill_iter = air_trace_pairs.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let mut spill_iter = air_trace_pairs.iter_mut(); - spill_iter.try_for_each(|(air, trace, _)| { + crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(air, trace, _)| { if air.has_aux_trace() { trace .spill_aux_to_disk() .map_err(|e| ProvingError::DiskSpill(format!("aux trace: {e}")))?; } - Ok(()) + Ok::<(), ProvingError>(()) })?; } @@ -1753,14 +1728,9 @@ pub trait IsStarkProver< let chunk_end = (chunk_start + k).min(num_airs); let chunk_range = chunk_start..chunk_end; - #[cfg(feature = "parallel")] - let iter = chunk_range.into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = chunk_range; - #[allow(clippy::type_complexity)] - let chunk_aux: Vec, ProvingError>> = iter - .map(|idx| { + let chunk_aux: Vec, ProvingError>> = + crate::par::par_map_collect(chunk_range, |idx| { let (air, trace, _) = &air_trace_pairs[idx]; let domain = &domains[idx]; let twiddles = &twiddle_caches[idx]; @@ -1818,7 +1788,7 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let t_sub = Instant::now(); #[allow(unused_mut)] - let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2) + let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF) .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "instruments")] crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); @@ -1839,8 +1809,7 @@ pub trait IsStarkProver< #[cfg(not(feature = "cuda"))] Ok((None, Vec::new())) } - }) - .collect(); + }); // Sequential: append aux roots to forked transcripts. for (j, result) in chunk_aux.into_iter().enumerate() { @@ -2055,7 +2024,7 @@ pub trait IsStarkProver< // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations /// Executes rounds 2-4 and generates a STARK proof for the trace `main_trace` with public inputs `pub_inputs`. - /// Warning: the transcript must be safely initializated before passing it to this method. + /// Warning: the transcript must be safely initialized before passing it to this method. fn prove_rounds_2_to_4( air: &dyn AIR, pub_inputs: &PI, @@ -2068,7 +2037,7 @@ pub trait IsStarkProver< FieldElement: AsBytes, PI: Send + Sync + Clone, { - info!("Started proof generation..."); + log::debug!("Started proof generation..."); // =================================== // ==========| Round 2 |========== @@ -2181,7 +2150,7 @@ pub trait IsStarkProver< }); } - info!("End proof generation"); + log::debug!("End proof generation"); Ok(StarkProof { // [t] From 256e69c9a59360b559b19e11ebb6a309d62b8ec2 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 19 Jun 2026 17:33:39 -0300 Subject: [PATCH 04/19] docs(stark): fix commitment.rs leaf-layout docs after trace pairing The trace commitment now uses the row-pair leaf layout (ROWS_PER_LEAF=2), same as composition; rows_per_leaf=1 is only kept for the GPU parity test. Update the module/const/wrapper docs that still described the pre-pairing per-row trace. --- crypto/stark/src/commitment.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs index a55d050b2..d4a6dbdbe 100644 --- a/crypto/stark/src/commitment.rs +++ b/crypto/stark/src/commitment.rs @@ -15,13 +15,15 @@ //! where R = rows_per_leaf and br(j) = reverse_index(j, num_rows) //! ``` //! -//! - `rows_per_leaf == 1`: one row per leaf — main/aux trace LDE columns. -//! - `rows_per_leaf == 2`: a row pair per leaf — composition-polynomial parts -//! (Round 2/3), where leaf `i` hashes rows `2i` and `2i+1`. +//! - `rows_per_leaf == 2` (`ROWS_PER_LEAF`): a row pair per leaf (leaf `i` hashes +//! rows `2i` and `2i+1`). Used by BOTH the main/aux trace LDE and the +//! composition-polynomial parts: a FRI query opens a value and its symmetric +//! counterpart — exactly this pair — so one Merkle path authenticates both. +//! - `rows_per_leaf == 1`: one row per leaf. No longer used by the prover; kept +//! only so the GPU parity tests can compare against the per-row code path. //! //! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept -//! exactly as before, so commitments are byte-identical to the previous -//! per-row / per-row-pair implementations. +//! exactly as before. use math::fft::bit_reversing::reverse_index; use math::field::element::FieldElement; @@ -34,8 +36,9 @@ use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; /// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the -/// composition-polynomial commitment: the row-pair leaf packing the FRI openings -/// rely on (leaf `i` hashes rows `2i` and `2i+1`). +/// trace AND composition-polynomial commitments: the row-pair leaf the FRI +/// openings rely on (leaf `i` hashes rows `2i` and `2i+1`, so one Merkle path +/// authenticates both a value and its symmetric counterpart). pub const ROWS_PER_LEAF: usize = 2; /// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment, @@ -103,12 +106,12 @@ where result } -/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Used for the -/// main/aux trace LDE commitments. Thin wrapper over -/// [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`. +/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Thin wrapper +/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`. /// -/// Kept as a named public function because the GPU parity tests in dependent -/// crates compare against this exact code path. +/// The prover no longer commits per-row (trace and composition both use the +/// row-pair layout, `ROWS_PER_LEAF`); this stays a named public function only so +/// the GPU parity tests in dependent crates can compare the per-row code path. pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec where E: IsField, From f34a6f6ce4e4ead7320a663818a916cadc2f0a56 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 19 Jun 2026 17:45:44 -0300 Subject: [PATCH 05/19] refactor(prover): dedup commit pipeline (commit_plain + spill_tree) (B) Extract two helpers on IsStarkProver, collapsing the near-duplicate main-trace and aux-trace commit code: - spill_tree: the identical-except-label disk-spill block, shared by the main / preprocessed-split / aux commit sites (4 call sites). - commit_plain: commit_bit_reversed + spill_tree + TableCommit::plain, shared by the main-trace (non-preprocessed None arm) and aux-trace plain-commit paths. Proof output is byte-identical (stark 128/128, +disk-spill 133/133). The only behavioral delta is in the instruments profiling feature: the aux commit's timing bucket now includes its (tiny) disk spill, matching what the main path already measured. clippy clean on default / disk-spill / instruments; builds on the combined feature set. --- crypto/stark/src/prover.rs | 88 ++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 28 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 7117fd708..74026c112 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -581,17 +581,13 @@ pub trait IsStarkProver< let t_sub = Instant::now(); let commit = match precomputed { - None => { - #[allow(unused_mut)] - let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF) - .ok_or(ProvingError::EmptyCommitment)?; + None => Self::commit_plain::( + &columns, #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - tree.spill_nodes_to_disk() - .map_err(|e| ProvingError::DiskSpill(format!("main Merkle tree: {e}")))?; - } - TableCommit::plain(tree, root) - } + storage_mode, + #[cfg(feature = "disk-spill")] + "main Merkle tree", + )?, Some((expected_precomputed_root, num_cols)) => { #[allow(unused_mut)] let (mut precomputed_tree, precomputed_root) = @@ -605,13 +601,9 @@ pub trait IsStarkProver< return Err(ProvingError::PrecomputedCommitmentMismatch); } #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - precomputed_tree.spill_nodes_to_disk().map_err(|e| { - ProvingError::DiskSpill(format!("precomputed Merkle tree: {e}")) - })?; - mult_tree - .spill_nodes_to_disk() - .map_err(|e| ProvingError::DiskSpill(format!("mult Merkle tree: {e}")))?; + { + Self::spill_tree(&mut precomputed_tree, storage_mode, "precomputed Merkle tree")?; + Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?; } TableCommit::preprocessed( mult_tree, @@ -632,6 +624,48 @@ pub trait IsStarkProver< Ok((commit, columns)) } + /// Spill a committed Merkle tree to disk when `storage_mode` is `Disk`, + /// tagging any I/O error with `label`. No-op otherwise. Shared by every commit + /// site (main / preprocessed split / aux). + #[cfg(feature = "disk-spill")] + fn spill_tree( + tree: &mut BatchedMerkleTree, + storage_mode: StorageMode, + label: &str, + ) -> Result<(), ProvingError> + where + C: IsField, + FieldElement: AsBytes + Sync + Send, + { + if storage_mode == StorageMode::Disk { + tree.spill_nodes_to_disk() + .map_err(|e| ProvingError::DiskSpill(format!("{label}: {e}")))?; + } + Ok(()) + } + + /// Commit an already-LDE-expanded plain column set: build the row-paired + /// Merkle tree, spill it to disk if requested, and wrap it as a plain + /// `TableCommit`. Shared by the main-trace (non-preprocessed) and aux-trace + /// commit paths. + fn commit_plain( + columns: &[Vec>], + #[cfg(feature = "disk-spill")] storage_mode: StorageMode, + #[cfg(feature = "disk-spill")] spill_label: &str, + ) -> Result, ProvingError> + where + C: IsField, + FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, + { + #[allow(unused_mut)] + let (mut tree, root) = + crate::commitment::commit_bit_reversed(columns, crate::commitment::ROWS_PER_LEAF) + .ok_or(ProvingError::EmptyCommitment)?; + #[cfg(feature = "disk-spill")] + Self::spill_tree(&mut tree, storage_mode, spill_label)?; + Ok(TableCommit::plain(tree, root)) + } + /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments. /// /// Only used by `run_debug_checks` — Phase D consumes the cached LDE @@ -1787,22 +1821,20 @@ pub trait IsStarkProver< let aux_lde_dur = t_sub.elapsed(); #[cfg(feature = "instruments")] let t_sub = Instant::now(); - #[allow(unused_mut)] - let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF) - .ok_or(ProvingError::EmptyCommitment)?; + let commit = Self::commit_plain::( + &columns, + #[cfg(feature = "disk-spill")] + storage_mode, + #[cfg(feature = "disk-spill")] + "aux Merkle tree", + )?; #[cfg(feature = "instruments")] crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); - #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - tree.spill_nodes_to_disk().map_err(|e| { - ProvingError::DiskSpill(format!("aux Merkle tree: {e}")) - })?; - } #[cfg(feature = "cuda")] - return Ok((Some(TableCommit::plain(tree, root)), columns, None)); + return Ok((Some(commit), columns, None)); #[cfg(not(feature = "cuda"))] - Ok((Some(TableCommit::plain(tree, root)), columns)) + Ok((Some(commit), columns)) } else { #[cfg(feature = "cuda")] return Ok((None, Vec::new(), None)); From b4784e64f2dec8570d1d8b924666ee618e44e172 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 22 Jun 2026 11:05:57 -0300 Subject: [PATCH 06/19] fix(prover): row-pair the preprocessed-table commitments (CI fix) The trace Merkle commitment moved to a row-pair leaf layout (ROWS_PER_LEAF=2), but the 5 preprocessed-table commitment computers (bitwise/keccak_rc/page/decode/register) still built a 1-row-per-leaf tree manually, so the prover's row-pair precomputed root no longer matched the computed/hardcoded one -> PrecomputedCommitmentMismatch at prove time. - Route all 5 compute_*commitment fns through the shared stark::commitment::commit_bit_reversed(.., ROWS_PER_LEAF), dropping the manual bit-reverse + columns2rows + 1-row BatchedMerkleTree::build. - Regenerate the hardcoded bitwise/keccak_rc/zero_page static commitments for the row-pair layout (via compute_static_commitments). - cargo fmt (prover.rs + touched tables). Verified: static_commitments drift tests 5/5, stark 128/128, clippy + fmt clean, no PrecomputedCommitmentMismatch. Full ELF prove/verify runs in CI (guest artifacts absent locally). --- crypto/stark/src/prover.rs | 34 +++++++++++++++++------- prover/src/tables/bitwise.rs | 48 ++++++++++++---------------------- prover/src/tables/decode.rs | 22 +++++----------- prover/src/tables/keccak_rc.rs | 38 +++++++++++---------------- prover/src/tables/page.rs | 35 +++++++++++-------------- prover/src/tables/register.rs | 17 +++++------- 6 files changed, 82 insertions(+), 112 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 74026c112..ab1b3275b 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -440,7 +440,8 @@ pub trait IsStarkProver< let twiddles = LdeTwiddles::new(&domain); let evals = Self::compute_lde_from_columns_cached::(&precomputed, &domain, &twiddles); - let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?; + let (_, commitment) = + crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?; Some(commitment) } @@ -591,18 +592,27 @@ pub trait IsStarkProver< Some((expected_precomputed_root, num_cols)) => { #[allow(unused_mut)] let (mut precomputed_tree, precomputed_root) = - crate::commitment::commit_bit_reversed(&columns[..num_cols], crate::commitment::ROWS_PER_LEAF) - .ok_or(ProvingError::EmptyCommitment)?; + crate::commitment::commit_bit_reversed( + &columns[..num_cols], + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?; #[allow(unused_mut)] - let (mut mult_tree, mult_root) = - crate::commitment::commit_bit_reversed(&columns[num_cols..], crate::commitment::ROWS_PER_LEAF) - .ok_or(ProvingError::EmptyCommitment)?; + let (mut mult_tree, mult_root) = crate::commitment::commit_bit_reversed( + &columns[num_cols..], + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?; if precomputed_root != expected_precomputed_root { return Err(ProvingError::PrecomputedCommitmentMismatch); } #[cfg(feature = "disk-spill")] { - Self::spill_tree(&mut precomputed_tree, storage_mode, "precomputed Merkle tree")?; + Self::spill_tree( + &mut precomputed_tree, + storage_mode, + "precomputed Merkle tree", + )?; Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?; } TableCommit::preprocessed( @@ -961,8 +971,11 @@ pub trait IsStarkProver< let root = tree.root; (tree, root) } - None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, crate::commitment::ROWS_PER_LEAF) - .ok_or(ProvingError::EmptyCommitment)?, + None => crate::commitment::commit_bit_reversed( + &lde_composition_poly_parts_evaluations, + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?, }; #[cfg(feature = "instruments")] let merkle_dur = t_sub.elapsed(); @@ -1416,7 +1429,8 @@ pub trait IsStarkProver< // single leaf at position `challenge`; one Merkle path authenticates both // the queried row and its symmetric counterpart. PolynomialOpenings { - proof: tree.get_proof_by_pos(challenge) + proof: tree + .get_proof_by_pos(challenge) .expect("FRI query index in bounds"), evaluations: gather(reverse_index(challenge * 2, domain_size)), evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)), diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs index cb92e37ce..4788f65af 100644 --- a/prover/src/tables/bitwise.rs +++ b/prover/src/tables/bitwise.rs @@ -25,13 +25,13 @@ //! All lookups are provided as receivers with negative multiplicity, //! meaning other tables send to this table. -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; #[cfg(feature = "parallel")] use rayon::prelude::*; @@ -195,19 +195,19 @@ pub const fn is_preprocessed() -> bool { fn static_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xfb, 0x46, 0xff, 0x1c, 0xed, 0x4c, 0x97, 0xfb, 0xb2, 0x17, 0x55, 0x24, 0x08, 0x04, - 0x15, 0xee, 0xbe, 0xa6, 0xee, 0x86, 0x69, 0xaf, 0x3a, 0x4f, 0x9e, 0x2a, 0x44, 0x81, - 0xf9, 0xb0, 0xf3, 0xff, + 0xfa, 0x3e, 0xcf, 0x80, 0xfd, 0x95, 0xe5, 0x09, 0x74, 0xd4, 0x55, 0x23, 0xf6, 0x42, + 0xb6, 0x4b, 0x05, 0xc4, 0xf9, 0x66, 0xc2, 0x4d, 0xff, 0xda, 0x31, 0x47, 0xab, 0x7b, + 0x0c, 0x6d, 0xc4, 0xcf, ]), 4 => Some([ - 0xb5, 0xc4, 0xc0, 0x80, 0x03, 0x5b, 0xb6, 0x12, 0x78, 0x8c, 0x4d, 0xd4, 0x9e, 0x3d, - 0xc4, 0xe2, 0xef, 0x95, 0xf0, 0xbf, 0xe8, 0x1d, 0x98, 0xec, 0x7f, 0x58, 0x3a, 0x47, - 0x18, 0x03, 0x7e, 0xa5, + 0xff, 0x76, 0x8e, 0x85, 0x4b, 0xdc, 0x32, 0x61, 0x96, 0x16, 0x15, 0x19, 0x73, 0x70, + 0xf0, 0x64, 0x81, 0xfd, 0x4f, 0x5c, 0xbd, 0x9c, 0x30, 0x26, 0xd5, 0xc0, 0x81, 0xf3, + 0xce, 0x38, 0x50, 0x3e, ]), 8 => Some([ - 0x8a, 0x18, 0x70, 0x51, 0x34, 0x1a, 0x65, 0xaa, 0x79, 0x17, 0x07, 0x9a, 0xf3, 0x0b, - 0xcb, 0xd0, 0x7c, 0xe3, 0x2a, 0xce, 0x89, 0x9a, 0xfd, 0xc8, 0x0d, 0x6b, 0x48, 0x43, - 0x83, 0x5d, 0x18, 0xb8, + 0x0e, 0x1b, 0xc1, 0x0d, 0xae, 0x64, 0xe7, 0xca, 0xe0, 0x2a, 0x3b, 0xab, 0xd7, 0xd2, + 0xbb, 0x80, 0xd5, 0x24, 0x5a, 0xce, 0x25, 0xb6, 0x84, 0x77, 0x9c, 0xb5, 0xeb, 0x67, + 0x61, 0x82, 0x78, 0x3d, ]), _ => None, } @@ -283,7 +283,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { let coset_offset = FE::from(options.coset_offset); #[cfg(feature = "parallel")] - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .par_iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -292,7 +292,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { .collect(); #[cfg(not(feature = "parallel"))] - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -300,25 +300,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { }) .collect(); - // Step 4: Bit-reverse permute (parallel) - #[cfg(feature = "parallel")] - lde_columns.par_iter_mut().for_each(|col| { - in_place_bit_reverse_permute(col); - }); - - #[cfg(not(feature = "parallel"))] - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Step 5: Convert columns to rows for Merkle tree - let lde_rows = columns2rows(lde_columns); - - // Step 6: Build Merkle tree over LDE (N * blowup leaves) - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for bitwise LDE"); - - tree.root + root } /// Returns the preprocessed commitment for the bitwise table. diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs index f1fe14e03..b70115a32 100644 --- a/prover/src/tables/decode.rs +++ b/prover/src/tables/decode.rs @@ -34,13 +34,13 @@ use executor::elf::Elf; use executor::vm::instruction::decoding::{Instruction, InstructionError}; use executor::vm::memory::U64HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField}; @@ -286,7 +286,7 @@ pub fn compute_precomputed_commitment( // Step 4: Evaluate polynomials on LDE domain (N * blowup_factor points) let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -294,19 +294,9 @@ pub fn compute_precomputed_commitment( }) .collect(); - // Step 5: Bit-reverse permute (same as prover) - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Step 6: Convert columns to rows for Merkle tree - let lde_rows = columns2rows(lde_columns); - - // Step 7: Build Merkle tree over LDE (N * blowup leaves) - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for decode LDE"); - - tree.root + root } // ========================================================================= diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs index 3bcdf3428..99cd875b6 100644 --- a/prover/src/tables/keccak_rc.rs +++ b/prover/src/tables/keccak_rc.rs @@ -8,14 +8,14 @@ //! committed via a static lookup table (with recompute as fallback for //! `ProofOptions` not covered by the static table). -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::field::element::FieldElement; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use executor::vm::instruction::execution::KECCAK_RC; @@ -98,19 +98,19 @@ pub const fn generate_row(round: usize) -> [u64; NUM_PRECOMPUTED_COLS] { fn static_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xe8, 0x06, 0x8b, 0xb2, 0xbd, 0x3d, 0x80, 0xf3, 0x92, 0x95, 0x31, 0x1a, 0xfd, 0x55, - 0xba, 0x12, 0x3f, 0x76, 0xeb, 0x44, 0x32, 0x57, 0x9d, 0xb7, 0x7f, 0x1e, 0x63, 0xb4, - 0x98, 0xb5, 0xb0, 0xb7, + 0xab, 0x7a, 0xad, 0xf5, 0xbf, 0xa2, 0xd5, 0x5c, 0x29, 0x83, 0x83, 0xe6, 0x2e, 0x47, + 0xa0, 0xa5, 0x22, 0xf9, 0x57, 0x89, 0x5a, 0x5c, 0xbb, 0x1f, 0x34, 0xbc, 0x21, 0x72, + 0xa9, 0x2c, 0x85, 0xe3, ]), 4 => Some([ - 0xa9, 0xfb, 0xc9, 0x15, 0x1c, 0x22, 0x75, 0xe7, 0x56, 0xeb, 0x6d, 0xf9, 0xfe, 0x83, - 0x2a, 0xb1, 0xa7, 0x1a, 0x20, 0x71, 0x9b, 0x0c, 0xff, 0x6b, 0x3f, 0x57, 0xc6, 0x84, - 0x3e, 0xbf, 0xc8, 0xaa, + 0xfb, 0x42, 0x58, 0x76, 0xf4, 0x30, 0x98, 0x04, 0xef, 0x8c, 0x4e, 0x65, 0xf7, 0x1a, + 0x29, 0x03, 0xd2, 0xc6, 0x12, 0x0d, 0x18, 0xe1, 0x28, 0x6e, 0x70, 0xeb, 0xa8, 0x11, + 0x3c, 0x3e, 0xe1, 0xdd, ]), 8 => Some([ - 0x5c, 0x30, 0xf6, 0xa0, 0xcf, 0x78, 0x43, 0x15, 0x5b, 0x5d, 0x18, 0x34, 0x44, 0xba, - 0x81, 0x9a, 0x64, 0x05, 0x5c, 0x79, 0x26, 0x18, 0x09, 0x24, 0x6b, 0xa2, 0x3f, 0x5f, - 0x77, 0x09, 0xd5, 0xfc, + 0x9f, 0x91, 0xaf, 0xb6, 0x5b, 0x75, 0x1e, 0xfb, 0x73, 0x93, 0x2c, 0xc4, 0xa8, 0xe1, + 0xb5, 0x21, 0x91, 0x5d, 0x6a, 0x19, 0x2e, 0x1d, 0xa8, 0x80, 0x21, 0x1f, 0x36, 0x76, + 0x9b, 0x8e, 0x3d, 0xb6, ]), _ => None, } @@ -145,7 +145,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { // Evaluate on LDE domain let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -153,17 +153,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { }) .collect(); - // Bit-reverse permute - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Build Merkle tree - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for keccak_rc LDE"); - - tree.root + root } /// Returns the preprocessed commitment for the keccak_rc table. diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs index 3997e8c22..e8f7c578e 100644 --- a/prover/src/tables/page.rs +++ b/prover/src/tables/page.rs @@ -32,13 +32,13 @@ use std::collections::HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField}; @@ -247,19 +247,19 @@ pub fn generate_page_trace( pub(crate) fn static_zero_page_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xf9, 0x80, 0x0e, 0x45, 0x72, 0x5a, 0x8e, 0x8e, 0x5e, 0xd7, 0x5b, 0x60, 0xce, 0xd0, - 0x8e, 0xa3, 0x27, 0x3b, 0x8a, 0xb5, 0x98, 0xc0, 0xe3, 0x16, 0xf6, 0x86, 0x75, 0x39, - 0x4c, 0xe5, 0x88, 0x5e, + 0x7d, 0x74, 0x85, 0xf0, 0x2b, 0x74, 0xe0, 0x3f, 0x14, 0x99, 0xb3, 0xa0, 0x5f, 0x1d, + 0x6e, 0xf2, 0x21, 0xff, 0xaf, 0x24, 0x7e, 0x30, 0xb0, 0xda, 0x48, 0x79, 0xe1, 0x43, + 0xee, 0xea, 0x6a, 0x0f, ]), 4 => Some([ - 0x0f, 0xb5, 0x0c, 0xa8, 0x3b, 0x69, 0x4f, 0x91, 0x60, 0xbf, 0x0d, 0x0d, 0xd3, 0x33, - 0x25, 0x38, 0x11, 0xbb, 0xf8, 0xfd, 0x54, 0xbd, 0x06, 0x7d, 0xd1, 0xeb, 0xa3, 0x58, - 0xe8, 0x37, 0x45, 0x56, + 0x5c, 0xcc, 0x5b, 0xb1, 0xe8, 0x11, 0x91, 0x81, 0xbd, 0xdd, 0x39, 0x40, 0x77, 0x87, + 0xdc, 0x98, 0x06, 0x06, 0x8c, 0x63, 0xcd, 0xfd, 0xf1, 0xda, 0x4a, 0x55, 0x31, 0x4d, + 0x6a, 0x16, 0x18, 0xd0, ]), 8 => Some([ - 0x4a, 0xfb, 0xc9, 0x6d, 0x46, 0x29, 0xa3, 0xc2, 0x36, 0x14, 0xd8, 0x24, 0x3e, 0xef, - 0x97, 0x3f, 0xe1, 0xda, 0x2b, 0xf7, 0x87, 0xb6, 0x54, 0xe1, 0xc6, 0x46, 0xc0, 0x85, - 0x96, 0x7f, 0x7f, 0x48, + 0xf0, 0xc0, 0x69, 0xed, 0xf8, 0x59, 0xd6, 0x56, 0x15, 0x3c, 0x2f, 0x93, 0x65, 0xd6, + 0xe9, 0xe9, 0x8e, 0xd1, 0x83, 0x94, 0xf9, 0x75, 0x59, 0xd1, 0xec, 0x16, 0xe1, 0x37, + 0xd5, 0x32, 0xd6, 0xd9, ]), _ => None, } @@ -312,7 +312,7 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -320,14 +320,9 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption }) .collect(); - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for page LDE"); - tree.root + root } /// Returns the zero-init PAGE preprocessed commitment. diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs index 2907c924a..662996dca 100644 --- a/prover/src/tables/register.rs +++ b/prover/src/tables/register.rs @@ -20,13 +20,13 @@ use std::collections::HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::page::STACK_TOP; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField}; @@ -219,7 +219,7 @@ pub fn compute_precomputed_commitment(options: &ProofOptions, entry_point: u64) let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -227,14 +227,9 @@ pub fn compute_precomputed_commitment(options: &ProofOptions, entry_point: u64) }) .collect(); - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for register LDE"); - tree.root + root } /// Returns the preprocessed commitment for the REGISTER table. From c6b9832ad2faaf25444a56fed2d098c7fc7e97be Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 22 Jun 2026 11:36:12 -0300 Subject: [PATCH 07/19] test(prover): regenerate SUB_DECODE_COMMITMENT_BLOWUP_2 for row-pair layout The compile-time decode-commitment const for sub.elf shifted with the row-pair preprocessed commitment; regenerated via commitment_from_elf (the print_decode_commitment_for_sub regen path). --- prover/src/tests/decode_tests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs index 43e6991cf..e2169da5a 100644 --- a/prover/src/tests/decode_tests.rs +++ b/prover/src/tests/decode_tests.rs @@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() { /// AIR or FFT pipeline changes, this drifts and the test fails — /// regenerate via the `print_decode_commitment_for_sub` helper below. const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [ - 0x60, 0x66, 0x0b, 0x18, 0x0d, 0x41, 0x08, 0xb3, 0x3a, 0x03, 0x99, 0x03, 0x8c, 0x9d, 0x12, 0x57, - 0x68, 0x8d, 0xed, 0x13, 0x60, 0xeb, 0x1d, 0x2b, 0xa8, 0xea, 0x1c, 0x76, 0xc9, 0xdd, 0x25, 0xaf, + 0x9b, 0x94, 0x41, 0x2d, 0xf0, 0x7f, 0xde, 0x2a, 0x0c, 0x31, 0x99, 0x79, 0x43, 0xef, 0x8b, 0xc1, + 0x57, 0x05, 0x5f, 0xc6, 0x34, 0x22, 0x06, 0x06, 0x7c, 0x31, 0x73, 0xa0, 0x3b, 0xd8, 0xe3, 0x95, ]; #[test] From acd06e623257bbe3a422926061ea58571f2fd970 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 22 Jun 2026 11:47:12 -0300 Subject: [PATCH 08/19] test(prover): TEMP print actual decode commitment to regenerate const from CI --- prover/src/tests/decode_tests.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs index e2169da5a..618bfb672 100644 --- a/prover/src/tests/decode_tests.rs +++ b/prover/src/tests/decode_tests.rs @@ -252,6 +252,16 @@ fn decode_commitment_compile_time_const_accepts() { let vm_proof = prove(&elf_bytes).expect("prove failed"); let options = GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 valid"); + // TEMP(ci-regen): surface the actual row-pair decode commitment computed from + // CI's sub.elf so SUB_DECODE_COMMITMENT_BLOWUP_2 can be regenerated. Remove. + { + let elf = Elf::load(&elf_bytes).expect("ELF load"); + eprintln!( + "ACTUAL_SUB_DECODE_COMMITMENT_BLOWUP_2 = {:02x?}", + commitment_from_elf(&elf, &options).expect("decode commitment") + ); + } + // Pass the OFFLINE-COMPUTED const directly — mimics the recursion guest's // workflow where the value lives in the caller's compiled binary. let result = verify_with_options( From c1ba23fc9f23faac8cd3fac6a221e9f0f84110ec Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 22 Jun 2026 11:56:47 -0300 Subject: [PATCH 09/19] test(prover): set SUB_DECODE_COMMITMENT_BLOWUP_2 to CI-computed row-pair value Regenerated from CI's sub.elf (local riscv toolchain unavailable); removed the temporary print instrumentation. --- prover/src/tests/decode_tests.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs index 618bfb672..a761ac929 100644 --- a/prover/src/tests/decode_tests.rs +++ b/prover/src/tests/decode_tests.rs @@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() { /// AIR or FFT pipeline changes, this drifts and the test fails — /// regenerate via the `print_decode_commitment_for_sub` helper below. const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [ - 0x9b, 0x94, 0x41, 0x2d, 0xf0, 0x7f, 0xde, 0x2a, 0x0c, 0x31, 0x99, 0x79, 0x43, 0xef, 0x8b, 0xc1, - 0x57, 0x05, 0x5f, 0xc6, 0x34, 0x22, 0x06, 0x06, 0x7c, 0x31, 0x73, 0xa0, 0x3b, 0xd8, 0xe3, 0x95, + 0xe9, 0x71, 0x68, 0xd6, 0x2e, 0xb1, 0xf6, 0x56, 0x61, 0x9d, 0x04, 0x6e, 0x65, 0xed, 0x63, 0x4a, + 0x27, 0xa3, 0x4d, 0xcb, 0x6c, 0x02, 0x11, 0xd7, 0x65, 0xc9, 0xc9, 0xfd, 0x59, 0x34, 0x41, 0x5f, ]; #[test] @@ -252,16 +252,6 @@ fn decode_commitment_compile_time_const_accepts() { let vm_proof = prove(&elf_bytes).expect("prove failed"); let options = GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 valid"); - // TEMP(ci-regen): surface the actual row-pair decode commitment computed from - // CI's sub.elf so SUB_DECODE_COMMITMENT_BLOWUP_2 can be regenerated. Remove. - { - let elf = Elf::load(&elf_bytes).expect("ELF load"); - eprintln!( - "ACTUAL_SUB_DECODE_COMMITMENT_BLOWUP_2 = {:02x?}", - commitment_from_elf(&elf, &options).expect("decode commitment") - ); - } - // Pass the OFFLINE-COMPUTED const directly — mimics the recursion guest's // workflow where the value lives in the caller's compiled binary. let result = verify_with_options( From a1afb67ce9f65638196df5d08e96a8548957ce60 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Fri, 26 Jun 2026 10:54:35 -0300 Subject: [PATCH 10/19] mplement changes in GPU --- crypto/math-cuda/kernels/keccak.cu | 45 ++++++++ crypto/math-cuda/src/device.rs | 3 + crypto/math-cuda/src/lde.rs | 113 ++++++++++++++------ crypto/math-cuda/src/merkle.rs | 133 ++++++++++++++++++++++++ crypto/math-cuda/tests/keccak_leaves.rs | 81 +++++++++++++++ crypto/stark/src/gpu_lde.rs | 24 +++-- 6 files changed, 361 insertions(+), 38 deletions(-) diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu index c22bc4d05..272b847dc 100644 --- a/crypto/math-cuda/kernels/keccak.cu +++ b/crypto/math-cuda/kernels/keccak.cu @@ -181,6 +181,51 @@ extern "C" __global__ void keccak256_leaves_base_batched( finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); } +// --------------------------------------------------------------------------- +// Goldilocks BASE-FIELD row-pair leaf hashing. +// +// Leaf `leaf_idx` hashes TWO consecutive bit-reversed rows +// br_0 = reverse_index(2*leaf_idx), br_1 = reverse_index(2*leaf_idx + 1) +// each written column-by-column in canonical BE (same per-row byte layout as +// `keccak256_leaves_base_batched`), in (br_0 row: col 0..K-1) then (br_1 row: +// col 0..K-1) order. `num_leaves = num_rows / 2`; writes 32 bytes to +// `hashed_leaves_out[leaf_idx * 32 ..]`. Matches the CPU +// `keccak_leaves_row_pair_bit_reversed` (rows_per_leaf = 2) — the base-field +// analog of `keccak_comp_poly_leaves_ext3`. +// --------------------------------------------------------------------------- +extern "C" __global__ void keccak256_leaves_base_row_pair_batched( + const uint64_t *columns_base_ptr, + uint64_t col_stride, + uint64_t num_cols, + uint64_t num_rows, + uint64_t log_num_rows, + uint8_t *hashed_leaves_out) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint64_t num_leaves = num_rows >> 1; + if (tid >= num_leaves) return; + + uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows); + uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows); + + uint64_t st[25]; + #pragma unroll + for (int i = 0; i < 25; ++i) st[i] = 0; + + uint32_t rate_pos = 0; + // First row (br_0): col 0..K-1. + for (uint64_t c = 0; c < num_cols; ++c) { + uint64_t v = columns_base_ptr[c * col_stride + br_0]; + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v))); + } + // Second row (br_1): col 0..K-1. + for (uint64_t c = 0; c < num_cols; ++c) { + uint64_t v = columns_base_ptr[c * col_stride + br_1]; + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v))); + } + + finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); +} + // --------------------------------------------------------------------------- // Goldilocks EXT3 leaf hashing (3 base-field components per ext3 element). // diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 17e2f9f82..d2a98652b 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -143,6 +143,7 @@ pub struct Backend { // keccak.ptx pub keccak256_leaves_base_batched: CudaFunction, + pub keccak256_leaves_base_row_pair_batched: CudaFunction, pub keccak256_leaves_ext3_batched: CudaFunction, pub keccak_comp_poly_leaves_ext3: CudaFunction, pub keccak_fri_leaves_ext3: CudaFunction, @@ -238,6 +239,8 @@ impl Backend { pointwise_mul_batched: ntt.load_function("pointwise_mul_batched")?, scalar_mul_batched: ntt.load_function("scalar_mul_batched")?, keccak256_leaves_base_batched: keccak.load_function("keccak256_leaves_base_batched")?, + keccak256_leaves_base_row_pair_batched: keccak + .load_function("keccak256_leaves_base_row_pair_batched")?, keccak256_leaves_ext3_batched: keccak.load_function("keccak256_leaves_ext3_batched")?, keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?, keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?, diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index ee5dc3fce..59e01245f 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -16,7 +16,10 @@ use cudarc::driver::{CudaSlice, CudaStream, LaunchConfig, PushKernelArg}; use crate::Result; use crate::device::{Backend, backend}; -use crate::merkle::{keccak_launch_cfg, launch_keccak_base, launch_keccak_ext3}; +use crate::merkle::{ + keccak_launch_cfg, launch_keccak_base, launch_keccak_base_row_pair, launch_keccak_ext3, + launch_keccak_ext3_row_pair, +}; use crate::ntt::run_ntt_body; /// Goldilocks `TWO_ADICITY = 32` puts the theoretical domain ceiling at @@ -43,17 +46,17 @@ enum KeccakCommit { } impl KeccakCommit { - fn total_nodes_bytes(self, lde_size: usize) -> usize { + fn total_nodes_bytes(self, num_leaves: usize) -> usize { match self { - KeccakCommit::LeavesOnly => lde_size * 32, - KeccakCommit::FullTree => (2 * lde_size - 1) * 32, + KeccakCommit::LeavesOnly => num_leaves * 32, + KeccakCommit::FullTree => (2 * num_leaves - 1) * 32, } } - fn leaves_offset_bytes(self, lde_size: usize) -> usize { + fn leaves_offset_bytes(self, num_leaves: usize) -> usize { match self { KeccakCommit::LeavesOnly => 0, - KeccakCommit::FullTree => (lde_size - 1) * 32, + KeccakCommit::FullTree => (num_leaves - 1) * 32, } } } @@ -613,6 +616,7 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + 1, ) .map(|_| ()) } @@ -640,6 +644,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + 1, ) .map(|_| ()) } @@ -662,6 +667,9 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep( merkle_nodes_out, KeccakCommit::FullTree, true, + // Trace commit: one leaf per bit-reversed row pair, matching the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and `verify_opening_pair`. + 2, )?; let handle = opt.expect("keep_device_buf=true must return Some"); Ok(handle) @@ -675,6 +683,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1), + // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit. + rows_per_leaf: usize, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -698,7 +709,13 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( for o in outputs.iter() { assert_eq!(o.len(), lde_size); } - let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); + assert!( + rows_per_leaf == 1 || rows_per_leaf == 2, + "rows_per_leaf must be 1 or 2" + ); + assert_eq!(lde_size % rows_per_leaf, 0); + let num_leaves = lde_size / rows_per_leaf; + let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves); assert_eq!(nodes_out.len(), nodes_dev_bytes); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -787,22 +804,33 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( // written before any reader sees it: the keccak kernel fills the // leaves slab, the inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; - let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size); + let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); { let mut leaves_view = - nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32); - launch_keccak_base( - stream.as_ref(), - &buf, - col_stride_u64, - m as u64, - lde_u64, - &mut leaves_view, - )?; + nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32); + if rows_per_leaf == 2 { + launch_keccak_base_row_pair( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } else { + launch_keccak_base( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } } if commit == KeccakCommit::FullTree { - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; } // D2H the LDE and the tree/leaves nodes via pinned staging. @@ -848,6 +876,7 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + 1, ) .map(|_| ()) } @@ -872,6 +901,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + 1, ) .map(|_| ()) } @@ -896,6 +926,9 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep( merkle_nodes_out, KeccakCommit::FullTree, true, + // Trace commit: one leaf per bit-reversed row pair, matching the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and `verify_opening_pair`. + 2, )?; Ok(opt.expect("keep_device_buf=true must return Some")) } @@ -910,6 +943,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1), + // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit. + rows_per_leaf: usize, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -935,7 +971,13 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( for o in outputs.iter() { assert_eq!(o.len(), 3 * lde_size); } - let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); + assert!( + rows_per_leaf == 1 || rows_per_leaf == 2, + "rows_per_leaf must be 1 or 2" + ); + assert_eq!(lde_size % rows_per_leaf, 0); + let num_leaves = lde_size / rows_per_leaf; + let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves); assert_eq!(nodes_out.len(), nodes_dev_bytes); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -1016,22 +1058,33 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the // inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; - let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size); + let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); { let mut leaves_view = - nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32); - launch_keccak_ext3( - stream.as_ref(), - &buf, - col_stride_u64, - m as u64, - lde_u64, - &mut leaves_view, - )?; + nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32); + if rows_per_leaf == 2 { + launch_keccak_ext3_row_pair( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } else { + launch_keccak_ext3( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } } if commit == KeccakCommit::FullTree { - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; } // D2H LDE (mb * lde_size u64) and tree/leaves nodes. diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 932e81325..f9ca74706 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -94,6 +94,79 @@ pub fn keccak_leaves_ext3( Ok(out) } +/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1`. Returns `num_rows/2 * 32` hash bytes. Parity entry point for the +/// CPU `keccak_leaves_row_pair_bit_reversed` over base columns (the layout the +/// trace commit uses). +pub fn keccak_leaves_base_row_pair( + columns: &[u64], + col_stride: usize, + num_cols: usize, + num_rows: usize, +) -> Result> { + assert!(num_rows.is_power_of_two()); + assert!(num_rows >= 2); + assert!( + col_stride >= num_rows, + "col_stride must be >= num_rows to keep per-column reads in-bounds" + ); + let total = num_cols + .checked_mul(col_stride) + .expect("num_cols * col_stride overflows usize"); + assert!(columns.len() >= total); + let be = backend()?; + let stream = be.next_stream(); + let cols_dev = stream.clone_htod(&columns[..total])?; + let mut out_dev = stream.alloc_zeros::((num_rows / 2) * 32)?; + launch_keccak_base_row_pair( + stream.as_ref(), + &cols_dev, + col_stride as u64, + num_cols as u64, + num_rows as u64, + &mut out_dev.as_view_mut(), + )?; + let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Row-pair ext3 variant. Returns `num_rows/2 * 32` hash bytes. Parity entry +/// point for the CPU `keccak_leaves_row_pair_bit_reversed` over ext3 columns. +pub fn keccak_leaves_ext3_row_pair( + columns: &[u64], + col_stride: usize, + num_cols: usize, + num_rows: usize, +) -> Result> { + assert!(num_rows.is_power_of_two()); + assert!(num_rows >= 2); + assert!( + col_stride >= num_rows, + "col_stride must be >= num_rows to keep per-column reads in-bounds" + ); + let total = num_cols + .checked_mul(3) + .and_then(|v| v.checked_mul(col_stride)) + .expect("num_cols * 3 * col_stride overflows usize"); + assert!(columns.len() >= total); + let be = backend()?; + let stream = be.next_stream(); + let cols_dev = stream.clone_htod(&columns[..total])?; + let mut out_dev = stream.alloc_zeros::((num_rows / 2) * 32)?; + launch_keccak_ext3_row_pair( + stream.as_ref(), + &cols_dev, + col_stride as u64, + num_cols as u64, + num_rows as u64, + &mut out_dev.as_view_mut(), + )?; + let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; + Ok(out) +} + /// Block size for Keccak kernels. Per-thread register footprint is ~60 regs /// (25-lane state + auxiliaries). The default 256 threads/block pushes the /// block register file past the hardware limit on sm_120 (Blackwell). 128 @@ -170,6 +243,66 @@ pub(crate) fn launch_keccak_base( Ok(()) } +/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1` (one Merkle path per FRI query). Writes `num_rows/2` leaves of 32 +/// bytes into `out_dev`. Base-field analog of the comp-poly ext3 path; matches +/// the CPU `keccak_leaves_row_pair_bit_reversed`. +pub(crate) fn launch_keccak_base_row_pair( + stream: &CudaStream, + cols_dev: &CudaSlice, + col_stride: u64, + num_cols: u64, + num_rows: u64, + out_dev: &mut CudaViewMut<'_, u8>, +) -> Result<()> { + debug_assert!(num_rows >= 2, "keccak row-pair leaf kernel: num_rows must be >= 2"); + let be = backend()?; + let log_num_rows = num_rows.trailing_zeros() as u64; + // One thread per leaf (= row pair). + let cfg = keccak_launch_cfg(num_rows >> 1); + unsafe { + stream + .launch_builder(&be.keccak256_leaves_base_row_pair_batched) + .arg(cols_dev) + .arg(&col_stride) + .arg(&num_cols) + .arg(&num_rows) + .arg(&log_num_rows) + .arg(out_dev) + .launch(cfg)?; + } + Ok(()) +} + +/// Row-pair ext3 leaf hashing for the aux trace: reuses the comp-poly kernel +/// (`keccak_comp_poly_leaves_ext3`), which hashes bit-reversed rows `2i`, `2i+1` +/// across all ext3 columns. Writes `num_rows/2` leaves of 32 bytes. +pub(crate) fn launch_keccak_ext3_row_pair( + stream: &CudaStream, + cols_dev: &CudaSlice, + col_stride: u64, + num_cols: u64, + num_rows: u64, + out_dev: &mut CudaViewMut<'_, u8>, +) -> Result<()> { + debug_assert!(num_rows >= 2, "keccak row-pair leaf kernel: num_rows must be >= 2"); + let be = backend()?; + let log_num_rows = num_rows.trailing_zeros() as u64; + let cfg = keccak_launch_cfg(num_rows >> 1); + unsafe { + stream + .launch_builder(&be.keccak_comp_poly_leaves_ext3) + .arg(cols_dev) + .arg(&col_stride) + .arg(&num_cols) + .arg(&num_rows) + .arg(&log_num_rows) + .arg(out_dev) + .launch(cfg)?; + } + Ok(()) +} + /// Given `hashed_leaves` of length `leaves_len * 32`, build the full Merkle /// tree on device and return the complete node buffer `(2*leaves_len - 1) * /// 32` bytes in the standard layout: diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs index d614e233d..526f47ac3 100644 --- a/crypto/math-cuda/tests/keccak_leaves.rs +++ b/crypto/math-cuda/tests/keccak_leaves.rs @@ -97,6 +97,87 @@ fn keccak_leaves_ext3_matches_cpu() { } } +#[test] +fn keccak_leaves_base_row_pair_matches_cpu() { + // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`. + // GPU `keccak_leaves_base_row_pair` must match the CPU prover helper + // `keccak_leaves_row_pair_bit_reversed` over base columns. + for log_n in [4u32, 6, 8, 10, 12] { + for num_cols in [1usize, 5, 17, 41] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(500 + log_n as u64 + num_cols as u64); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| Fp::from_raw(rng.r#gen::())).collect()) + .collect(); + + let cpu = keccak_leaves_row_pair_bit_reversed(&columns); + assert_eq!(cpu.len(), n / 2); + + let mut flat = vec![0u64; num_cols * n]; + for (c, col) in columns.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + flat[c * n + r] = *e.value(); + } + } + let gpu = math_cuda::merkle::keccak_leaves_base_row_pair(&flat, n, num_cols, n).unwrap(); + assert_eq!(gpu.len(), (n / 2) * 32); + for i in 0..n / 2 { + assert_eq!( + &gpu[i * 32..(i + 1) * 32], + &cpu[i][..], + "base row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})" + ); + } + } + } +} + +#[test] +fn keccak_leaves_ext3_row_pair_matches_cpu() { + for log_n in [4u32, 6, 8, 10] { + for num_cols in [1usize, 3, 11, 20] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(600 + log_n as u64 + num_cols as u64); + let columns: Vec> = (0..num_cols) + .map(|_| { + (0..n) + .map(|_| { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) + }) + .collect() + }) + .collect(); + + let cpu = keccak_leaves_row_pair_bit_reversed(&columns); + assert_eq!(cpu.len(), n / 2); + + // De-interleaved 3-slab layout per ext3 column (same as the 1-row + // ext3 leaf path): [col*3+k] each a contiguous slab of n u64s. + let mut flat = vec![0u64; num_cols * 3 * n]; + for (c, col) in columns.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + flat[(c * 3) * n + r] = *e.value()[0].value(); + flat[(c * 3 + 1) * n + r] = *e.value()[1].value(); + flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); + } + } + let gpu = math_cuda::merkle::keccak_leaves_ext3_row_pair(&flat, n, num_cols, n).unwrap(); + assert_eq!(gpu.len(), (n / 2) * 32); + for i in 0..n / 2 { + assert_eq!( + &gpu[i * 32..(i + 1) * 32], + &cpu[i][..], + "ext3 row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})" + ); + } + } + } +} + #[test] fn keccak_comp_poly_leaves_matches_cpu() { // Built tree's leaves live at byte offset `(num_leaves - 1) * 32` and diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index be6d5ccfe..5f09cd3b2 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -269,17 +269,17 @@ fn restore_columns_on_err(columns: &mut [Vec>], n: u } } -/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves +/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `num_leaves` leaves /// and return the node `Vec` (length-initialised, contents undefined) together -/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if -/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows +/// with its node count `total_nodes` (`2 * num_leaves - 1`). Returns `None` if +/// the layout would be invalid (`num_leaves < 2` or `total_nodes * 32` overflows /// `usize`). The caller builds the `&mut [u8]` byte view of length /// `total_nodes * 32` and must overwrite every byte via the GPU D2H. -fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> { - if lde_size < 2 { +fn alloc_merkle_nodes(num_leaves: usize) -> Option<(Vec<[u8; 32]>, usize)> { + if num_leaves < 2 { return None; } - let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?; + let total_nodes = 2usize.saturating_mul(num_leaves).checked_sub(1)?; let _byte_len = total_nodes.checked_mul(32)?; let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes); // SAFETY: every byte will be overwritten via the GPU D2H before the @@ -471,7 +471,11 @@ where LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; let num_columns = columns.len(); - let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?; + // Row-pair trace commit: one Merkle leaf per bit-reversed row pair, so the + // tree has `lde_size / ROWS_PER_LEAF` leaves (matches the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF)` and `verify_opening_pair`). + let num_leaves = lde_size / crate::commitment::ROWS_PER_LEAF; + let (mut nodes, total_nodes) = alloc_merkle_nodes(num_leaves)?; let node_byte_len = total_nodes .checked_mul(32) .expect("node byte length overflow"); @@ -528,7 +532,11 @@ where LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; let num_columns = columns.len(); - let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?; + // Row-pair trace commit: one Merkle leaf per bit-reversed row pair, so the + // tree has `lde_size / ROWS_PER_LEAF` leaves (matches the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF)` and `verify_opening_pair`). + let num_leaves = lde_size / crate::commitment::ROWS_PER_LEAF; + let (mut nodes, total_nodes) = alloc_merkle_nodes(num_leaves)?; let node_byte_len = total_nodes .checked_mul(32) .expect("node byte length overflow"); From 7446a9b1b1ee4e5ba8df69ed7050f1216af11887 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Fri, 26 Jun 2026 11:09:44 -0300 Subject: [PATCH 11/19] refactor --- crypto/math-cuda/src/merkle.rs | 104 ++++++------------------ crypto/math-cuda/tests/keccak_leaves.rs | 12 +-- 2 files changed, 30 insertions(+), 86 deletions(-) diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index f9ca74706..5e7e4dae5 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -25,15 +25,19 @@ use crate::lde::pack_ext3_to_pinned_slabs; /// Run GPU Keccak-256 leaf hashing on a base-field column buffer. /// /// `columns` must hold `num_cols * col_stride` u64s with column `c`'s data -/// at `[c*col_stride .. c*col_stride + num_rows]`. Returns `num_rows * 32` -/// hash bytes in natural (non-bit-reversed) row order. +/// at `[c*col_stride .. c*col_stride + num_rows]`. `rows_per_leaf` selects the +/// leaf layout: `1` = one leaf per bit-reversed row (`num_rows` leaves), `2` = +/// one leaf per bit-reversed row pair `2i`,`2i+1` (`num_rows/2` leaves, the +/// trace-commit layout). Returns `(num_rows / rows_per_leaf) * 32` hash bytes. pub fn keccak_leaves_base( columns: &[u64], col_stride: usize, num_cols: usize, num_rows: usize, + rows_per_leaf: usize, ) -> Result> { assert!(num_rows.is_power_of_two()); + assert!(rows_per_leaf == 1 || rows_per_leaf == 2); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -45,8 +49,13 @@ pub fn keccak_leaves_base( let be = backend()?; let stream = be.next_stream(); let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::(num_rows * 32)?; - launch_keccak_base( + let mut out_dev = stream.alloc_zeros::((num_rows / rows_per_leaf) * 32)?; + let launch = if rows_per_leaf == 2 { + launch_keccak_base_row_pair + } else { + launch_keccak_base + }; + launch( stream.as_ref(), &cols_dev, col_stride as u64, @@ -60,14 +69,17 @@ pub fn keccak_leaves_base( } /// Ext3 variant. Columns interleaved as three base slabs per ext3 column. -/// `columns.len() >= num_cols * 3 * col_stride`. +/// `columns.len() >= num_cols * 3 * col_stride`. `rows_per_leaf` as in +/// [`keccak_leaves_base`]. pub fn keccak_leaves_ext3( columns: &[u64], col_stride: usize, num_cols: usize, num_rows: usize, + rows_per_leaf: usize, ) -> Result> { assert!(num_rows.is_power_of_two()); + assert!(rows_per_leaf == 1 || rows_per_leaf == 2); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -80,81 +92,13 @@ pub fn keccak_leaves_ext3( let be = backend()?; let stream = be.next_stream(); let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::(num_rows * 32)?; - launch_keccak_ext3( - stream.as_ref(), - &cols_dev, - col_stride as u64, - num_cols as u64, - num_rows as u64, - &mut out_dev.as_view_mut(), - )?; - let out = stream.clone_dtoh(&out_dev)?; - stream.synchronize()?; - Ok(out) -} - -/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`, -/// `2i+1`. Returns `num_rows/2 * 32` hash bytes. Parity entry point for the -/// CPU `keccak_leaves_row_pair_bit_reversed` over base columns (the layout the -/// trace commit uses). -pub fn keccak_leaves_base_row_pair( - columns: &[u64], - col_stride: usize, - num_cols: usize, - num_rows: usize, -) -> Result> { - assert!(num_rows.is_power_of_two()); - assert!(num_rows >= 2); - assert!( - col_stride >= num_rows, - "col_stride must be >= num_rows to keep per-column reads in-bounds" - ); - let total = num_cols - .checked_mul(col_stride) - .expect("num_cols * col_stride overflows usize"); - assert!(columns.len() >= total); - let be = backend()?; - let stream = be.next_stream(); - let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::((num_rows / 2) * 32)?; - launch_keccak_base_row_pair( - stream.as_ref(), - &cols_dev, - col_stride as u64, - num_cols as u64, - num_rows as u64, - &mut out_dev.as_view_mut(), - )?; - let out = stream.clone_dtoh(&out_dev)?; - stream.synchronize()?; - Ok(out) -} - -/// Row-pair ext3 variant. Returns `num_rows/2 * 32` hash bytes. Parity entry -/// point for the CPU `keccak_leaves_row_pair_bit_reversed` over ext3 columns. -pub fn keccak_leaves_ext3_row_pair( - columns: &[u64], - col_stride: usize, - num_cols: usize, - num_rows: usize, -) -> Result> { - assert!(num_rows.is_power_of_two()); - assert!(num_rows >= 2); - assert!( - col_stride >= num_rows, - "col_stride must be >= num_rows to keep per-column reads in-bounds" - ); - let total = num_cols - .checked_mul(3) - .and_then(|v| v.checked_mul(col_stride)) - .expect("num_cols * 3 * col_stride overflows usize"); - assert!(columns.len() >= total); - let be = backend()?; - let stream = be.next_stream(); - let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::((num_rows / 2) * 32)?; - launch_keccak_ext3_row_pair( + let mut out_dev = stream.alloc_zeros::((num_rows / rows_per_leaf) * 32)?; + let launch = if rows_per_leaf == 2 { + launch_keccak_ext3_row_pair + } else { + launch_keccak_ext3 + }; + launch( stream.as_ref(), &cols_dev, col_stride as u64, diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs index 526f47ac3..61a861f32 100644 --- a/crypto/math-cuda/tests/keccak_leaves.rs +++ b/crypto/math-cuda/tests/keccak_leaves.rs @@ -38,7 +38,7 @@ fn keccak_leaves_base_matches_cpu() { flat[c * n + r] = *e.value(); } } - let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 1).unwrap(); assert_eq!(gpu.len(), n * 32); for i in 0..n { assert_eq!( @@ -84,7 +84,7 @@ fn keccak_leaves_ext3_matches_cpu() { flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); } } - let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 1).unwrap(); assert_eq!(gpu.len(), n * 32); for i in 0..n { assert_eq!( @@ -100,8 +100,8 @@ fn keccak_leaves_ext3_matches_cpu() { #[test] fn keccak_leaves_base_row_pair_matches_cpu() { // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`. - // GPU `keccak_leaves_base_row_pair` must match the CPU prover helper - // `keccak_leaves_row_pair_bit_reversed` over base columns. + // GPU `keccak_leaves_base(.., rows_per_leaf=2)` must match the CPU prover + // helper `keccak_leaves_row_pair_bit_reversed` over base columns. for log_n in [4u32, 6, 8, 10, 12] { for num_cols in [1usize, 5, 17, 41] { let n = 1 << log_n; @@ -119,7 +119,7 @@ fn keccak_leaves_base_row_pair_matches_cpu() { flat[c * n + r] = *e.value(); } } - let gpu = math_cuda::merkle::keccak_leaves_base_row_pair(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 2).unwrap(); assert_eq!(gpu.len(), (n / 2) * 32); for i in 0..n / 2 { assert_eq!( @@ -165,7 +165,7 @@ fn keccak_leaves_ext3_row_pair_matches_cpu() { flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); } } - let gpu = math_cuda::merkle::keccak_leaves_ext3_row_pair(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 2).unwrap(); assert_eq!(gpu.len(), (n / 2) * 32); for i in 0..n / 2 { assert_eq!( From ade38c1570439a9319540b120ba82ae3ee0554b1 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 26 Jun 2026 15:29:20 -0300 Subject: [PATCH 12/19] Fix CUDA LDE clippy lint --- AGENTS.md | 4 ++++ crypto/math-cuda/src/lde.rs | 1 + 2 files changed, 5 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..767d84610 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,4 @@ +# Agent Rules + +- Before pushing code or conflict-resolution commits to a PR branch, run `make lint` from the repository root. `cargo fmt --check` is not a substitute because CI's `Lint` job runs the full `make lint` target. +- If `make lint` cannot complete, do not push unless the user explicitly accepts that risk, and report the incomplete lint result. diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 59e01245f..8dc05ba01 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -675,6 +675,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep( Ok(handle) } +#[allow(clippy::too_many_arguments)] fn coset_lde_batch_base_into_with_merkle_tree_inner( columns: &[&[u64]], blowup_factor: usize, From 316dda9f6f9772c71f4783f776db5f2c8b809daf Mon Sep 17 00:00:00 2001 From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com> Date: Fri, 26 Jun 2026 16:26:38 -0300 Subject: [PATCH 13/19] fix(cuda): align review cleanup with row-pair commits (#723) --- crypto/math-cuda/kernels/keccak.cu | 4 +-- crypto/math-cuda/src/lde.rs | 45 ++++++++++++------------ crypto/math-cuda/src/merkle.rs | 18 +++++++++- crypto/math-cuda/tests/comp_poly_tree.rs | 4 +-- 4 files changed, 44 insertions(+), 27 deletions(-) diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu index 272b847dc..32dd75d47 100644 --- a/crypto/math-cuda/kernels/keccak.cu +++ b/crypto/math-cuda/kernels/keccak.cu @@ -159,8 +159,8 @@ extern "C" __global__ void keccak256_leaves_base_batched( uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; if (tid >= num_rows) return; - // Bit-reverse the row index so we read columns at `br` but write the - // hashed leaf at `tid` — matching the CPU `commit_columns_bit_reversed`. + // Bit-reverse the row index so we read columns at `br` but write the hashed + // leaf at `tid` — matching the CPU per-row `commit_bit_reversed(.., 1)`. uint64_t br = __brevll(tid) >> (64 - log_num_rows); uint64_t st[25]; diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 8dc05ba01..c3f59fd90 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -37,11 +37,11 @@ fn assert_u32_domain(n: usize, what: &str) { /// Output shape requested from the fused LDE + Keccak entry points. #[derive(Copy, Clone, PartialEq, Eq)] enum KeccakCommit { - /// Only the `lde_size` keccak-256 leaves; no inner-tree build. Caller - /// receives `lde_size * 32` bytes. + /// Only the keccak-256 leaves; no inner-tree build. Caller receives + /// `num_leaves * 32` bytes. LeavesOnly, /// Full Merkle tree: leaves at the tail + inner nodes built on-device. - /// Caller receives `(2*lde_size - 1) * 32` bytes. + /// Caller receives `(2*num_leaves - 1) * 32` bytes. FullTree, } @@ -595,12 +595,12 @@ pub fn coset_lde_batch_base_into( Ok(()) } -/// Fused LDE + Keccak-256 leaf hashing. Caller receives the `lde_size * 32` -/// bytes of leaf hashes in `hashed_leaves_out` (one 32-byte digest per output -/// row, in natural row order; leaves are computed reading columns at -/// bit-reversed rows, matching `commit_columns_bit_reversed` on the CPU -/// side). Thin wrapper over `coset_lde_batch_base_into_with_merkle_tree_inner` -/// with `LeavesOnly` — no inner-tree build, no device handle. +/// Fused LDE + row-pair Keccak-256 leaf hashing. Caller receives +/// `(lde_size / 2) * 32` bytes of leaf hashes in `hashed_leaves_out` (one +/// 32-byte digest per bit-reversed row pair, in natural leaf order, matching +/// `commit_bit_reversed(.., 2)` on the CPU side). Thin wrapper over +/// `coset_lde_batch_base_into_with_merkle_tree_inner` with `LeavesOnly` — no +/// inner-tree build, no device handle. pub fn coset_lde_batch_base_into_with_leaf_hash( columns: &[&[u64]], blowup_factor: usize, @@ -616,14 +616,15 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, - 1, + 2, ) .map(|_| ()) } /// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full -/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back -/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`). +/// row-pair Merkle tree on device and returns the `2*(lde_size/2) - 1` node +/// buffer back to the caller in `merkle_nodes_out` (byte length +/// `(2*(lde_size/2) - 1) * 32`). /// /// The leaf hashes are never exposed to the caller — they stay on device and /// feed straight into the pair-hash tree kernel, avoiding the @@ -644,7 +645,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, - 1, + 2, ) .map(|_| ()) } @@ -799,8 +800,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( )?; // Allocate the device output buffer. In `LeavesOnly` mode this is just - // `lde_size * 32` bytes (the leaves themselves); in `FullTree` mode it's - // `(2*lde_size - 1) * 32` bytes (leaves in the tail + inner nodes filled + // `num_leaves * 32` bytes (the leaves themselves); in `FullTree` mode it's + // `(2*num_leaves - 1) * 32` bytes (leaves in the tail + inner nodes filled // below). `alloc` (not `alloc_zeros`) is safe because every byte is // written before any reader sees it: the keccak kernel fills the // leaves slab, the inner-tree pass (when present) fills the head. @@ -857,8 +858,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( } } -/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused -/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over +/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused LDE + +/// row-pair Keccak-256 leaf hashing over ext3 columns. Thin wrapper over /// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`. pub fn coset_lde_batch_ext3_into_with_leaf_hash( columns: &[&[u64]], @@ -877,14 +878,14 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, - 1, + 2, ) .map(|_| ()) } /// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`. /// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE -/// evaluations and the full `2*lde_size - 1` node buffer. +/// evaluations and the full `2*(lde_size/2) - 1` row-pair node buffer. pub fn coset_lde_batch_ext3_into_with_merkle_tree( columns: &[&[u64]], n: usize, @@ -902,7 +903,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, - 1, + 2, ) .map(|_| ()) } @@ -1055,8 +1056,8 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( mb_u32, )?; - // Allocate device output buffer (LeavesOnly → lde_size*32; FullTree → - // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the + // Allocate device output buffer (LeavesOnly -> num_leaves*32; FullTree -> + // (2*num_leaves - 1)*32). Leaf kernel writes to the leaves slab; the // inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index ef84c6da4..27f38ce0a 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -3,7 +3,7 @@ //! Matches `FieldElementVectorBackend::hash_data` in //! `crypto/crypto/src/merkle_tree/backends/field_element_vector.rs`, combined //! with the `reverse_index` row read pattern used in -//! `commit_columns_bit_reversed` at `crypto/stark/src/prover.rs`. +//! `commit_bit_reversed` at `crypto/stark/src/commitment.rs`. //! //! Caller supplies base-field column slabs already laid out as //! `[col * col_stride + row]` (the same layout `coset_lde_batch_base_into` @@ -38,6 +38,14 @@ pub fn keccak_leaves_base( ) -> Result> { assert!(num_rows.is_power_of_two()); assert!(rows_per_leaf == 1 || rows_per_leaf == 2); + assert!( + num_rows >= rows_per_leaf, + "num_rows must be at least rows_per_leaf" + ); + assert!( + num_rows >= 2, + "num_rows must be at least 2 for bit-reversed GPU leaf hashing" + ); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -80,6 +88,14 @@ pub fn keccak_leaves_ext3( ) -> Result> { assert!(num_rows.is_power_of_two()); assert!(rows_per_leaf == 1 || rows_per_leaf == 2); + assert!( + num_rows >= rows_per_leaf, + "num_rows must be at least rows_per_leaf" + ); + assert!( + num_rows >= 2, + "num_rows must be at least 2 for bit-reversed GPU leaf hashing" + ); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs index 29e33b6fe..51b826dd1 100644 --- a/crypto/math-cuda/tests/comp_poly_tree.rs +++ b/crypto/math-cuda/tests/comp_poly_tree.rs @@ -1,6 +1,6 @@ //! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree` //! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU -//! pipeline produced by `commit_composition_polynomial`. +//! row-pair commitment layout used by `commit_bit_reversed(.., 2)`. use math::field::element::FieldElement; use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; @@ -83,7 +83,7 @@ fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { out } -/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves. +/// CPU: `commit_bit_reversed(.., 2)`-style tree root over num_rows/2 leaves. fn cpu_tree_nodes(parts: &[Vec]) -> Vec<[u8; 32]> { let num_rows = parts[0].len(); let num_parts = parts.len(); From 8ddadf48e28fbc1d94c9301e3d7283dbf8a3ccca Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 29 Jun 2026 13:03:12 -0300 Subject: [PATCH 14/19] fix(stark): silence dead_code on par_for_each_mut (debug-checks-only after merge) --- crypto/stark/src/par.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs index 051651fde..cee693e3f 100644 --- a/crypto/stark/src/par.rs +++ b/crypto/stark/src/par.rs @@ -58,6 +58,9 @@ pub(crate) fn par_map_collect( /// Run `f(&mut item)` for each element of `slice`. Parallel when /// `feature = "parallel"`, sequential otherwise (ordering is irrelevant). +// Only called from the `debug-checks`-gated column-LDE reconstruct path +// (production LDE is row-major); keep it available without warning otherwise. +#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))] pub(crate) fn par_for_each_mut(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) { #[cfg(feature = "parallel")] { From 9655852ae32e244852b3c2fc00a50f5ff05b9351 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 29 Jun 2026 13:04:58 -0300 Subject: [PATCH 15/19] style(stark): cargo fmt after merge resolution --- crypto/stark/src/prover.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 189d835ae..a3c64c4fb 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -509,7 +509,6 @@ fn columns_to_row_major( (data, num_cols) } - /// The functionality of a STARK prover providing methods to run the STARK Prove protocol /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html /// The default implementation is complete and is compatible with Stone prover @@ -615,7 +614,6 @@ pub trait IsStarkProver< Some((tree, root)) } - /// Compute the LDE commitment for a subset of columns from a trace (for testing). /// /// This helper computes the same commitment the prover generates internally, @@ -2120,8 +2118,9 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let t_sub = Instant::now(); #[allow(unused_mut)] - let (mut tree, root) = Self::commit_rows_bit_reversed(&aux_data, total_cols) - .ok_or(ProvingError::EmptyCommitment)?; + let (mut tree, root) = + Self::commit_rows_bit_reversed(&aux_data, total_cols) + .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "disk-spill")] Self::spill_tree(&mut tree, storage_mode, "aux Merkle tree")?; let commit = TableCommit::plain(tree, root); From 04e1faed699388981d3d95fb5272e420e0622fcb Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 29 Jun 2026 13:10:28 -0300 Subject: [PATCH 16/19] test(cuda): focused GPU row-pair commitment prove+verify test --- prover/tests/cuda_path_integration.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs index 0f7c1f3c7..583645379 100644 --- a/prover/tests/cuda_path_integration.rs +++ b/prover/tests/cuda_path_integration.rs @@ -66,3 +66,24 @@ fn gpu_path_fires_end_to_end() { let ok = verify(&proof, &elf).expect("verify"); assert!(ok, "GPU-produced proof failed verification"); } + +/// Focused validation of the GPU row-pair trace commitment: proves a large +/// trace with the GPU path and verifies the resulting proof. Independent of the +/// per-round counter assertions in `gpu_path_fires_end_to_end` (the R2 parts-LDE +/// assertion bit-rotted on main and cuts off before the verify). A wrong GPU +/// trace-commit leaf layout (1-row vs the new row-pair) would fail verification. +#[test] +#[ignore = "requires GPU; run with --ignored --nocapture"] +fn gpu_proof_verifies_row_pair_commitment() { + let elf = asm_elf_bytes("fib_iterative_1M"); + reset_all_gpu_call_counters(); + let proof = prove(&elf).expect("prove"); + assert!( + gpu_lde_calls() > 0, + "GPU LDE path did not fire (silent CPU fallback would not test the GPU commit)" + ); + assert!( + verify(&proof, &elf).expect("verify"), + "GPU-produced proof (row-pair commitment) failed verification" + ); +} From 05bb11ac4797fefae9f22dbd6ec9b7fdffbfc525 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 29 Jun 2026 13:15:44 -0300 Subject: [PATCH 17/19] fix(cuda): drop stale R2 parts-LDE asserts (#700 fused path), silence GPU column-LDE dead_code The R2 parts-LDE / comp-poly-tree GPU dispatches no longer fire since #699/#700 route degree-3 tables through the 2-part fused coset_lde_full path (no AIR has number_of_parts > 2). try_expand_columns_batched* are debug-checks-only after the #650 row-major LDE became production. --- crypto/stark/src/gpu_lde.rs | 2 ++ prover/tests/cuda_path_integration.rs | 19 +++++++------------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 5f09cd3b2..5b9ad09f7 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -303,6 +303,7 @@ fn alloc_merkle_nodes(num_leaves: usize) -> Option<(Vec<[u8; 32]>, usize)> { /// Returns `Some(())` if the batch was handled on GPU and `columns` now holds /// the LDE evaluations, or if there were no columns to expand. Returns `None` /// to let the caller run the per-column CPU fallback. +#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))] pub(crate) fn try_expand_columns_batched( columns: &mut [Vec>], blowup_factor: usize, @@ -582,6 +583,7 @@ where /// transform uses only base-field twiddles and coset weights, which act /// componentwise on ext3, so the per-component result equals the ext3 LDE the /// CPU path computes. +#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))] fn try_expand_columns_batched_ext3( columns: &mut [Vec>], blowup_factor: usize, diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs index 583645379..0c0bb00c2 100644 --- a/prover/tests/cuda_path_integration.rs +++ b/prover/tests/cuda_path_integration.rs @@ -11,8 +11,8 @@ use lambda_vm_prover::test_utils::asm_elf_bytes; use lambda_vm_prover::{prove, verify}; use stark::gpu_lde::{ - gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls, - gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls, reset_all_gpu_call_counters, + gpu_bary_calls, gpu_batch_invert_calls, gpu_deep_calls, gpu_fri_calls, gpu_lde_calls, + reset_all_gpu_call_counters, }; #[test] @@ -36,16 +36,11 @@ fn gpu_path_fires_end_to_end() { // path. assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire"); - // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's - // `number_of_parts > 2`. The branch and shift tables have degree-3 - // transition constraints, so this triggers on any non-trivial prove. - assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire"); - - // R2 comp-poly Merkle tree build, paired with the parts LDE above. - assert!( - gpu_comp_poly_tree_calls() > 0, - "R2 GPU comp-poly tree did not fire" - ); + // The R2 parts-LDE and comp-poly-tree GPU dispatches are obsolete since + // #699/#700 routed degree-3 tables through the 2-part decompose_and_extend_d2 + // + fused coset_lde_full path: no VM AIR has number_of_parts > 2 anymore, so + // try_evaluate_parts_on_lde_gpu_keep (and its paired tree build) no longer + // fire. (These assertions bit-rotted against the fused composition LDE.) // DEEP fires once per table that took the R1 GPU path. assert!(gpu_deep_calls() > 0, "R4 GPU DEEP composition did not fire"); From 275eadd1478cff31dc1be47b2137984de84be328 Mon Sep 17 00:00:00 2001 From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com> Date: Mon, 29 Jun 2026 18:11:00 -0300 Subject: [PATCH 18/19] =?UTF-8?q?review(stark):=20address=20PR=20#735=20re?= =?UTF-8?q?view=20=E2=80=94=20coverage,=20dead=20code,=20cleanups=20(#740)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the row-pair commitment PR. Excludes the intentional proof-format break (proof_sym removal / leaf-count change), which is by design. Test coverage (in their own files under crypto/stark/src/tests/, per the crate's test structure): - tests/commitment_tests.rs: direct unit tests pinning the row-pair leaf layout (R=1 and R=2) against an independent reference, wrapper agreement, commit-root consistency, and empty-input short-circuit. Previously the leaf layout was only covered transitively via full prove->verify, and GPU parity tests compared against an inline reimpl rather than this module. - tests/row_pair_opening_tests.rs: two negative tests for the row-pair verify_opening_pair — a tampered symmetric trace evaluation and a corrupted Merkle authentication path must both be rejected. Removing proof_sym deleted the old "symmetric opening mismatch" rejection class; these restore it (an impl ignoring evaluations_sym / the auth path would otherwise pass every existing test). Reuses the now pub(crate) make_valid_simple_proof helper. - cuda_path_integration.rs: restore assert!(gpu_comp_poly_tree_calls() > 0). try_build_comp_poly_tree_gpu is dispatched unconditionally (round 2, after the parts-count branch), so it fires for the common number_of_parts == 2 (degree-3) case — it was NOT obsolete. Keep the genuinely-dead parts-LDE assertion dropped. Cleanups: - Delete dead fn commit_plain (zero callers; main/aux commit inline commit_rows_bit_reversed + spill_tree, which are row-major and incompatible with its column-major signature — the dedup never landed). - Delete orphaned pub fn columns2rows (all callers removed by #735). - Add ProvingError::Fft and map FFTError to it instead of WrongParameter (internal FFT failure is not a caller-supplied-parameter error). - Fix stale profiler label commit_composition_poly -> commit_bit_reversed. - Drop the rot-prone "// = 2" comment on the local ROWS_PER_LEAF alias. --- crypto/stark/src/prover.rs | 32 ++----- crypto/stark/src/tests/commitment_tests.rs | 96 +++++++++++++++++++ crypto/stark/src/tests/mod.rs | 2 + .../stark/src/tests/row_pair_opening_tests.rs | 73 ++++++++++++++ crypto/stark/src/tests/small_trace_tests.rs | 2 +- crypto/stark/src/trace.rs | 16 ---- prover/src/instruments.rs | 2 +- prover/tests/cuda_path_integration.rs | 23 +++-- 8 files changed, 198 insertions(+), 48 deletions(-) create mode 100644 crypto/stark/src/tests/commitment_tests.rs create mode 100644 crypto/stark/src/tests/row_pair_opening_tests.rs diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index a3c64c4fb..288288c06 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -88,11 +88,16 @@ pub enum ProvingError { /// out of disk space, fd exhaustion, or mmap failure. #[cfg(feature = "disk-spill")] DiskSpill(String), + /// An internal FFT/LDE computation failed (e.g. domain size exceeds the + /// field's two-adicity, or a degenerate coset offset). Distinct from + /// `WrongParameter` because the cause is internal prover machinery, not a + /// caller-supplied parameter. Carries the underlying `FFTError`'s message. + Fft(String), } impl From for ProvingError { fn from(e: FFTError) -> Self { - ProvingError::WrongParameter(format!("{e}")) + ProvingError::Fft(format!("{e}")) } } @@ -573,7 +578,8 @@ pub trait IsStarkProver< "num_rows must be a power of two for reverse_index" ); - const ROWS_PER_LEAF: usize = crate::commitment::ROWS_PER_LEAF; // = 2 + // Local alias for the canonical constant, used several times below. + const ROWS_PER_LEAF: usize = crate::commitment::ROWS_PER_LEAF; let num_leaves = num_rows / ROWS_PER_LEAF; let subset_cols = col_end - col_start; let byte_len = as ByteConversion>::BYTE_LEN; @@ -884,28 +890,6 @@ pub trait IsStarkProver< Ok(()) } - /// Commit an already-LDE-expanded plain column set: build the row-paired - /// Merkle tree, spill it to disk if requested, and wrap it as a plain - /// `TableCommit`. Shared by the main-trace (non-preprocessed) and aux-trace - /// commit paths. - fn commit_plain( - columns: &[Vec>], - #[cfg(feature = "disk-spill")] storage_mode: StorageMode, - #[cfg(feature = "disk-spill")] spill_label: &str, - ) -> Result, ProvingError> - where - C: IsField, - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - { - #[allow(unused_mut)] - let (mut tree, root) = - crate::commitment::commit_bit_reversed(columns, crate::commitment::ROWS_PER_LEAF) - .ok_or(ProvingError::EmptyCommitment)?; - #[cfg(feature = "disk-spill")] - Self::spill_tree(&mut tree, storage_mode, spill_label)?; - Ok(TableCommit::plain(tree, root)) - } - /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments. /// /// Only used by `run_debug_checks` — Phase D consumes the cached LDE diff --git a/crypto/stark/src/tests/commitment_tests.rs b/crypto/stark/src/tests/commitment_tests.rs new file mode 100644 index 000000000..f1684112b --- /dev/null +++ b/crypto/stark/src/tests/commitment_tests.rs @@ -0,0 +1,96 @@ +//! Unit tests for the Merkle commitment layer (`crate::commitment`): they pin +//! the bit-reversed, row-grouped leaf byte layout that the GPU kernels and the +//! verifier's `verify_opening_pair` must match. Previously this layout was only +//! covered transitively through full prove→verify. + +use crate::commitment::{ + ROWS_PER_LEAF, commit_bit_reversed, keccak_leaves_bit_reversed, + keccak_leaves_bit_reversed_grouped, keccak_leaves_row_pair_bit_reversed, +}; +use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; +use math::fft::bit_reversing::reverse_index; +use math::field::{element::FieldElement, goldilocks::GoldilocksField}; +use math::traits::ByteConversion; + +type F = GoldilocksField; +type Felt = FieldElement; + +/// 3 columns × 8 rows of distinct, nonzero values. +fn sample_columns() -> Vec> { + (0..3u64) + .map(|c| (0..8u64).map(|r| Felt::from(100 * c + r + 1)).collect()) + .collect() +} + +/// Independent reference for one leaf, written straight from the module-doc +/// layout (`rows_per_leaf` consecutive bit-reversed rows, column-major within +/// each row, big-endian), hashed once with the same backend the prover uses. +/// Structurally separate from the production `map_init` loop, so a transposed +/// row/column order or a wrong bit-reversal in production fails this check. +fn expected_leaf(columns: &[Vec], rows_per_leaf: usize, leaf_idx: usize) -> Commitment { + let num_rows = columns[0].len(); + let byte_len = ::BYTE_LEN; + let mut buf = vec![0u8; rows_per_leaf * columns.len() * byte_len]; + let mut offset = 0; + for k in 0..rows_per_leaf { + let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64); + for col in columns { + col[br].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + } + BatchedMerkleTreeBackend::::hash_bytes(&buf) +} + +#[test] +fn grouped_leaves_match_documented_layout_for_r1_and_r2() { + let columns = sample_columns(); + let num_rows = columns[0].len(); + for &rows_per_leaf in &[1usize, 2usize] { + let leaves = keccak_leaves_bit_reversed_grouped(&columns, rows_per_leaf); + assert_eq!( + leaves.len(), + num_rows / rows_per_leaf, + "leaf count for rows_per_leaf={rows_per_leaf}" + ); + for (i, leaf) in leaves.iter().enumerate() { + assert_eq!( + *leaf, + expected_leaf(&columns, rows_per_leaf, i), + "leaf {i} for rows_per_leaf={rows_per_leaf}" + ); + } + } +} + +#[test] +fn wrappers_agree_with_grouped() { + let columns = sample_columns(); + assert_eq!( + keccak_leaves_bit_reversed(&columns), + keccak_leaves_bit_reversed_grouped(&columns, 1) + ); + assert_eq!( + keccak_leaves_row_pair_bit_reversed(&columns), + keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF) + ); +} + +#[test] +fn commit_root_matches_tree_built_over_leaves() { + let columns = sample_columns(); + let leaves = keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(leaves).unwrap(); + let (_, root) = commit_bit_reversed(&columns, ROWS_PER_LEAF).unwrap(); + assert_eq!(root, tree.root); +} + +#[test] +fn empty_and_zero_row_inputs_short_circuit() { + let empty: Vec> = vec![]; + assert!(keccak_leaves_bit_reversed_grouped(&empty, ROWS_PER_LEAF).is_empty()); + assert!(commit_bit_reversed(&empty, ROWS_PER_LEAF).is_none()); + let zero_rows: Vec> = vec![vec![]]; + assert!(keccak_leaves_bit_reversed_grouped(&zero_rows, ROWS_PER_LEAF).is_empty()); + assert!(commit_bit_reversed(&zero_rows, ROWS_PER_LEAF).is_none()); +} diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs index 8c0897ac1..7a3884832 100644 --- a/crypto/stark/src/tests/mod.rs +++ b/crypto/stark/src/tests/mod.rs @@ -2,12 +2,14 @@ pub mod air_tests; #[cfg(feature = "debug-checks")] pub mod bus_debug_tests; pub mod bus_tests; +pub mod commitment_tests; pub mod domain_cache_stats; pub mod fri_tests; pub mod grinding_tests; pub mod proof_options_tests; pub mod prove_verify_roundtrip_tests; pub mod prover_tests; +pub mod row_pair_opening_tests; pub mod small_trace_tests; #[cfg(feature = "disk-spill")] pub mod table_disk_spill_tests; diff --git a/crypto/stark/src/tests/row_pair_opening_tests.rs b/crypto/stark/src/tests/row_pair_opening_tests.rs new file mode 100644 index 000000000..89e7dc8f1 --- /dev/null +++ b/crypto/stark/src/tests/row_pair_opening_tests.rs @@ -0,0 +1,73 @@ +//! Negative tests for the row-pair trace opening verification +//! (`verifier::verify_opening_pair`). The row pair `(2·iota, 2·iota+1)` is +//! committed as a single Merkle leaf, so one `proof` authenticates both +//! `evaluations` and `evaluations_sym`. Removing the old separate `proof_sym` +//! opening deleted the "symmetric opening mismatch" rejection class; these +//! tests restore it — an implementation that ignored `evaluations_sym` or the +//! authentication path would otherwise pass every other test. + +use super::small_trace_tests::make_valid_simple_proof; +use crate::verifier::{IsStarkVerifier, Verifier}; +use crypto::fiat_shamir::default_transcript::DefaultTranscript; +use math::field::{element::FieldElement, goldilocks::GoldilocksField}; + +type Felt = FieldElement; + +/// Tampering the value at the symmetric LDE position must break verification: +/// the committed leaf hashed `evaluations ‖ evaluations_sym`, so a perturbed +/// `evaluations_sym` no longer reconstructs the committed leaf. +#[test_log::test] +fn test_verify_rejects_tampered_main_trace_evaluations_sym() { + let (air, mut proof) = make_valid_simple_proof(); + + let opening = proof + .deep_poly_openings + .first_mut() + .expect("test precondition: a valid proof has at least one deep poly opening"); + assert!( + !opening.main_trace_polys.evaluations_sym.is_empty(), + "test precondition: the main-trace opening has at least one symmetric evaluation", + ); + // Perturb (not resize) the first symmetric evaluation. + opening.main_trace_polys.evaluations_sym[0] = + &opening.main_trace_polys.evaluations_sym[0] + Felt::one(); + + assert!( + !Verifier::verify( + &proof, + &air, + &mut DefaultTranscript::::new(&[]) + ), + "Verifier must reject a tampered symmetric trace evaluation" + ); +} + +/// The row-pair Merkle authentication path itself must be checked. Corrupting a +/// node in `main_trace_polys.proof.merkle_path` is caught ONLY by +/// `verify_opening_pair` (the deep-composition reconstruction does not touch the +/// auth path), so this proves the single row-pair path is actually authenticated +/// against the committed root rather than ignored. +#[test_log::test] +fn test_verify_rejects_tampered_main_trace_merkle_path() { + let (air, mut proof) = make_valid_simple_proof(); + + let opening = proof + .deep_poly_openings + .first_mut() + .expect("test precondition: a valid proof has at least one deep poly opening"); + let path = &mut opening.main_trace_polys.proof.merkle_path; + assert!( + !path.is_empty(), + "test precondition: the row-pair trace tree has a non-trivial authentication path", + ); + path[0][0] ^= 0x01; + + assert!( + !Verifier::verify( + &proof, + &air, + &mut DefaultTranscript::::new(&[]) + ), + "Verifier must reject a corrupted main-trace Merkle authentication path" + ); +} diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs index 8373ae9d6..1e081910b 100644 --- a/crypto/stark/src/tests/small_trace_tests.rs +++ b/crypto/stark/src/tests/small_trace_tests.rs @@ -17,7 +17,7 @@ use crate::{ type Felt = FieldElement; -fn make_valid_simple_proof() -> ( +pub(crate) fn make_valid_simple_proof() -> ( SimpleAdditionAIR, crate::proof::stark::StarkProof< GoldilocksField, diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index da4a53f6e..72b77947a 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -645,22 +645,6 @@ where Table::new(table_data, table_width) } -pub fn columns2rows(columns: Vec>) -> Vec> -where - F: Clone, -{ - let num_rows = columns[0].len(); - let num_cols = columns.len(); - - (0..num_rows) - .map(|row_index| { - (0..num_cols) - .map(|col_index| columns[col_index][row_index].clone()) - .collect() - }) - .collect() -} - pub(crate) fn compute_frame_evaluation_points( x: &FieldElement, frame_offsets: &[usize], diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs index f15223e18..b787af0da 100644 --- a/prover/src/instruments.rs +++ b/prover/src/instruments.rs @@ -173,7 +173,7 @@ pub fn print_report( let mut sub_ops: Vec<(&str, Duration)> = vec![ ("R2 evaluate", total_constraints), ("R2 decompose_and_extend_d2", total_comp_decompose), - ("R2 commit_composition_poly", total_comp_commit), + ("R2 commit_bit_reversed (comp-poly)", total_comp_commit), ("R3 OOD evaluation", total_ood), ("R4 deep_composition_poly_evals", total_deep_comp), ("R4 interpolate+evaluate_fft", total_deep_extend), diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs index 0c0bb00c2..521ecd0d3 100644 --- a/prover/tests/cuda_path_integration.rs +++ b/prover/tests/cuda_path_integration.rs @@ -11,8 +11,8 @@ use lambda_vm_prover::test_utils::asm_elf_bytes; use lambda_vm_prover::{prove, verify}; use stark::gpu_lde::{ - gpu_bary_calls, gpu_batch_invert_calls, gpu_deep_calls, gpu_fri_calls, gpu_lde_calls, - reset_all_gpu_call_counters, + gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls, + gpu_fri_calls, gpu_lde_calls, reset_all_gpu_call_counters, }; #[test] @@ -36,11 +36,22 @@ fn gpu_path_fires_end_to_end() { // path. assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire"); - // The R2 parts-LDE and comp-poly-tree GPU dispatches are obsolete since + // R2 parts-LDE (try_evaluate_parts_on_lde_gpu_keep) is obsolete: since // #699/#700 routed degree-3 tables through the 2-part decompose_and_extend_d2 - // + fused coset_lde_full path: no VM AIR has number_of_parts > 2 anymore, so - // try_evaluate_parts_on_lde_gpu_keep (and its paired tree build) no longer - // fire. (These assertions bit-rotted against the fused composition LDE.) + // + fused coset_lde_full path, no VM AIR has number_of_parts > 2, so that + // dispatch lives only in the dead `> 2` fallback. We intentionally do NOT + // assert gpu_parts_lde_calls() here. + // + // The comp-poly-tree build is NOT obsolete, though: try_build_comp_poly_tree_gpu + // is called unconditionally after the parts-count branch (prover.rs round 2), + // so it fires for the common number_of_parts == 2 (degree-3) case. A silent + // CPU fallback here would still verify, so the end-to-end check below cannot + // catch it — this counter assertion is what guards the GPU comp-poly-tree + // dispatch. + assert!( + gpu_comp_poly_tree_calls() > 0, + "R2 GPU comp-poly tree did not fire" + ); // DEEP fires once per table that took the R1 GPU path. assert!(gpu_deep_calls() > 0, "R4 GPU DEEP composition did not fire"); From bd1ebbbeec2bf39633af00568b00d16e08af73b1 Mon Sep 17 00:00:00 2001 From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com> Date: Mon, 29 Jun 2026 18:34:16 -0300 Subject: [PATCH 19/19] review(stark): tidy test layout + remove AGENTS.md (follow-up to #740) (#744) Two items that landed too late for #740: - Move the shared make_valid_simple_proof helper out of small_trace_tests.rs into tests/trace_test_helpers.rs, where the crate keeps shared test helpers (matching how prover_tests sources get_trace_evaluations). row_pair_opening_tests.rs and small_trace_tests.rs now both import it from there instead of one test file reaching sideways into another. - Delete AGENTS.md (added by #735). --- AGENTS.md | 4 --- .../stark/src/tests/row_pair_opening_tests.rs | 2 +- crypto/stark/src/tests/small_trace_tests.rs | 26 +------------- crypto/stark/src/tests/trace_test_helpers.rs | 36 +++++++++++++++++++ 4 files changed, 38 insertions(+), 30 deletions(-) delete mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 767d84610..000000000 --- a/AGENTS.md +++ /dev/null @@ -1,4 +0,0 @@ -# Agent Rules - -- Before pushing code or conflict-resolution commits to a PR branch, run `make lint` from the repository root. `cargo fmt --check` is not a substitute because CI's `Lint` job runs the full `make lint` target. -- If `make lint` cannot complete, do not push unless the user explicitly accepts that risk, and report the incomplete lint result. diff --git a/crypto/stark/src/tests/row_pair_opening_tests.rs b/crypto/stark/src/tests/row_pair_opening_tests.rs index 89e7dc8f1..93423f49f 100644 --- a/crypto/stark/src/tests/row_pair_opening_tests.rs +++ b/crypto/stark/src/tests/row_pair_opening_tests.rs @@ -6,7 +6,7 @@ //! tests restore it — an implementation that ignored `evaluations_sym` or the //! authentication path would otherwise pass every other test. -use super::small_trace_tests::make_valid_simple_proof; +use crate::tests::trace_test_helpers::make_valid_simple_proof; use crate::verifier::{IsStarkVerifier, Verifier}; use crypto::fiat_shamir::default_transcript::DefaultTranscript; use math::field::{element::FieldElement, goldilocks::GoldilocksField}; diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs index 1e081910b..96e04858d 100644 --- a/crypto/stark/src/tests/small_trace_tests.rs +++ b/crypto/stark/src/tests/small_trace_tests.rs @@ -11,37 +11,13 @@ use crate::{ }, proof::options::ProofOptions, prover::{IsStarkProver, Prover}, + tests::trace_test_helpers::make_valid_simple_proof, traits::AIR, verifier::{IsStarkVerifier, Verifier}, }; type Felt = FieldElement; -pub(crate) fn make_valid_simple_proof() -> ( - SimpleAdditionAIR, - crate::proof::stark::StarkProof< - GoldilocksField, - GoldilocksField, - SimpleAdditionPublicInputs, - >, -) { - let mut trace = simple_addition_trace::(2); - let proof_options = ProofOptions::default_test_options(); - let pub_inputs = SimpleAdditionPublicInputs { - a: Felt::from(1u64), - b: Felt::from(2u64), - }; - let air = SimpleAdditionAIR::::new(&proof_options); - let proof = Prover::prove( - &air, - &mut trace, - &pub_inputs, - &mut DefaultTranscript::::new(&[]), - ) - .unwrap(); - (air, proof) -} - /// Test STARK prove/verify with a single-row trace. /// This exercises the FRI protocol with 0 FRI layers (trace_length=1, number_layers=0). #[test_log::test] diff --git a/crypto/stark/src/tests/trace_test_helpers.rs b/crypto/stark/src/tests/trace_test_helpers.rs index e62d0d3ec..4ef6455b3 100644 --- a/crypto/stark/src/tests/trace_test_helpers.rs +++ b/crypto/stark/src/tests/trace_test_helpers.rs @@ -1,8 +1,16 @@ +use crate::examples::simple_addition::{ + SimpleAdditionAIR, SimpleAdditionPublicInputs, simple_addition_trace, +}; +use crate::proof::options::ProofOptions; +use crate::prover::{IsStarkProver, Prover}; use crate::table::Table; use crate::trace::{TraceTable, compute_frame_evaluation_points}; +use crate::traits::AIR; +use crypto::fiat_shamir::default_transcript::DefaultTranscript; use itertools::Itertools; use math::field::{ element::FieldElement, + goldilocks::GoldilocksField, traits::{IsField, IsSubFieldOf}, }; use math::polynomial::Polynomial; @@ -10,6 +18,34 @@ use math::polynomial::Polynomial; #[cfg(feature = "parallel")] use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; +/// Builds a valid 2-row `SimpleAddition` proof. Shared base for the +/// proof-tamper / rejection tests in `small_trace_tests` and +/// `row_pair_opening_tests`. +pub fn make_valid_simple_proof() -> ( + SimpleAdditionAIR, + crate::proof::stark::StarkProof< + GoldilocksField, + GoldilocksField, + SimpleAdditionPublicInputs, + >, +) { + let mut trace = simple_addition_trace::(2); + let proof_options = ProofOptions::default_test_options(); + let pub_inputs = SimpleAdditionPublicInputs { + a: FieldElement::from(1u64), + b: FieldElement::from(2u64), + }; + let air = SimpleAdditionAIR::::new(&proof_options); + let proof = Prover::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .unwrap(); + (air, proof) +} + /// Reference Horner-based trace-evaluation used as an oracle by the prover /// tests (`tests::prover_tests`). The production prover uses the LDE-based /// barycentric `get_trace_evaluations_from_lde`; the two are