From fe50a9e327cb49c41efe68340439e2525e135484 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 19 Jun 2026 16:28:39 -0300
Subject: [PATCH 01/19] refactor(stark): extract Merkle commitment into a
 commitment module

Unifies the two near-identical bit-reversed leaf hashers (per-row + per-row-pair)
into one keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf), and the two
near-identical commit fns (commit_columns_bit_reversed + commit_composition_polynomial)
into commit_bit_reversed(columns, rows_per_leaf), in a new crypto/stark/src/commitment.rs.
Removes them from the IsStarkProver trait (they used no self). prover.rs re-exports
the named leaf hashers for the math-cuda GPU parity test. Byte-identical (stark 128/128).
---
 crypto/stark/src/commitment.rs | 147 ++++++++++++++++++++++++++
 crypto/stark/src/lib.rs        |   1 +
 crypto/stark/src/prover.rs     | 183 ++-------------------------------
 3 files changed, 156 insertions(+), 175 deletions(-)
 create mode 100644 crypto/stark/src/commitment.rs
diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs
new file mode 100644
index 000000000..979c0cb7f
--- /dev/null
+++ b/crypto/stark/src/commitment.rs
@@ -0,0 +1,147 @@
+//! Merkle-tree commitment to bit-reversed, column-major LDE evaluations.
+//!
+//! This is the commitment layer the prover uses for the main/aux trace LDEs and
+//! the composition-polynomial parts. It is decoupled from `IsStarkProver`: the
+//! prover only orchestrates *when* to commit; the *how* (leaf layout, bit-reverse
+//! permutation, Keccak hashing, tree build) lives here.
+//!
+//! ## Leaf layout
+//!
+//! For each leaf `i` we hash `rows_per_leaf` consecutive (bit-reversed) rows,
+//! big-endian-concatenated column-by-column:
+//!
+//! ```text
+//! leaf(i) = keccak( col_0[br(R·i)]‖col_1[br(R·i)]‖…  ‖  col_0[br(R·i+1)]‖…  ‖ … )
+//!   where R = rows_per_leaf and br(j) = reverse_index(j, num_rows)
+//! ```
+//!
+//! - `rows_per_leaf == 1`: one row per leaf — main/aux trace LDE columns.
+//! - `rows_per_leaf == 2`: a row pair per leaf — composition-polynomial parts
+//!   (Round 2/3), where leaf `i` hashes rows `2i` and `2i+1`.
+//!
+//! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept
+//! exactly as before, so commitments are byte-identical to the previous
+//! per-row / per-row-pair implementations.
+
+use math::fft::bit_reversing::reverse_index;
+use math::field::element::FieldElement;
+use math::field::traits::IsField;
+use math::traits::{AsBytes, ByteConversion};
+
+#[cfg(feature = "parallel")]
+use rayon::prelude::{IntoParallelIterator, ParallelIterator};
+
+use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
+
+/// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment,
+/// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf.
+///
+/// Returns one `Commitment` per leaf (`columns[0].len() / rows_per_leaf` leaves),
+/// or an empty `Vec` when there is nothing to hash. See the module docs for the
+/// exact leaf byte layout. This is the single code path behind both the per-row
+/// ([`keccak_leaves_bit_reversed`]) and per-row-pair
+/// ([`keccak_leaves_row_pair_bit_reversed`]) commitments.
+pub fn keccak_leaves_bit_reversed_grouped<E>(
+    columns: &[Vec<FieldElement<E>>],
+    rows_per_leaf: usize,
+) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return Vec::new();
+    }
+
+    let num_rows = columns[0].len();
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+
+    debug_assert!(
+        num_rows.is_power_of_two(),
+        "num_rows must be a power of two for reverse_index"
+    );
+    debug_assert!(
+        rows_per_leaf >= 1 && num_rows % rows_per_leaf == 0,
+        "num_rows must be a multiple of rows_per_leaf"
+    );
+
+    let num_leaves = num_rows / rows_per_leaf;
+    let total_bytes = rows_per_leaf * columns.len() * byte_len;
+
+    // Leaf `i`: the `rows_per_leaf` bit-reversed rows starting at `R·i`, each row
+    // written column-by-column in big-endian, then hashed once.
+    let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment {
+        let mut offset = 0;
+        for k in 0..rows_per_leaf {
+            let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64);
+            for col in columns {
+                col[br].write_bytes_be(&mut buf[offset..offset + byte_len]);
+                offset += byte_len;
+            }
+        }
+        BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
+    };
+
+    // Per-thread buffer reuse (map_init) avoids millions of small allocations.
+    #[cfg(feature = "parallel")]
+    let result: Vec<Commitment> = (0..num_leaves)
+        .into_par_iter()
+        .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i))
+        .collect();
+
+    #[cfg(not(feature = "parallel"))]
+    let result: Vec<Commitment> = {
+        let mut buf = vec![0u8; total_bytes];
+        (0..num_leaves).map(|i| hash_leaf(&mut buf, i)).collect()
+    };
+
+    result
+}
+
+/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Used for the
+/// main/aux trace LDE commitments. Thin wrapper over
+/// [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`.
+///
+/// Kept as a named public function because the GPU parity tests in dependent
+/// crates compare against this exact code path.
+pub fn keccak_leaves_bit_reversed<E>(columns: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    keccak_leaves_bit_reversed_grouped(columns, 1)
+}
+
+/// Per-row-pair Keccak-256 leaf hashes (leaf `i` hashes bit-reversed rows `2i`,
+/// `2i+1`). Used for the composition-polynomial-parts commitment. Thin wrapper
+/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 2`.
+pub fn keccak_leaves_row_pair_bit_reversed<E>(parts: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    keccak_leaves_bit_reversed_grouped(parts, 2)
+}
+
+/// Builds the Merkle tree committing to `columns`' bit-reversed, column-major LDE
+/// evaluations, grouping `rows_per_leaf` rows per leaf, and returns the tree and
+/// its root. `None` when there is nothing to commit.
+///
+/// Replaces the prover's former `commit_columns_bit_reversed` (`rows_per_leaf = 1`)
+/// and `commit_composition_polynomial` (`rows_per_leaf = 2`).
+pub fn commit_bit_reversed<E>(
+    columns: &[Vec<FieldElement<E>>],
+    rows_per_leaf: usize,
+) -> Option<(BatchedMerkleTree<E>, Commitment)>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return None;
+    }
+    let hashed_leaves = keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf);
+    let tree = BatchedMerkleTree::<E>::build_from_hashed_leaves(hashed_leaves)?;
+    let root = tree.root;
+    Some((tree, root))
+}
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index e9f6a1cda..87236c5f9 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -5,6 +5,7 @@ compile_error!("the `disk-spill` feature requires memmap2, which does not compil
 
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug;
+pub mod commitment;
 pub mod constraints;
 pub mod context;
 pub mod debug;
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 46261103e..d6166f745 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -43,6 +43,8 @@ use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof};
 use super::trace::TraceTable;
 use super::traits::AIR;
 
+pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed};
+
 /// A triple of (AIR, TraceTable, PublicInputs) for proving.
 type AirTracePair<'a, Field, FieldExtension, PI> = (
     &'a dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
@@ -389,128 +391,6 @@ where
     }
 }
 
-/// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one
-/// leaf per row, where each row is read at `reverse_index(row_idx)` and the
-/// columns are concatenated as big-endian bytes before hashing.
-///
-/// Returns `Vec<Commitment>` with the same length as `columns[0]`. Exposed
-/// (instead of being a closure inside `commit_columns_bit_reversed`) so
-/// parity tests in dependent crates can compare against the same code path
-/// the prover uses.
-pub fn keccak_leaves_bit_reversed<E>(columns: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
-where
-    E: IsField,
-    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
-{
-    if columns.is_empty() || columns[0].is_empty() {
-        return Vec::new();
-    }
-
-    let num_rows = columns[0].len();
-    let num_cols = columns.len();
-    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
-
-    debug_assert!(
-        num_rows.is_power_of_two(),
-        "num_rows must be a power of two for reverse_index"
-    );
-
-    let total_bytes = num_cols * byte_len;
-
-    let hash_leaf = |buf: &mut [u8], row_idx: usize| -> Commitment {
-        let br_idx = reverse_index(row_idx, num_rows as u64);
-        for col_idx in 0..num_cols {
-            columns[col_idx][br_idx]
-                .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
-        }
-        BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
-    };
-
-    #[cfg(feature = "parallel")]
-    let iter = (0..num_rows).into_par_iter();
-    #[cfg(not(feature = "parallel"))]
-    let iter = 0..num_rows;
-
-    // Per-thread buffer reuse: map_init allocates one buffer per Rayon thread,
-    // eliminating millions of small heap allocations under parallel contention.
-    #[cfg(feature = "parallel")]
-    let result: Vec<Commitment> = iter
-        .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i))
-        .collect();
-
-    #[cfg(not(feature = "parallel"))]
-    let result: Vec<Commitment> = {
-        let mut buf = vec![0u8; total_bytes];
-        iter.map(|i| hash_leaf(&mut buf, i)).collect()
-    };
-
-    result
-}
-
-/// Compute Keccak-256 leaf hashes for `commit_composition_polynomial`: one
-/// leaf per row-pair, where leaf `i` hashes the BE concatenation of
-/// `parts[..][br_0] ++ parts[..][br_1]` with
-/// `br_k = reverse_index(2*i + k, num_rows)`.
-///
-/// Returns `Vec<Commitment>` of length `parts[0].len() / 2`.
-pub fn keccak_leaves_row_pair_bit_reversed<E>(parts: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
-where
-    E: IsField,
-    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
-{
-    let num_parts = parts.len();
-    if num_parts == 0 {
-        return Vec::new();
-    }
-    let num_rows = parts[0].len();
-    if num_rows == 0 {
-        return Vec::new();
-    }
-
-    let num_leaves = num_rows / 2;
-    debug_assert!(
-        num_rows.is_power_of_two(),
-        "num_rows must be a power of two for reverse_index"
-    );
-
-    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
-
-    let total_bytes = 2 * num_parts * byte_len;
-
-    let hash_leaf_pair = |buf: &mut [u8], leaf_idx: usize| -> Commitment {
-        let br_0 = reverse_index(2 * leaf_idx, num_rows as u64);
-        let br_1 = reverse_index(2 * leaf_idx + 1, num_rows as u64);
-        let mut offset = 0;
-        for part in parts.iter() {
-            part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]);
-            offset += byte_len;
-        }
-        for part in parts.iter() {
-            part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]);
-            offset += byte_len;
-        }
-        BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
-    };
-
-    #[cfg(feature = "parallel")]
-    let iter = (0..num_leaves).into_par_iter();
-    #[cfg(not(feature = "parallel"))]
-    let iter = 0..num_leaves;
-
-    #[cfg(feature = "parallel")]
-    let result: Vec<Commitment> = iter
-        .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf_pair(buf, i))
-        .collect();
-
-    #[cfg(not(feature = "parallel"))]
-    let result: Vec<Commitment> = {
-        let mut buf = vec![0u8; total_bytes];
-        iter.map(|i| hash_leaf_pair(&mut buf, i)).collect()
-    };
-
-    result
-}
-
 /// The functionality of a STARK prover providing methods to run the STARK Prove protocol
 /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html
 /// The default implementation is complete and is compatible with Stone prover
@@ -529,29 +409,6 @@ pub trait IsStarkProver<
     FieldElement<Field>: math::traits::ByteConversion,
     FieldElement<FieldExtension>: math::traits::ByteConversion,
 {
-    /// Builds a Merkle tree commitment from column-major LDE evaluations with
-    /// bit-reverse permutation, without cloning the full evaluation matrix.
-    ///
-    /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...`
-    /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle
-    /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow,
-    /// but avoids allocating the cloned and transposed matrices entirely.
-    fn commit_columns_bit_reversed<E>(
-        columns: &[Vec<FieldElement<E>>],
-    ) -> Option<(BatchedMerkleTree<E>, Commitment)>
-    where
-        FieldElement<E>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        E: IsField,
-    {
-        if columns.is_empty() || columns[0].is_empty() {
-            return None;
-        }
-        let hashed_leaves = keccak_leaves_bit_reversed(columns);
-        let tree = BatchedMerkleTree::<E>::build_from_hashed_leaves(hashed_leaves)?;
-        let root = tree.root;
-        Some((tree, root))
-    }
-
     /// Compute the LDE commitment for a subset of columns from a trace (for testing).
     ///
     /// This helper computes the same commitment the prover generates internally,
@@ -574,7 +431,7 @@ pub trait IsStarkProver<
         let twiddles = LdeTwiddles::new(&domain);
         let evals =
             Self::compute_lde_from_columns_cached::<Field>(&precomputed, &domain, &twiddles);
-        let (_, commitment) = Self::commit_columns_bit_reversed(&evals)?;
+        let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 1)?;
         Some(commitment)
     }
 
@@ -728,7 +585,7 @@ pub trait IsStarkProver<
         let commit = match precomputed {
             None => {
                 #[allow(unused_mut)]
-                let (mut tree, root) = Self::commit_columns_bit_reversed(&columns)
+                let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1)
                     .ok_or(ProvingError::EmptyCommitment)?;
                 #[cfg(feature = "disk-spill")]
                 if storage_mode == StorageMode::Disk {
@@ -740,11 +597,11 @@ pub trait IsStarkProver<
             Some((expected_precomputed_root, num_cols)) => {
                 #[allow(unused_mut)]
                 let (mut precomputed_tree, precomputed_root) =
-                    Self::commit_columns_bit_reversed(&columns[..num_cols])
+                    crate::commitment::commit_bit_reversed(&columns[..num_cols], 1)
                         .ok_or(ProvingError::EmptyCommitment)?;
                 #[allow(unused_mut)]
                 let (mut mult_tree, mult_root) =
-                    Self::commit_columns_bit_reversed(&columns[num_cols..])
+                    crate::commitment::commit_bit_reversed(&columns[num_cols..], 1)
                         .ok_or(ProvingError::EmptyCommitment)?;
                 if precomputed_root != expected_precomputed_root {
                     return Err(ProvingError::PrecomputedCommitmentMismatch);
@@ -866,30 +723,6 @@ pub trait IsStarkProver<
         }
     }
 
-    /// Returns the Merkle tree and the commitment to the evaluations of the parts of the
-    /// composition polynomial.
-    fn commit_composition_polynomial(
-        lde_composition_poly_parts_evaluations: &[Vec<FieldElement<FieldExtension>>],
-    ) -> Option<(BatchedMerkleTree<FieldExtension>, Commitment)>
-    where
-        FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
-    {
-        let num_parts = lde_composition_poly_parts_evaluations.len();
-        if num_parts == 0 {
-            return None;
-        }
-        let num_rows = lde_composition_poly_parts_evaluations[0].len();
-        if num_rows == 0 {
-            return None;
-        }
-        let hashed_leaves =
-            keccak_leaves_row_pair_bit_reversed(lde_composition_poly_parts_evaluations);
-        let tree = BatchedMerkleTree::<FieldExtension>::build_from_hashed_leaves(hashed_leaves)?;
-        let root = tree.root;
-        Some((tree, root))
-    }
-
     /// Algebraically decompose H(x) = H₀(x²) + x·H₁(x²) on the LDE coset, then
     /// extend each half to the full LDE domain. This replaces the expensive
     /// iFFT(2N) + break_in_parts + FFT(2N)×2 pipeline with:
@@ -1097,7 +930,7 @@ pub trait IsStarkProver<
                 let root = tree.root;
                 (tree, root)
             }
-            None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
+            None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, 2)
                 .ok_or(ProvingError::EmptyCommitment)?,
         };
         #[cfg(feature = "instruments")]
@@ -1983,7 +1816,7 @@ pub trait IsStarkProver<
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
                         #[allow(unused_mut)]
-                        let (mut tree, root) = Self::commit_columns_bit_reversed(&columns)
+                        let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1)
                             .ok_or(ProvingError::EmptyCommitment)?;
                         #[cfg(feature = "instruments")]
                         crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());

From 5cd227ab642bf2cb0a4c09d3571b9041f21989dd Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 19 Jun 2026 16:55:36 -0300
Subject: [PATCH 02/19] perf(stark): row-pair the trace commitment (one Merkle
 path per query)

Every FRI query opens a value and its symmetric counterpart (LDE positions
2*iota, 2*iota+1). The composition-poly commitment already grouped that pair into
one leaf; the trace (main/aux/precomputed) committed one row per leaf and opened
TWO leaves (proof + proof_sym) per query. Commit the trace with rows_per_leaf=2
too, so each query opens ONE leaf/path; drop the now-redundant proof_sym from
PolynomialOpenings (also shrinks the composition opening, which stored the path
twice). verify_opening_pair now reconstructs the paired leaf and verifies once
(mirrors verify_composition_poly_opening); the dead verify_opening is removed.

Halves trace Merkle authentication-path data per query (smaller proofs + less
verifier hashing). stark 128/128 (prove+verify). NOTE: proof FORMAT change (not
byte-identical). cuda follow-up: the GPU trace leaf+tree builders
(gpu_lde::try_expand_leaf_and_tree_batched_keep/_ext3_keep + math-cuda kernels)
still build 1-row leaves and must switch to the row-pair pattern (the GPU
composition builder already pairs) or cuda proofs will fail verification.
---
 crypto/stark/src/commitment.rs  |  2 +-
 crypto/stark/src/proof/stark.rs |  7 ++++++-
 crypto/stark/src/prover.rs      | 32 +++++++++++++++--------------
 crypto/stark/src/verifier.rs    | 36 +++++++++------------------------
 4 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs
index 979c0cb7f..59d68aa7c 100644
--- a/crypto/stark/src/commitment.rs
+++ b/crypto/stark/src/commitment.rs
@@ -61,7 +61,7 @@ where
         "num_rows must be a power of two for reverse_index"
     );
     debug_assert!(
-        rows_per_leaf >= 1 && num_rows % rows_per_leaf == 0,
+        rows_per_leaf >= 1 && num_rows.is_multiple_of(rows_per_leaf),
         "num_rows must be a multiple of rows_per_leaf"
     );
 
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 1751d60fe..851c0b37a 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -10,9 +10,14 @@ use crate::{
 
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
+/// Opening of a bit-reversed, row-paired commitment at one FRI query.
+///
+/// The queried row and its symmetric counterpart (LDE positions `2·iota`,
+/// `2·iota+1`) are committed together as a single leaf at position `iota`, so one
+/// Merkle `proof` authenticates both `evaluations` (the row) and
+/// `evaluations_sym` (its symmetric). Same layout used for trace and composition.
 pub struct PolynomialOpenings<F: IsField> {
     pub proof: Proof<Commitment>,
-    pub proof_sym: Proof<Commitment>,
     pub evaluations: Vec<FieldElement<F>>,
     pub evaluations_sym: Vec<FieldElement<F>>,
 }
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index d6166f745..533f49154 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -11,7 +11,7 @@ use math::fft::errors::FFTError;
 use log::info;
 use math::field::traits::{IsField, IsSubFieldOf};
 use math::spill_safe::SpillSafe;
-use math::traits::{AsBytes, ByteConversion};
+use math::traits::AsBytes;
 use math::{
     field::{element::FieldElement, traits::IsFFTField},
     polynomial::Polynomial,
@@ -33,7 +33,10 @@ use crate::storage_mode::StorageMode;
 use crate::table::Table;
 use crate::trace::LDETraceTable;
 
-use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
+use super::config::{BatchedMerkleTree, Commitment};
+// `BatchedMerkleTreeBackend` is only named by the GPU leaf+tree builders.
+#[cfg(feature = "cuda")]
+use super::config::BatchedMerkleTreeBackend;
 use super::constraints::evaluator::ConstraintEvaluator;
 use super::domain::{Domain, DomainConstants};
 use super::fri::fri_decommit::FriDecommitment;
@@ -431,7 +434,7 @@ pub trait IsStarkProver<
         let twiddles = LdeTwiddles::new(&domain);
         let evals =
             Self::compute_lde_from_columns_cached::<Field>(&precomputed, &domain, &twiddles);
-        let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 1)?;
+        let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 2)?;
         Some(commitment)
     }
 
@@ -585,7 +588,7 @@ pub trait IsStarkProver<
         let commit = match precomputed {
             None => {
                 #[allow(unused_mut)]
-                let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1)
+                let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2)
                     .ok_or(ProvingError::EmptyCommitment)?;
                 #[cfg(feature = "disk-spill")]
                 if storage_mode == StorageMode::Disk {
@@ -597,11 +600,11 @@ pub trait IsStarkProver<
             Some((expected_precomputed_root, num_cols)) => {
                 #[allow(unused_mut)]
                 let (mut precomputed_tree, precomputed_root) =
-                    crate::commitment::commit_bit_reversed(&columns[..num_cols], 1)
+                    crate::commitment::commit_bit_reversed(&columns[..num_cols], 2)
                         .ok_or(ProvingError::EmptyCommitment)?;
                 #[allow(unused_mut)]
                 let (mut mult_tree, mult_root) =
-                    crate::commitment::commit_bit_reversed(&columns[num_cols..], 1)
+                    crate::commitment::commit_bit_reversed(&columns[num_cols..], 2)
                         .ok_or(ProvingError::EmptyCommitment)?;
                 if precomputed_root != expected_precomputed_root {
                     return Err(ProvingError::PrecomputedCommitmentMismatch);
@@ -1357,8 +1360,7 @@ pub trait IsStarkProver<
             .collect();
 
         PolynomialOpenings {
-            proof: proof.clone(),
-            proof_sym: proof,
+            proof,
             evaluations: lde_composition_poly_parts_evaluation
                 .clone()
                 .into_iter()
@@ -1388,13 +1390,13 @@ pub trait IsStarkProver<
         G: Fn(usize) -> Vec<FieldElement<C>>,
     {
         let domain_size = domain.lde_roots_of_unity_coset.len() as u64;
-        let index = challenge * 2;
-        let index_sym = challenge * 2 + 1;
+        // Rows `2·challenge` and `2·challenge+1` are committed together as the
+        // single leaf at position `challenge`; one Merkle path authenticates both
+        // the queried row and its symmetric counterpart.
         PolynomialOpenings {
-            proof: tree.get_proof_by_pos(index).unwrap(),
-            proof_sym: tree.get_proof_by_pos(index_sym).unwrap(),
-            evaluations: gather(reverse_index(index, domain_size)),
-            evaluations_sym: gather(reverse_index(index_sym, domain_size)),
+            proof: tree.get_proof_by_pos(challenge).unwrap(),
+            evaluations: gather(reverse_index(challenge * 2, domain_size)),
+            evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)),
         }
     }
 
@@ -1816,7 +1818,7 @@ pub trait IsStarkProver<
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
                         #[allow(unused_mut)]
-                        let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 1)
+                        let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2)
                             .ok_or(ProvingError::EmptyCommitment)?;
                         #[cfg(feature = "instruments")]
                         crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 8091c8b32..3ba3b7c26 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -301,25 +301,11 @@ pub trait IsStarkVerifier<
         domain.lde_coset_element(reverse_index(raw, domain.lde_length as u64))
     }
 
-    /// Verifies the validity of the opening proof.
-    fn verify_opening<E>(
-        proof: &Proof<Commitment>,
-        root: &Commitment,
-        index: usize,
-        value: &[FieldElement<E>],
-    ) -> bool
-    where
-        FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<E>: AsBytes + Sync + Send,
-        E: IsField,
-        Field: IsSubFieldOf<E>,
-    {
-        proof.verify::<BatchedMerkleTreeBackend<E>>(root, index, &value.to_owned())
-    }
-
-    /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings
-    /// of a `PolynomialOpenings` against the given `root` at iota positions
-    /// `iota*2` and `iota*2 + 1`.
+    /// Verify a row-paired `PolynomialOpenings` against `root`. The row pair
+    /// (`2·iota`, `2·iota+1`) is committed as the single leaf at position `iota`,
+    /// so one Merkle path authenticates both rows: reconstruct the leaf from
+    /// `evaluations ‖ evaluations_sym` and verify once. (Same as the composition
+    /// opening check.)
     fn verify_opening_pair<E>(
         opening: &PolynomialOpenings<E>,
         root: &Commitment,
@@ -331,13 +317,11 @@ pub trait IsStarkVerifier<
         E: IsField,
         Field: IsSubFieldOf<E>,
     {
-        Self::verify_opening::<E>(&opening.proof, root, iota * 2, &opening.evaluations)
-            && Self::verify_opening::<E>(
-                &opening.proof_sym,
-                root,
-                iota * 2 + 1,
-                &opening.evaluations_sym,
-            )
+        let mut value = opening.evaluations.clone();
+        value.extend_from_slice(&opening.evaluations_sym);
+        opening
+            .proof
+            .verify::<BatchedMerkleTreeBackend<E>>(root, iota, &value)
     }
 
     /// Verify opening Open(tⱼ(D_LDE), 𝜐) and Open(tⱼ(D_LDE), -𝜐) for all trace polynomials tⱼ,

From f26a13a9fa826b5ac374a5d22793fdf840a61ff4 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 19 Jun 2026 17:31:36 -0300
Subject: [PATCH 03/19] =?UTF-8?q?refactor(stark):=20prover=20cleanup=20?=
 =?UTF-8?q?=E2=80=94=20par=20helpers,=20ROWS=5FPER=5FLEAF,=20error=20propa?=
 =?UTF-8?q?gation,=20doc/log=20fixes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crypto/stark/src/commitment.rs  |   5 ++
 crypto/stark/src/gpu_lde.rs     |   4 +-
 crypto/stark/src/instruments.rs |   6 +-
 crypto/stark/src/par.rs         |  52 ++++++++++++++
 crypto/stark/src/prover.rs      | 121 ++++++++++++--------------------
 5 files changed, 107 insertions(+), 81 deletions(-)

diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs
index 59d68aa7c..a55d050b2 100644
--- a/crypto/stark/src/commitment.rs
+++ b/crypto/stark/src/commitment.rs
@@ -33,6 +33,11 @@ use rayon::prelude::{IntoParallelIterator, ParallelIterator};
 
 use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 
+/// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the
+/// composition-polynomial commitment: the row-pair leaf packing the FRI openings
+/// rely on (leaf `i` hashes rows `2i` and `2i+1`).
+pub const ROWS_PER_LEAF: usize = 2;
+
 /// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment,
 /// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf.
 ///
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 36756b40b..be6d5ccfe 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -733,8 +733,8 @@ where
 /// host-side ext3 LDE eval Vecs produced by
 /// [`try_evaluate_parts_on_lde_gpu_keep`] (or the CPU path). Uses the same
 /// row-pair leaf pattern as the CPU
-/// `commit_composition_polynomial`: each leaf hashes 2 consecutive
-/// bit-reversed rows.
+/// `commit_bit_reversed` (composition-polynomial commit path): each leaf hashes
+/// 2 consecutive bit-reversed rows.
 ///
 /// Returns `None` to fall through to the CPU path when the type or size
 /// conditions don't hold; returns `None` on a math-cuda `Err` so the caller
diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs
index 16ff95082..aa5cc5436 100644
--- a/crypto/stark/src/instruments.rs
+++ b/crypto/stark/src/instruments.rs
@@ -33,7 +33,7 @@ pub struct TableSubOps {
     pub constraints: Duration,
     /// decompose_and_extend_d2
     pub comp_decompose: Duration,
-    /// commit_composition_polynomial
+    /// commit_bit_reversed (composition-polynomial commit step)
     pub comp_commit: Duration,
     /// Round 3: barycentric OOD evaluation
     pub ood: Duration,
@@ -52,11 +52,11 @@ pub struct TableSubOps {
 pub struct Round1SubOps {
     /// Main trace: expand_columns_to_lde (LDE/FFT)
     pub main_lde: Duration,
-    /// Main trace: commit_columns_bit_reversed (Merkle)
+    /// Main trace: commit_bit_reversed (Merkle)
     pub main_merkle: Duration,
     /// Aux trace: expand_columns_to_lde (LDE/FFT)
     pub aux_lde: Duration,
-    /// Aux trace: commit_columns_bit_reversed (Merkle)
+    /// Aux trace: commit_bit_reversed (Merkle)
     pub aux_merkle: Duration,
 }
 
diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs
index a20a452b6..051651fde 100644
--- a/crypto/stark/src/par.rs
+++ b/crypto/stark/src/par.rs
@@ -37,3 +37,55 @@ where
         (a(), b())
     }
 }
+
+/// Map `f(i)` over `range` and collect into a `Vec`, preserving index order.
+/// Parallel when `feature = "parallel"`, sequential otherwise. Rayon's
+/// `collect()` is index-ordered, so the result is identical either way.
+pub(crate) fn par_map_collect<R: Send>(
+    range: std::ops::Range<usize>,
+    f: impl Fn(usize) -> R + Sync + Send,
+) -> Vec<R> {
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        range.into_par_iter().map(f).collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        range.map(f).collect()
+    }
+}
+
+/// Run `f(&mut item)` for each element of `slice`. Parallel when
+/// `feature = "parallel"`, sequential otherwise (ordering is irrelevant).
+pub(crate) fn par_for_each_mut<T: Send>(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) {
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        slice.par_iter_mut().for_each(f);
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        slice.iter_mut().for_each(f);
+    }
+}
+
+/// Run `f(&mut item)` for each element of `slice`, short-circuiting on the
+/// first `Err`. Parallel when `feature = "parallel"`, sequential otherwise.
+// Only called from `disk-spill`-gated paths; keep it available without warning
+// when that feature is off.
+#[cfg_attr(not(feature = "disk-spill"), allow(dead_code))]
+pub(crate) fn par_try_for_each_mut<T: Send, E: Send>(
+    slice: &mut [T],
+    f: impl Fn(&mut T) -> Result<(), E> + Sync + Send,
+) -> Result<(), E> {
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        slice.par_iter_mut().try_for_each(f)
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        slice.iter_mut().try_for_each(f)
+    }
+}
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 533f49154..7117fd708 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -92,6 +92,12 @@ pub enum ProvingError {
     DiskSpill(String),
 }
 
+impl From<FFTError> for ProvingError {
+    fn from(e: FFTError) -> Self {
+        ProvingError::WrongParameter(format!("{e}"))
+    }
+}
+
 /// Commitment artifacts for one trace table (main or auxiliary). Used for both
 /// plain and preprocessed tables. Preprocessed tables additionally carry a
 /// separate Merkle tree over their precomputed columns, hence the optional
@@ -351,7 +357,7 @@ where
 
 /// A container for the results of the third round of the STARK Prove protocol.
 pub(crate) struct Round3<F: IsField> {
-    /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge.
+    /// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge.
     trace_ood_evaluations: Table<F>,
     /// Evaluations of the composition polynomial parts at the out-of-domain challenge.
     composition_poly_parts_ood_evaluation: Vec<FieldElement<F>>,
@@ -434,7 +440,7 @@ pub trait IsStarkProver<
         let twiddles = LdeTwiddles::new(&domain);
         let evals =
             Self::compute_lde_from_columns_cached::<Field>(&precomputed, &domain, &twiddles);
-        let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, 2)?;
+        let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?;
         Some(commitment)
     }
 
@@ -456,23 +462,16 @@ pub trait IsStarkProver<
             return Vec::new();
         }
 
-        #[cfg(not(feature = "parallel"))]
-        let columns_iter = columns.iter();
-        #[cfg(feature = "parallel")]
-        let columns_iter = columns.par_iter();
-
-        columns_iter
-            .map(|col| {
-                Polynomial::coset_lde_full::<Field>(
-                    col,
-                    domain.blowup_factor,
-                    &twiddles.coset_weights,
-                    &twiddles.inv,
-                    &twiddles.fwd,
-                )
-            })
-            .collect::<Result<Vec<Vec<FieldElement<E>>>, _>>()
+        crate::par::par_map_collect(0..columns.len(), |i| {
+            Polynomial::coset_lde_full::<Field>(
+                &columns[i],
+                domain.blowup_factor,
+                &twiddles.coset_weights,
+                &twiddles.inv,
+                &twiddles.fwd,
+            )
             .expect("coset LDE computation")
+        })
     }
 
     /// Expand each column in-place from N evaluations to N×blowup LDE evaluations.
@@ -507,11 +506,7 @@ pub trait IsStarkProver<
             return;
         }
 
-        #[cfg(feature = "parallel")]
-        let iter = columns.par_iter_mut();
-        #[cfg(not(feature = "parallel"))]
-        let iter = columns.iter_mut();
-        iter.for_each(|buf| {
+        crate::par::par_for_each_mut(columns, |buf| {
             Polynomial::coset_lde_full_expand::<Field>(
                 buf,
                 domain.blowup_factor,
@@ -588,7 +583,7 @@ pub trait IsStarkProver<
         let commit = match precomputed {
             None => {
                 #[allow(unused_mut)]
-                let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2)
+                let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF)
                     .ok_or(ProvingError::EmptyCommitment)?;
                 #[cfg(feature = "disk-spill")]
                 if storage_mode == StorageMode::Disk {
@@ -600,11 +595,11 @@ pub trait IsStarkProver<
             Some((expected_precomputed_root, num_cols)) => {
                 #[allow(unused_mut)]
                 let (mut precomputed_tree, precomputed_root) =
-                    crate::commitment::commit_bit_reversed(&columns[..num_cols], 2)
+                    crate::commitment::commit_bit_reversed(&columns[..num_cols], crate::commitment::ROWS_PER_LEAF)
                         .ok_or(ProvingError::EmptyCommitment)?;
                 #[allow(unused_mut)]
                 let (mut mult_tree, mult_root) =
-                    crate::commitment::commit_bit_reversed(&columns[num_cols..], 2)
+                    crate::commitment::commit_bit_reversed(&columns[num_cols..], crate::commitment::ROWS_PER_LEAF)
                         .ok_or(ProvingError::EmptyCommitment)?;
                 if precomputed_root != expected_precomputed_root {
                     return Err(ProvingError::PrecomputedCommitmentMismatch);
@@ -868,11 +863,10 @@ pub trait IsStarkProver<
         } else {
             // Fallback for any future AIR with d > 2.
             let composition_poly =
-                Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)
-                    .unwrap();
+                Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)?;
             let composition_poly_parts = composition_poly.break_in_parts(number_of_parts);
 
-            let cpu_eval = || -> Vec<Vec<FieldElement<FieldExtension>>> {
+            let cpu_eval = || -> Result<Vec<Vec<FieldElement<FieldExtension>>>, ProvingError> {
                 composition_poly_parts
                     .iter()
                     .map(|part| {
@@ -882,7 +876,7 @@ pub trait IsStarkProver<
                             domain.interpolation_domain_size,
                             &domain.coset_offset,
                         )
-                        .unwrap()
+                        .map_err(ProvingError::from)
                     })
                     .collect()
             };
@@ -907,11 +901,11 @@ pub trait IsStarkProver<
                         gpu_composition_parts = Some(handle);
                         evals
                     }
-                    None => cpu_eval(),
+                    None => cpu_eval()?,
                 }
             }
             #[cfg(not(feature = "cuda"))]
-            cpu_eval()
+            cpu_eval()?
         };
         #[cfg(feature = "instruments")]
         let fft_dur = t_sub.elapsed();
@@ -933,7 +927,7 @@ pub trait IsStarkProver<
                 let root = tree.root;
                 (tree, root)
             }
-            None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, 2)
+            None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, crate::commitment::ROWS_PER_LEAF)
                 .ok_or(ProvingError::EmptyCommitment)?,
         };
         #[cfg(feature = "instruments")]
@@ -1299,12 +1293,7 @@ pub trait IsStarkProver<
             })
             .collect();
 
-        #[cfg(feature = "parallel")]
-        let iter = (0..lde_size).into_par_iter();
-        #[cfg(not(feature = "parallel"))]
-        let iter = 0..lde_size;
-
-        iter.map(|i| {
+        crate::par::par_map_collect(0..lde_size, |i| {
             let mut result = FieldElement::<FieldExtension>::zero();
 
             // H terms
@@ -1330,7 +1319,6 @@ pub trait IsStarkProver<
 
             result
         })
-        .collect()
     }
 
     /// Computes values and validity proofs of the evaluations of the composition polynomial parts
@@ -1347,7 +1335,7 @@ pub trait IsStarkProver<
     {
         let proof = composition_poly_merkle_tree
             .get_proof_by_pos(index)
-            .unwrap();
+            .expect("FRI query index in bounds");
 
         let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations
             .iter()
@@ -1394,7 +1382,8 @@ pub trait IsStarkProver<
         // single leaf at position `challenge`; one Merkle path authenticates both
         // the queried row and its symmetric counterpart.
         PolynomialOpenings {
-            proof: tree.get_proof_by_pos(challenge).unwrap(),
+            proof: tree.get_proof_by_pos(challenge)
+                .expect("FRI query index in bounds"),
             evaluations: gather(reverse_index(challenge * 2, domain_size)),
             evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)),
         }
@@ -1462,7 +1451,7 @@ pub trait IsStarkProver<
         openings
     }
 
-    // TODO: propagate errors instead of unwrap() in commit_columns, reconstruct_round1, and expand_columns_to_lde
+    // TODO: propagate errors instead of unwrap() in commit_main_trace, reconstruct_round1, and expand_columns_to_lde
     /// Generates STARK proofs for one or more AIRs with a shared transcript.
     ///
     /// # Multi-Table Proving with LogUp
@@ -1561,11 +1550,7 @@ pub trait IsStarkProver<
         // Spill main traces to mmap before Round 1 LDE.
         #[cfg(feature = "disk-spill")]
         if storage_mode == StorageMode::Disk {
-            #[cfg(feature = "parallel")]
-            let spill_iter = air_trace_pairs.par_iter_mut();
-            #[cfg(not(feature = "parallel"))]
-            let mut spill_iter = air_trace_pairs.iter_mut();
-            spill_iter.try_for_each(|(_, trace, _)| {
+            crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(_, trace, _)| {
                 trace
                     .main_table
                     .spill_to_disk()
@@ -1602,13 +1587,8 @@ pub trait IsStarkProver<
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_range = chunk_start..chunk_end;
 
-            #[cfg(feature = "parallel")]
-            let iter = chunk_range.into_par_iter();
-            #[cfg(not(feature = "parallel"))]
-            let iter = chunk_range;
-
-            let chunk_results: Vec<Result<_, ProvingError>> = iter
-                .map(|idx| {
+            let chunk_results: Vec<Result<_, ProvingError>> =
+                crate::par::par_map_collect(chunk_range, |idx| {
                     let (air, trace, _) = &air_trace_pairs[idx];
                     let domain = &domains[idx];
                     let twiddles = &twiddle_caches[idx];
@@ -1624,8 +1604,7 @@ pub trait IsStarkProver<
                         #[cfg(feature = "disk-spill")]
                         storage_mode,
                     )
-                })
-                .collect();
+                });
 
             // Sequential: append roots to shared transcript (Fiat-Shamir ordering)
             for result in chunk_results {
@@ -1697,17 +1676,13 @@ pub trait IsStarkProver<
         // Spill all aux trace tables to mmap before any Round 1 aux LDE work.
         #[cfg(feature = "disk-spill")]
         if storage_mode == StorageMode::Disk {
-            #[cfg(feature = "parallel")]
-            let spill_iter = air_trace_pairs.par_iter_mut();
-            #[cfg(not(feature = "parallel"))]
-            let mut spill_iter = air_trace_pairs.iter_mut();
-            spill_iter.try_for_each(|(air, trace, _)| {
+            crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(air, trace, _)| {
                 if air.has_aux_trace() {
                     trace
                         .spill_aux_to_disk()
                         .map_err(|e| ProvingError::DiskSpill(format!("aux trace: {e}")))?;
                 }
-                Ok(())
+                Ok::<(), ProvingError>(())
             })?;
         }
 
@@ -1753,14 +1728,9 @@ pub trait IsStarkProver<
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_range = chunk_start..chunk_end;
 
-            #[cfg(feature = "parallel")]
-            let iter = chunk_range.into_par_iter();
-            #[cfg(not(feature = "parallel"))]
-            let iter = chunk_range;
-
             #[allow(clippy::type_complexity)]
-            let chunk_aux: Vec<Result<AuxResult<FieldExtension>, ProvingError>> = iter
-                .map(|idx| {
+            let chunk_aux: Vec<Result<AuxResult<FieldExtension>, ProvingError>> =
+                crate::par::par_map_collect(chunk_range, |idx| {
                     let (air, trace, _) = &air_trace_pairs[idx];
                     let domain = &domains[idx];
                     let twiddles = &twiddle_caches[idx];
@@ -1818,7 +1788,7 @@ pub trait IsStarkProver<
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
                         #[allow(unused_mut)]
-                        let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, 2)
+                        let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF)
                             .ok_or(ProvingError::EmptyCommitment)?;
                         #[cfg(feature = "instruments")]
                         crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());
@@ -1839,8 +1809,7 @@ pub trait IsStarkProver<
                         #[cfg(not(feature = "cuda"))]
                         Ok((None, Vec::new()))
                     }
-                })
-                .collect();
+                });
 
             // Sequential: append aux roots to forked transcripts.
             for (j, result) in chunk_aux.into_iter().enumerate() {
@@ -2055,7 +2024,7 @@ pub trait IsStarkProver<
 
     // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations
     /// Executes rounds 2-4 and generates a STARK proof for the trace `main_trace` with public inputs `pub_inputs`.
-    /// Warning: the transcript must be safely initializated before passing it to this method.
+    /// Warning: the transcript must be safely initialized before passing it to this method.
     fn prove_rounds_2_to_4(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         pub_inputs: &PI,
@@ -2068,7 +2037,7 @@ pub trait IsStarkProver<
         FieldElement<FieldExtension>: AsBytes,
         PI: Send + Sync + Clone,
     {
-        info!("Started proof generation...");
+        log::debug!("Started proof generation...");
 
         // ===================================
         // ==========|   Round 2   |==========
@@ -2181,7 +2150,7 @@ pub trait IsStarkProver<
             });
         }
 
-        info!("End proof generation");
+        log::debug!("End proof generation");
 
         Ok(StarkProof {
             // [t]

From 256e69c9a59360b559b19e11ebb6a309d62b8ec2 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 19 Jun 2026 17:33:39 -0300
Subject: [PATCH 04/19] docs(stark): fix commitment.rs leaf-layout docs after
 trace pairing

The trace commitment now uses the row-pair leaf layout (ROWS_PER_LEAF=2), same as
composition; rows_per_leaf=1 is only kept for the GPU parity test. Update the
module/const/wrapper docs that still described the pre-pairing per-row trace.
---
 crypto/stark/src/commitment.rs | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs
index a55d050b2..d4a6dbdbe 100644
--- a/crypto/stark/src/commitment.rs
+++ b/crypto/stark/src/commitment.rs
@@ -15,13 +15,15 @@
 //!   where R = rows_per_leaf and br(j) = reverse_index(j, num_rows)
 //! ```
 //!
-//! - `rows_per_leaf == 1`: one row per leaf — main/aux trace LDE columns.
-//! - `rows_per_leaf == 2`: a row pair per leaf — composition-polynomial parts
-//!   (Round 2/3), where leaf `i` hashes rows `2i` and `2i+1`.
+//! - `rows_per_leaf == 2` (`ROWS_PER_LEAF`): a row pair per leaf (leaf `i` hashes
+//!   rows `2i` and `2i+1`). Used by BOTH the main/aux trace LDE and the
+//!   composition-polynomial parts: a FRI query opens a value and its symmetric
+//!   counterpart — exactly this pair — so one Merkle path authenticates both.
+//! - `rows_per_leaf == 1`: one row per leaf. No longer used by the prover; kept
+//!   only so the GPU parity tests can compare against the per-row code path.
 //!
 //! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept
-//! exactly as before, so commitments are byte-identical to the previous
-//! per-row / per-row-pair implementations.
+//! exactly as before.
 
 use math::fft::bit_reversing::reverse_index;
 use math::field::element::FieldElement;
@@ -34,8 +36,9 @@ use rayon::prelude::{IntoParallelIterator, ParallelIterator};
 use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 
 /// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the
-/// composition-polynomial commitment: the row-pair leaf packing the FRI openings
-/// rely on (leaf `i` hashes rows `2i` and `2i+1`).
+/// trace AND composition-polynomial commitments: the row-pair leaf the FRI
+/// openings rely on (leaf `i` hashes rows `2i` and `2i+1`, so one Merkle path
+/// authenticates both a value and its symmetric counterpart).
 pub const ROWS_PER_LEAF: usize = 2;
 
 /// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment,
@@ -103,12 +106,12 @@ where
     result
 }
 
-/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Used for the
-/// main/aux trace LDE commitments. Thin wrapper over
-/// [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`.
+/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Thin wrapper
+/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`.
 ///
-/// Kept as a named public function because the GPU parity tests in dependent
-/// crates compare against this exact code path.
+/// The prover no longer commits per-row (trace and composition both use the
+/// row-pair layout, `ROWS_PER_LEAF`); this stays a named public function only so
+/// the GPU parity tests in dependent crates can compare the per-row code path.
 pub fn keccak_leaves_bit_reversed<E>(columns: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
 where
     E: IsField,

From f34a6f6ce4e4ead7320a663818a916cadc2f0a56 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 19 Jun 2026 17:45:44 -0300
Subject: [PATCH 05/19] refactor(prover): dedup commit pipeline (commit_plain +
 spill_tree) (B)

Extract two helpers on IsStarkProver, collapsing the near-duplicate
main-trace and aux-trace commit code:

- spill_tree<C>: the identical-except-label disk-spill block, shared by
  the main / preprocessed-split / aux commit sites (4 call sites).
- commit_plain<C>: commit_bit_reversed + spill_tree + TableCommit::plain,
  shared by the main-trace (non-preprocessed None arm) and aux-trace
  plain-commit paths.

Proof output is byte-identical (stark 128/128, +disk-spill 133/133).
The only behavioral delta is in the instruments profiling feature: the
aux commit's timing bucket now includes its (tiny) disk spill, matching
what the main path already measured.

clippy clean on default / disk-spill / instruments; builds on the
combined feature set.
---
 crypto/stark/src/prover.rs | 88 ++++++++++++++++++++++++++------------
 1 file changed, 60 insertions(+), 28 deletions(-)

diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 7117fd708..74026c112 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -581,17 +581,13 @@ pub trait IsStarkProver<
         let t_sub = Instant::now();
 
         let commit = match precomputed {
-            None => {
-                #[allow(unused_mut)]
-                let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF)
-                    .ok_or(ProvingError::EmptyCommitment)?;
+            None => Self::commit_plain::<Field>(
+                &columns,
                 #[cfg(feature = "disk-spill")]
-                if storage_mode == StorageMode::Disk {
-                    tree.spill_nodes_to_disk()
-                        .map_err(|e| ProvingError::DiskSpill(format!("main Merkle tree: {e}")))?;
-                }
-                TableCommit::plain(tree, root)
-            }
+                storage_mode,
+                #[cfg(feature = "disk-spill")]
+                "main Merkle tree",
+            )?,
             Some((expected_precomputed_root, num_cols)) => {
                 #[allow(unused_mut)]
                 let (mut precomputed_tree, precomputed_root) =
@@ -605,13 +601,9 @@ pub trait IsStarkProver<
                     return Err(ProvingError::PrecomputedCommitmentMismatch);
                 }
                 #[cfg(feature = "disk-spill")]
-                if storage_mode == StorageMode::Disk {
-                    precomputed_tree.spill_nodes_to_disk().map_err(|e| {
-                        ProvingError::DiskSpill(format!("precomputed Merkle tree: {e}"))
-                    })?;
-                    mult_tree
-                        .spill_nodes_to_disk()
-                        .map_err(|e| ProvingError::DiskSpill(format!("mult Merkle tree: {e}")))?;
+                {
+                    Self::spill_tree(&mut precomputed_tree, storage_mode, "precomputed Merkle tree")?;
+                    Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?;
                 }
                 TableCommit::preprocessed(
                     mult_tree,
@@ -632,6 +624,48 @@ pub trait IsStarkProver<
         Ok((commit, columns))
     }
 
+    /// Spill a committed Merkle tree to disk when `storage_mode` is `Disk`,
+    /// tagging any I/O error with `label`. No-op otherwise. Shared by every commit
+    /// site (main / preprocessed split / aux).
+    #[cfg(feature = "disk-spill")]
+    fn spill_tree<C>(
+        tree: &mut BatchedMerkleTree<C>,
+        storage_mode: StorageMode,
+        label: &str,
+    ) -> Result<(), ProvingError>
+    where
+        C: IsField,
+        FieldElement<C>: AsBytes + Sync + Send,
+    {
+        if storage_mode == StorageMode::Disk {
+            tree.spill_nodes_to_disk()
+                .map_err(|e| ProvingError::DiskSpill(format!("{label}: {e}")))?;
+        }
+        Ok(())
+    }
+
+    /// Commit an already-LDE-expanded plain column set: build the row-paired
+    /// Merkle tree, spill it to disk if requested, and wrap it as a plain
+    /// `TableCommit`. Shared by the main-trace (non-preprocessed) and aux-trace
+    /// commit paths.
+    fn commit_plain<C>(
+        columns: &[Vec<FieldElement<C>>],
+        #[cfg(feature = "disk-spill")] storage_mode: StorageMode,
+        #[cfg(feature = "disk-spill")] spill_label: &str,
+    ) -> Result<TableCommit<C>, ProvingError>
+    where
+        C: IsField,
+        FieldElement<C>: AsBytes + Sync + Send + math::traits::ByteConversion,
+    {
+        #[allow(unused_mut)]
+        let (mut tree, root) =
+            crate::commitment::commit_bit_reversed(columns, crate::commitment::ROWS_PER_LEAF)
+                .ok_or(ProvingError::EmptyCommitment)?;
+        #[cfg(feature = "disk-spill")]
+        Self::spill_tree(&mut tree, storage_mode, spill_label)?;
+        Ok(TableCommit::plain(tree, root))
+    }
+
     /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments.
     ///
     /// Only used by `run_debug_checks` — Phase D consumes the cached LDE
@@ -1787,22 +1821,20 @@ pub trait IsStarkProver<
                         let aux_lde_dur = t_sub.elapsed();
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
-                        #[allow(unused_mut)]
-                        let (mut tree, root) = crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF)
-                            .ok_or(ProvingError::EmptyCommitment)?;
+                        let commit = Self::commit_plain::<FieldExtension>(
+                            &columns,
+                            #[cfg(feature = "disk-spill")]
+                            storage_mode,
+                            #[cfg(feature = "disk-spill")]
+                            "aux Merkle tree",
+                        )?;
                         #[cfg(feature = "instruments")]
                         crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());
 
-                        #[cfg(feature = "disk-spill")]
-                        if storage_mode == StorageMode::Disk {
-                            tree.spill_nodes_to_disk().map_err(|e| {
-                                ProvingError::DiskSpill(format!("aux Merkle tree: {e}"))
-                            })?;
-                        }
                         #[cfg(feature = "cuda")]
-                        return Ok((Some(TableCommit::plain(tree, root)), columns, None));
+                        return Ok((Some(commit), columns, None));
                         #[cfg(not(feature = "cuda"))]
-                        Ok((Some(TableCommit::plain(tree, root)), columns))
+                        Ok((Some(commit), columns))
                     } else {
                         #[cfg(feature = "cuda")]
                         return Ok((None, Vec::new(), None));

From b4784e64f2dec8570d1d8b924666ee618e44e172 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 22 Jun 2026 11:05:57 -0300
Subject: [PATCH 06/19] fix(prover): row-pair the preprocessed-table
 commitments (CI fix)

The trace Merkle commitment moved to a row-pair leaf layout
(ROWS_PER_LEAF=2), but the 5 preprocessed-table commitment computers
(bitwise/keccak_rc/page/decode/register) still built a 1-row-per-leaf
tree manually, so the prover's row-pair precomputed root no longer
matched the computed/hardcoded one -> PrecomputedCommitmentMismatch at
prove time.

- Route all 5 compute_*commitment fns through the shared
  stark::commitment::commit_bit_reversed(.., ROWS_PER_LEAF), dropping the
  manual bit-reverse + columns2rows + 1-row BatchedMerkleTree::build.
- Regenerate the hardcoded bitwise/keccak_rc/zero_page static commitments
  for the row-pair layout (via compute_static_commitments).
- cargo fmt (prover.rs + touched tables).

Verified: static_commitments drift tests 5/5, stark 128/128, clippy +
fmt clean, no PrecomputedCommitmentMismatch. Full ELF prove/verify runs
in CI (guest artifacts absent locally).
---
 crypto/stark/src/prover.rs     | 34 +++++++++++++++++-------
 prover/src/tables/bitwise.rs   | 48 ++++++++++++----------------------
 prover/src/tables/decode.rs    | 22 +++++-----------
 prover/src/tables/keccak_rc.rs | 38 +++++++++++----------------
 prover/src/tables/page.rs      | 35 +++++++++++--------------
 prover/src/tables/register.rs  | 17 +++++-------
 6 files changed, 82 insertions(+), 112 deletions(-)

diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 74026c112..ab1b3275b 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -440,7 +440,8 @@ pub trait IsStarkProver<
         let twiddles = LdeTwiddles::new(&domain);
         let evals =
             Self::compute_lde_from_columns_cached::<Field>(&precomputed, &domain, &twiddles);
-        let (_, commitment) = crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?;
+        let (_, commitment) =
+            crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?;
         Some(commitment)
     }
 
@@ -591,18 +592,27 @@ pub trait IsStarkProver<
             Some((expected_precomputed_root, num_cols)) => {
                 #[allow(unused_mut)]
                 let (mut precomputed_tree, precomputed_root) =
-                    crate::commitment::commit_bit_reversed(&columns[..num_cols], crate::commitment::ROWS_PER_LEAF)
-                        .ok_or(ProvingError::EmptyCommitment)?;
+                    crate::commitment::commit_bit_reversed(
+                        &columns[..num_cols],
+                        crate::commitment::ROWS_PER_LEAF,
+                    )
+                    .ok_or(ProvingError::EmptyCommitment)?;
                 #[allow(unused_mut)]
-                let (mut mult_tree, mult_root) =
-                    crate::commitment::commit_bit_reversed(&columns[num_cols..], crate::commitment::ROWS_PER_LEAF)
-                        .ok_or(ProvingError::EmptyCommitment)?;
+                let (mut mult_tree, mult_root) = crate::commitment::commit_bit_reversed(
+                    &columns[num_cols..],
+                    crate::commitment::ROWS_PER_LEAF,
+                )
+                .ok_or(ProvingError::EmptyCommitment)?;
                 if precomputed_root != expected_precomputed_root {
                     return Err(ProvingError::PrecomputedCommitmentMismatch);
                 }
                 #[cfg(feature = "disk-spill")]
                 {
-                    Self::spill_tree(&mut precomputed_tree, storage_mode, "precomputed Merkle tree")?;
+                    Self::spill_tree(
+                        &mut precomputed_tree,
+                        storage_mode,
+                        "precomputed Merkle tree",
+                    )?;
                     Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?;
                 }
                 TableCommit::preprocessed(
@@ -961,8 +971,11 @@ pub trait IsStarkProver<
                 let root = tree.root;
                 (tree, root)
             }
-            None => crate::commitment::commit_bit_reversed(&lde_composition_poly_parts_evaluations, crate::commitment::ROWS_PER_LEAF)
-                .ok_or(ProvingError::EmptyCommitment)?,
+            None => crate::commitment::commit_bit_reversed(
+                &lde_composition_poly_parts_evaluations,
+                crate::commitment::ROWS_PER_LEAF,
+            )
+            .ok_or(ProvingError::EmptyCommitment)?,
         };
         #[cfg(feature = "instruments")]
         let merkle_dur = t_sub.elapsed();
@@ -1416,7 +1429,8 @@ pub trait IsStarkProver<
         // single leaf at position `challenge`; one Merkle path authenticates both
         // the queried row and its symmetric counterpart.
         PolynomialOpenings {
-            proof: tree.get_proof_by_pos(challenge)
+            proof: tree
+                .get_proof_by_pos(challenge)
                 .expect("FRI query index in bounds"),
             evaluations: gather(reverse_index(challenge * 2, domain_size)),
             evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)),
diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs
index cb92e37ce..4788f65af 100644
--- a/prover/src/tables/bitwise.rs
+++ b/prover/src/tables/bitwise.rs
@@ -25,13 +25,13 @@
 //! All lookups are provided as receivers with negative multiplicity,
 //! meaning other tables send to this table.
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
@@ -195,19 +195,19 @@ pub const fn is_preprocessed() -> bool {
 fn static_commitment(blowup_factor: u8) -> Option<Commitment> {
     match blowup_factor {
         2 => Some([
-            0xfb, 0x46, 0xff, 0x1c, 0xed, 0x4c, 0x97, 0xfb, 0xb2, 0x17, 0x55, 0x24, 0x08, 0x04,
-            0x15, 0xee, 0xbe, 0xa6, 0xee, 0x86, 0x69, 0xaf, 0x3a, 0x4f, 0x9e, 0x2a, 0x44, 0x81,
-            0xf9, 0xb0, 0xf3, 0xff,
+            0xfa, 0x3e, 0xcf, 0x80, 0xfd, 0x95, 0xe5, 0x09, 0x74, 0xd4, 0x55, 0x23, 0xf6, 0x42,
+            0xb6, 0x4b, 0x05, 0xc4, 0xf9, 0x66, 0xc2, 0x4d, 0xff, 0xda, 0x31, 0x47, 0xab, 0x7b,
+            0x0c, 0x6d, 0xc4, 0xcf,
         ]),
         4 => Some([
-            0xb5, 0xc4, 0xc0, 0x80, 0x03, 0x5b, 0xb6, 0x12, 0x78, 0x8c, 0x4d, 0xd4, 0x9e, 0x3d,
-            0xc4, 0xe2, 0xef, 0x95, 0xf0, 0xbf, 0xe8, 0x1d, 0x98, 0xec, 0x7f, 0x58, 0x3a, 0x47,
-            0x18, 0x03, 0x7e, 0xa5,
+            0xff, 0x76, 0x8e, 0x85, 0x4b, 0xdc, 0x32, 0x61, 0x96, 0x16, 0x15, 0x19, 0x73, 0x70,
+            0xf0, 0x64, 0x81, 0xfd, 0x4f, 0x5c, 0xbd, 0x9c, 0x30, 0x26, 0xd5, 0xc0, 0x81, 0xf3,
+            0xce, 0x38, 0x50, 0x3e,
         ]),
         8 => Some([
-            0x8a, 0x18, 0x70, 0x51, 0x34, 0x1a, 0x65, 0xaa, 0x79, 0x17, 0x07, 0x9a, 0xf3, 0x0b,
-            0xcb, 0xd0, 0x7c, 0xe3, 0x2a, 0xce, 0x89, 0x9a, 0xfd, 0xc8, 0x0d, 0x6b, 0x48, 0x43,
-            0x83, 0x5d, 0x18, 0xb8,
+            0x0e, 0x1b, 0xc1, 0x0d, 0xae, 0x64, 0xe7, 0xca, 0xe0, 0x2a, 0x3b, 0xab, 0xd7, 0xd2,
+            0xbb, 0x80, 0xd5, 0x24, 0x5a, 0xce, 0x25, 0xb6, 0x84, 0x77, 0x9c, 0xb5, 0xeb, 0x67,
+            0x61, 0x82, 0x78, 0x3d,
         ]),
         _ => None,
     }
@@ -283,7 +283,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
     let coset_offset = FE::from(options.coset_offset);
 
     #[cfg(feature = "parallel")]
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .par_iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset)
@@ -292,7 +292,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
         .collect();
 
     #[cfg(not(feature = "parallel"))]
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset)
@@ -300,25 +300,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
         })
         .collect();
 
-    // Step 4: Bit-reverse permute (parallel)
-    #[cfg(feature = "parallel")]
-    lde_columns.par_iter_mut().for_each(|col| {
-        in_place_bit_reverse_permute(col);
-    });
-
-    #[cfg(not(feature = "parallel"))]
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    // Step 5: Convert columns to rows for Merkle tree
-    let lde_rows = columns2rows(lde_columns);
-
-    // Step 6: Build Merkle tree over LDE (N * blowup leaves)
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for bitwise LDE");
-
-    tree.root
+    root
 }
 
 /// Returns the preprocessed commitment for the bitwise table.
diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs
index f1fe14e03..b70115a32 100644
--- a/prover/src/tables/decode.rs
+++ b/prover/src/tables/decode.rs
@@ -34,13 +34,13 @@
 use executor::elf::Elf;
 use executor::vm::instruction::decoding::{Instruction, InstructionError};
 use executor::vm::memory::U64HashMap;
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField};
 
@@ -286,7 +286,7 @@ pub fn compute_precomputed_commitment(
     // Step 4: Evaluate polynomials on LDE domain (N * blowup_factor points)
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset)
@@ -294,19 +294,9 @@ pub fn compute_precomputed_commitment(
         })
         .collect();
 
-    // Step 5: Bit-reverse permute (same as prover)
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    // Step 6: Convert columns to rows for Merkle tree
-    let lde_rows = columns2rows(lde_columns);
-
-    // Step 7: Build Merkle tree over LDE (N * blowup leaves)
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for decode LDE");
-
-    tree.root
+    root
 }
 
 // =========================================================================
diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs
index 3bcdf3428..99cd875b6 100644
--- a/prover/src/tables/keccak_rc.rs
+++ b/prover/src/tables/keccak_rc.rs
@@ -8,14 +8,14 @@
 //! committed via a static lookup table (with recompute as fallback for
 //! `ProofOptions` not covered by the static table).
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::field::element::FieldElement;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use executor::vm::instruction::execution::KECCAK_RC;
 
@@ -98,19 +98,19 @@ pub const fn generate_row(round: usize) -> [u64; NUM_PRECOMPUTED_COLS] {
 fn static_commitment(blowup_factor: u8) -> Option<Commitment> {
     match blowup_factor {
         2 => Some([
-            0xe8, 0x06, 0x8b, 0xb2, 0xbd, 0x3d, 0x80, 0xf3, 0x92, 0x95, 0x31, 0x1a, 0xfd, 0x55,
-            0xba, 0x12, 0x3f, 0x76, 0xeb, 0x44, 0x32, 0x57, 0x9d, 0xb7, 0x7f, 0x1e, 0x63, 0xb4,
-            0x98, 0xb5, 0xb0, 0xb7,
+            0xab, 0x7a, 0xad, 0xf5, 0xbf, 0xa2, 0xd5, 0x5c, 0x29, 0x83, 0x83, 0xe6, 0x2e, 0x47,
+            0xa0, 0xa5, 0x22, 0xf9, 0x57, 0x89, 0x5a, 0x5c, 0xbb, 0x1f, 0x34, 0xbc, 0x21, 0x72,
+            0xa9, 0x2c, 0x85, 0xe3,
         ]),
         4 => Some([
-            0xa9, 0xfb, 0xc9, 0x15, 0x1c, 0x22, 0x75, 0xe7, 0x56, 0xeb, 0x6d, 0xf9, 0xfe, 0x83,
-            0x2a, 0xb1, 0xa7, 0x1a, 0x20, 0x71, 0x9b, 0x0c, 0xff, 0x6b, 0x3f, 0x57, 0xc6, 0x84,
-            0x3e, 0xbf, 0xc8, 0xaa,
+            0xfb, 0x42, 0x58, 0x76, 0xf4, 0x30, 0x98, 0x04, 0xef, 0x8c, 0x4e, 0x65, 0xf7, 0x1a,
+            0x29, 0x03, 0xd2, 0xc6, 0x12, 0x0d, 0x18, 0xe1, 0x28, 0x6e, 0x70, 0xeb, 0xa8, 0x11,
+            0x3c, 0x3e, 0xe1, 0xdd,
         ]),
         8 => Some([
-            0x5c, 0x30, 0xf6, 0xa0, 0xcf, 0x78, 0x43, 0x15, 0x5b, 0x5d, 0x18, 0x34, 0x44, 0xba,
-            0x81, 0x9a, 0x64, 0x05, 0x5c, 0x79, 0x26, 0x18, 0x09, 0x24, 0x6b, 0xa2, 0x3f, 0x5f,
-            0x77, 0x09, 0xd5, 0xfc,
+            0x9f, 0x91, 0xaf, 0xb6, 0x5b, 0x75, 0x1e, 0xfb, 0x73, 0x93, 0x2c, 0xc4, 0xa8, 0xe1,
+            0xb5, 0x21, 0x91, 0x5d, 0x6a, 0x19, 0x2e, 0x1d, 0xa8, 0x80, 0x21, 0x1f, 0x36, 0x76,
+            0x9b, 0x8e, 0x3d, 0xb6,
         ]),
         _ => None,
     }
@@ -145,7 +145,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
     // Evaluate on LDE domain
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset)
@@ -153,17 +153,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
         })
         .collect();
 
-    // Bit-reverse permute
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    // Build Merkle tree
-    let lde_rows = columns2rows(lde_columns);
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for keccak_rc LDE");
-
-    tree.root
+    root
 }
 
 /// Returns the preprocessed commitment for the keccak_rc table.
diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs
index 3997e8c22..e8f7c578e 100644
--- a/prover/src/tables/page.rs
+++ b/prover/src/tables/page.rs
@@ -32,13 +32,13 @@
 
 use std::collections::HashMap;
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField};
 
@@ -247,19 +247,19 @@ pub fn generate_page_trace(
 pub(crate) fn static_zero_page_commitment(blowup_factor: u8) -> Option<Commitment> {
     match blowup_factor {
         2 => Some([
-            0xf9, 0x80, 0x0e, 0x45, 0x72, 0x5a, 0x8e, 0x8e, 0x5e, 0xd7, 0x5b, 0x60, 0xce, 0xd0,
-            0x8e, 0xa3, 0x27, 0x3b, 0x8a, 0xb5, 0x98, 0xc0, 0xe3, 0x16, 0xf6, 0x86, 0x75, 0x39,
-            0x4c, 0xe5, 0x88, 0x5e,
+            0x7d, 0x74, 0x85, 0xf0, 0x2b, 0x74, 0xe0, 0x3f, 0x14, 0x99, 0xb3, 0xa0, 0x5f, 0x1d,
+            0x6e, 0xf2, 0x21, 0xff, 0xaf, 0x24, 0x7e, 0x30, 0xb0, 0xda, 0x48, 0x79, 0xe1, 0x43,
+            0xee, 0xea, 0x6a, 0x0f,
         ]),
         4 => Some([
-            0x0f, 0xb5, 0x0c, 0xa8, 0x3b, 0x69, 0x4f, 0x91, 0x60, 0xbf, 0x0d, 0x0d, 0xd3, 0x33,
-            0x25, 0x38, 0x11, 0xbb, 0xf8, 0xfd, 0x54, 0xbd, 0x06, 0x7d, 0xd1, 0xeb, 0xa3, 0x58,
-            0xe8, 0x37, 0x45, 0x56,
+            0x5c, 0xcc, 0x5b, 0xb1, 0xe8, 0x11, 0x91, 0x81, 0xbd, 0xdd, 0x39, 0x40, 0x77, 0x87,
+            0xdc, 0x98, 0x06, 0x06, 0x8c, 0x63, 0xcd, 0xfd, 0xf1, 0xda, 0x4a, 0x55, 0x31, 0x4d,
+            0x6a, 0x16, 0x18, 0xd0,
         ]),
         8 => Some([
-            0x4a, 0xfb, 0xc9, 0x6d, 0x46, 0x29, 0xa3, 0xc2, 0x36, 0x14, 0xd8, 0x24, 0x3e, 0xef,
-            0x97, 0x3f, 0xe1, 0xda, 0x2b, 0xf7, 0x87, 0xb6, 0x54, 0xe1, 0xc6, 0x46, 0xc0, 0x85,
-            0x96, 0x7f, 0x7f, 0x48,
+            0xf0, 0xc0, 0x69, 0xed, 0xf8, 0x59, 0xd6, 0x56, 0x15, 0x3c, 0x2f, 0x93, 0x65, 0xd6,
+            0xe9, 0xe9, 0x8e, 0xd1, 0x83, 0x94, 0xf9, 0x75, 0x59, 0xd1, 0xec, 0x16, 0xe1, 0x37,
+            0xd5, 0x32, 0xd6, 0xd9,
         ]),
         _ => None,
     }
@@ -312,7 +312,7 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption
 
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset)
@@ -320,14 +320,9 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption
         })
         .collect();
 
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    let lde_rows = columns2rows(lde_columns);
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for page LDE");
-    tree.root
+    root
 }
 
 /// Returns the zero-init PAGE preprocessed commitment.
diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs
index 2907c924a..662996dca 100644
--- a/prover/src/tables/register.rs
+++ b/prover/src/tables/register.rs
@@ -20,13 +20,13 @@
 
 use std::collections::HashMap;
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use super::page::STACK_TOP;
 use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField};
@@ -219,7 +219,7 @@ pub fn compute_precomputed_commitment(options: &ProofOptions, entry_point: u64)
 
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset)
@@ -227,14 +227,9 @@ pub fn compute_precomputed_commitment(options: &ProofOptions, entry_point: u64)
         })
         .collect();
 
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    let lde_rows = columns2rows(lde_columns);
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for register LDE");
-    tree.root
+    root
 }
 
 /// Returns the preprocessed commitment for the REGISTER table.

From c6b9832ad2faaf25444a56fed2d098c7fc7e97be Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 22 Jun 2026 11:36:12 -0300
Subject: [PATCH 07/19] test(prover): regenerate SUB_DECODE_COMMITMENT_BLOWUP_2
 for row-pair layout

The compile-time decode-commitment const for sub.elf shifted with the
row-pair preprocessed commitment; regenerated via commitment_from_elf
(the print_decode_commitment_for_sub regen path).
---
 prover/src/tests/decode_tests.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs
index 43e6991cf..e2169da5a 100644
--- a/prover/src/tests/decode_tests.rs
+++ b/prover/src/tests/decode_tests.rs
@@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() {
 /// AIR or FFT pipeline changes, this drifts and the test fails —
 /// regenerate via the `print_decode_commitment_for_sub` helper below.
 const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [
-    0x60, 0x66, 0x0b, 0x18, 0x0d, 0x41, 0x08, 0xb3, 0x3a, 0x03, 0x99, 0x03, 0x8c, 0x9d, 0x12, 0x57,
-    0x68, 0x8d, 0xed, 0x13, 0x60, 0xeb, 0x1d, 0x2b, 0xa8, 0xea, 0x1c, 0x76, 0xc9, 0xdd, 0x25, 0xaf,
+    0x9b, 0x94, 0x41, 0x2d, 0xf0, 0x7f, 0xde, 0x2a, 0x0c, 0x31, 0x99, 0x79, 0x43, 0xef, 0x8b, 0xc1,
+    0x57, 0x05, 0x5f, 0xc6, 0x34, 0x22, 0x06, 0x06, 0x7c, 0x31, 0x73, 0xa0, 0x3b, 0xd8, 0xe3, 0x95,
 ];
 
 #[test]

From acd06e623257bbe3a422926061ea58571f2fd970 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 22 Jun 2026 11:47:12 -0300
Subject: [PATCH 08/19] test(prover): TEMP print actual decode commitment to
 regenerate const from CI

---
 prover/src/tests/decode_tests.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs
index e2169da5a..618bfb672 100644
--- a/prover/src/tests/decode_tests.rs
+++ b/prover/src/tests/decode_tests.rs
@@ -252,6 +252,16 @@ fn decode_commitment_compile_time_const_accepts() {
     let vm_proof = prove(&elf_bytes).expect("prove failed");
     let options = GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 valid");
 
+    // TEMP(ci-regen): surface the actual row-pair decode commitment computed from
+    // CI's sub.elf so SUB_DECODE_COMMITMENT_BLOWUP_2 can be regenerated. Remove.
+    {
+        let elf = Elf::load(&elf_bytes).expect("ELF load");
+        eprintln!(
+            "ACTUAL_SUB_DECODE_COMMITMENT_BLOWUP_2 = {:02x?}",
+            commitment_from_elf(&elf, &options).expect("decode commitment")
+        );
+    }
+
     // Pass the OFFLINE-COMPUTED const directly — mimics the recursion guest's
     // workflow where the value lives in the caller's compiled binary.
     let result = verify_with_options(

From c1ba23fc9f23faac8cd3fac6a221e9f0f84110ec Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 22 Jun 2026 11:56:47 -0300
Subject: [PATCH 09/19] test(prover): set SUB_DECODE_COMMITMENT_BLOWUP_2 to
 CI-computed row-pair value

Regenerated from CI's sub.elf (local riscv toolchain unavailable);
removed the temporary print instrumentation.
---
 prover/src/tests/decode_tests.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs
index 618bfb672..a761ac929 100644
--- a/prover/src/tests/decode_tests.rs
+++ b/prover/src/tests/decode_tests.rs
@@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() {
 /// AIR or FFT pipeline changes, this drifts and the test fails —
 /// regenerate via the `print_decode_commitment_for_sub` helper below.
 const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [
-    0x9b, 0x94, 0x41, 0x2d, 0xf0, 0x7f, 0xde, 0x2a, 0x0c, 0x31, 0x99, 0x79, 0x43, 0xef, 0x8b, 0xc1,
-    0x57, 0x05, 0x5f, 0xc6, 0x34, 0x22, 0x06, 0x06, 0x7c, 0x31, 0x73, 0xa0, 0x3b, 0xd8, 0xe3, 0x95,
+    0xe9, 0x71, 0x68, 0xd6, 0x2e, 0xb1, 0xf6, 0x56, 0x61, 0x9d, 0x04, 0x6e, 0x65, 0xed, 0x63, 0x4a,
+    0x27, 0xa3, 0x4d, 0xcb, 0x6c, 0x02, 0x11, 0xd7, 0x65, 0xc9, 0xc9, 0xfd, 0x59, 0x34, 0x41, 0x5f,
 ];
 
 #[test]
@@ -252,16 +252,6 @@ fn decode_commitment_compile_time_const_accepts() {
     let vm_proof = prove(&elf_bytes).expect("prove failed");
     let options = GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 valid");
 
-    // TEMP(ci-regen): surface the actual row-pair decode commitment computed from
-    // CI's sub.elf so SUB_DECODE_COMMITMENT_BLOWUP_2 can be regenerated. Remove.
-    {
-        let elf = Elf::load(&elf_bytes).expect("ELF load");
-        eprintln!(
-            "ACTUAL_SUB_DECODE_COMMITMENT_BLOWUP_2 = {:02x?}",
-            commitment_from_elf(&elf, &options).expect("decode commitment")
-        );
-    }
-
     // Pass the OFFLINE-COMPUTED const directly — mimics the recursion guest's
     // workflow where the value lives in the caller's compiled binary.
     let result = verify_with_options(

From a1afb67ce9f65638196df5d08e96a8548957ce60 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Fri, 26 Jun 2026 10:54:35 -0300
Subject: [PATCH 10/19] mplement changes in GPU

---
 crypto/math-cuda/kernels/keccak.cu      |  45 ++++++++
 crypto/math-cuda/src/device.rs          |   3 +
 crypto/math-cuda/src/lde.rs             | 113 ++++++++++++++------
 crypto/math-cuda/src/merkle.rs          | 133 ++++++++++++++++++++++++
 crypto/math-cuda/tests/keccak_leaves.rs |  81 +++++++++++++++
 crypto/stark/src/gpu_lde.rs             |  24 +++--
 6 files changed, 361 insertions(+), 38 deletions(-)

diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu
index c22bc4d05..272b847dc 100644
--- a/crypto/math-cuda/kernels/keccak.cu
+++ b/crypto/math-cuda/kernels/keccak.cu
@@ -181,6 +181,51 @@ extern "C" __global__ void keccak256_leaves_base_batched(
     finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
 }
 
+// ---------------------------------------------------------------------------
+// Goldilocks BASE-FIELD row-pair leaf hashing.
+//
+// Leaf `leaf_idx` hashes TWO consecutive bit-reversed rows
+//   br_0 = reverse_index(2*leaf_idx),  br_1 = reverse_index(2*leaf_idx + 1)
+// each written column-by-column in canonical BE (same per-row byte layout as
+// `keccak256_leaves_base_batched`), in (br_0 row: col 0..K-1) then (br_1 row:
+// col 0..K-1) order. `num_leaves = num_rows / 2`; writes 32 bytes to
+// `hashed_leaves_out[leaf_idx * 32 ..]`. Matches the CPU
+// `keccak_leaves_row_pair_bit_reversed` (rows_per_leaf = 2) — the base-field
+// analog of `keccak_comp_poly_leaves_ext3`.
+// ---------------------------------------------------------------------------
+extern "C" __global__ void keccak256_leaves_base_row_pair_batched(
+    const uint64_t *columns_base_ptr,
+    uint64_t col_stride,
+    uint64_t num_cols,
+    uint64_t num_rows,
+    uint64_t log_num_rows,
+    uint8_t *hashed_leaves_out) {
+    uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    uint64_t num_leaves = num_rows >> 1;
+    if (tid >= num_leaves) return;
+
+    uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows);
+    uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows);
+
+    uint64_t st[25];
+    #pragma unroll
+    for (int i = 0; i < 25; ++i) st[i] = 0;
+
+    uint32_t rate_pos = 0;
+    // First row (br_0): col 0..K-1.
+    for (uint64_t c = 0; c < num_cols; ++c) {
+        uint64_t v = columns_base_ptr[c * col_stride + br_0];
+        absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v)));
+    }
+    // Second row (br_1): col 0..K-1.
+    for (uint64_t c = 0; c < num_cols; ++c) {
+        uint64_t v = columns_base_ptr[c * col_stride + br_1];
+        absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v)));
+    }
+
+    finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
+}
+
 // ---------------------------------------------------------------------------
 // Goldilocks EXT3 leaf hashing (3 base-field components per ext3 element).
 //
diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 17e2f9f82..d2a98652b 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -143,6 +143,7 @@ pub struct Backend {
 
     // keccak.ptx
     pub keccak256_leaves_base_batched: CudaFunction,
+    pub keccak256_leaves_base_row_pair_batched: CudaFunction,
     pub keccak256_leaves_ext3_batched: CudaFunction,
     pub keccak_comp_poly_leaves_ext3: CudaFunction,
     pub keccak_fri_leaves_ext3: CudaFunction,
@@ -238,6 +239,8 @@ impl Backend {
             pointwise_mul_batched: ntt.load_function("pointwise_mul_batched")?,
             scalar_mul_batched: ntt.load_function("scalar_mul_batched")?,
             keccak256_leaves_base_batched: keccak.load_function("keccak256_leaves_base_batched")?,
+            keccak256_leaves_base_row_pair_batched: keccak
+                .load_function("keccak256_leaves_base_row_pair_batched")?,
             keccak256_leaves_ext3_batched: keccak.load_function("keccak256_leaves_ext3_batched")?,
             keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?,
             keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?,
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index ee5dc3fce..59e01245f 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -16,7 +16,10 @@ use cudarc::driver::{CudaSlice, CudaStream, LaunchConfig, PushKernelArg};
 
 use crate::Result;
 use crate::device::{Backend, backend};
-use crate::merkle::{keccak_launch_cfg, launch_keccak_base, launch_keccak_ext3};
+use crate::merkle::{
+    keccak_launch_cfg, launch_keccak_base, launch_keccak_base_row_pair, launch_keccak_ext3,
+    launch_keccak_ext3_row_pair,
+};
 use crate::ntt::run_ntt_body;
 
 /// Goldilocks `TWO_ADICITY = 32` puts the theoretical domain ceiling at
@@ -43,17 +46,17 @@ enum KeccakCommit {
 }
 
 impl KeccakCommit {
-    fn total_nodes_bytes(self, lde_size: usize) -> usize {
+    fn total_nodes_bytes(self, num_leaves: usize) -> usize {
         match self {
-            KeccakCommit::LeavesOnly => lde_size * 32,
-            KeccakCommit::FullTree => (2 * lde_size - 1) * 32,
+            KeccakCommit::LeavesOnly => num_leaves * 32,
+            KeccakCommit::FullTree => (2 * num_leaves - 1) * 32,
         }
     }
 
-    fn leaves_offset_bytes(self, lde_size: usize) -> usize {
+    fn leaves_offset_bytes(self, num_leaves: usize) -> usize {
         match self {
             KeccakCommit::LeavesOnly => 0,
-            KeccakCommit::FullTree => (lde_size - 1) * 32,
+            KeccakCommit::FullTree => (num_leaves - 1) * 32,
         }
     }
 }
@@ -613,6 +616,7 @@ pub fn coset_lde_batch_base_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
+        1,
     )
     .map(|_| ())
 }
@@ -640,6 +644,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
+        1,
     )
     .map(|_| ())
 }
@@ -662,6 +667,9 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         true,
+        // Trace commit: one leaf per bit-reversed row pair, matching the CPU
+        // `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and `verify_opening_pair`.
+        2,
     )?;
     let handle = opt.expect("keep_device_buf=true must return Some");
     Ok(handle)
@@ -675,6 +683,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     nodes_out: &mut [u8],
     commit: KeccakCommit,
     keep_device_buf: bool,
+    // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1),
+    // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit.
+    rows_per_leaf: usize,
 ) -> Result<Option<GpuLdeBase>> {
     if columns.is_empty() {
         assert_eq!(outputs.len(), 0);
@@ -698,7 +709,13 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     for o in outputs.iter() {
         assert_eq!(o.len(), lde_size);
     }
-    let nodes_dev_bytes = commit.total_nodes_bytes(lde_size);
+    assert!(
+        rows_per_leaf == 1 || rows_per_leaf == 2,
+        "rows_per_leaf must be 1 or 2"
+    );
+    assert_eq!(lde_size % rows_per_leaf, 0);
+    let num_leaves = lde_size / rows_per_leaf;
+    let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves);
     assert_eq!(nodes_out.len(), nodes_dev_bytes);
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
@@ -787,22 +804,33 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     // written before any reader sees it: the keccak kernel fills the
     // leaves slab, the inner-tree pass (when present) fills the head.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_dev_bytes) }?;
-    let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size);
+    let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves);
     {
         let mut leaves_view =
-            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32);
-        launch_keccak_base(
-            stream.as_ref(),
-            &buf,
-            col_stride_u64,
-            m as u64,
-            lde_u64,
-            &mut leaves_view,
-        )?;
+            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32);
+        if rows_per_leaf == 2 {
+            launch_keccak_base_row_pair(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        } else {
+            launch_keccak_base(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        }
     }
 
     if commit == KeccakCommit::FullTree {
-        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
+        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
     }
 
     // D2H the LDE and the tree/leaves nodes via pinned staging.
@@ -848,6 +876,7 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
+        1,
     )
     .map(|_| ())
 }
@@ -872,6 +901,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
+        1,
     )
     .map(|_| ())
 }
@@ -896,6 +926,9 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         true,
+        // Trace commit: one leaf per bit-reversed row pair, matching the CPU
+        // `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and `verify_opening_pair`.
+        2,
     )?;
     Ok(opt.expect("keep_device_buf=true must return Some"))
 }
@@ -910,6 +943,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
     nodes_out: &mut [u8],
     commit: KeccakCommit,
     keep_device_buf: bool,
+    // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1),
+    // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit.
+    rows_per_leaf: usize,
 ) -> Result<Option<GpuLdeExt3>> {
     if columns.is_empty() {
         assert_eq!(outputs.len(), 0);
@@ -935,7 +971,13 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
     for o in outputs.iter() {
         assert_eq!(o.len(), 3 * lde_size);
     }
-    let nodes_dev_bytes = commit.total_nodes_bytes(lde_size);
+    assert!(
+        rows_per_leaf == 1 || rows_per_leaf == 2,
+        "rows_per_leaf must be 1 or 2"
+    );
+    assert_eq!(lde_size % rows_per_leaf, 0);
+    let num_leaves = lde_size / rows_per_leaf;
+    let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves);
     assert_eq!(nodes_out.len(), nodes_dev_bytes);
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
@@ -1016,22 +1058,33 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
     // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the
     // inner-tree pass (when present) fills the head.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_dev_bytes) }?;
-    let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size);
+    let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves);
     {
         let mut leaves_view =
-            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32);
-        launch_keccak_ext3(
-            stream.as_ref(),
-            &buf,
-            col_stride_u64,
-            m as u64,
-            lde_u64,
-            &mut leaves_view,
-        )?;
+            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32);
+        if rows_per_leaf == 2 {
+            launch_keccak_ext3_row_pair(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        } else {
+            launch_keccak_ext3(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        }
     }
 
     if commit == KeccakCommit::FullTree {
-        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
+        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
     }
 
     // D2H LDE (mb * lde_size u64) and tree/leaves nodes.
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 932e81325..f9ca74706 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -94,6 +94,79 @@ pub fn keccak_leaves_ext3(
     Ok(out)
 }
 
+/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`,
+/// `2i+1`. Returns `num_rows/2 * 32` hash bytes. Parity entry point for the
+/// CPU `keccak_leaves_row_pair_bit_reversed` over base columns (the layout the
+/// trace commit uses).
+pub fn keccak_leaves_base_row_pair(
+    columns: &[u64],
+    col_stride: usize,
+    num_cols: usize,
+    num_rows: usize,
+) -> Result<Vec<u8>> {
+    assert!(num_rows.is_power_of_two());
+    assert!(num_rows >= 2);
+    assert!(
+        col_stride >= num_rows,
+        "col_stride must be >= num_rows to keep per-column reads in-bounds"
+    );
+    let total = num_cols
+        .checked_mul(col_stride)
+        .expect("num_cols * col_stride overflows usize");
+    assert!(columns.len() >= total);
+    let be = backend()?;
+    let stream = be.next_stream();
+    let cols_dev = stream.clone_htod(&columns[..total])?;
+    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / 2) * 32)?;
+    launch_keccak_base_row_pair(
+        stream.as_ref(),
+        &cols_dev,
+        col_stride as u64,
+        num_cols as u64,
+        num_rows as u64,
+        &mut out_dev.as_view_mut(),
+    )?;
+    let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
+    Ok(out)
+}
+
+/// Row-pair ext3 variant. Returns `num_rows/2 * 32` hash bytes. Parity entry
+/// point for the CPU `keccak_leaves_row_pair_bit_reversed` over ext3 columns.
+pub fn keccak_leaves_ext3_row_pair(
+    columns: &[u64],
+    col_stride: usize,
+    num_cols: usize,
+    num_rows: usize,
+) -> Result<Vec<u8>> {
+    assert!(num_rows.is_power_of_two());
+    assert!(num_rows >= 2);
+    assert!(
+        col_stride >= num_rows,
+        "col_stride must be >= num_rows to keep per-column reads in-bounds"
+    );
+    let total = num_cols
+        .checked_mul(3)
+        .and_then(|v| v.checked_mul(col_stride))
+        .expect("num_cols * 3 * col_stride overflows usize");
+    assert!(columns.len() >= total);
+    let be = backend()?;
+    let stream = be.next_stream();
+    let cols_dev = stream.clone_htod(&columns[..total])?;
+    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / 2) * 32)?;
+    launch_keccak_ext3_row_pair(
+        stream.as_ref(),
+        &cols_dev,
+        col_stride as u64,
+        num_cols as u64,
+        num_rows as u64,
+        &mut out_dev.as_view_mut(),
+    )?;
+    let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
+    Ok(out)
+}
+
 /// Block size for Keccak kernels. Per-thread register footprint is ~60 regs
 /// (25-lane state + auxiliaries). The default 256 threads/block pushes the
 /// block register file past the hardware limit on sm_120 (Blackwell). 128
@@ -170,6 +243,66 @@ pub(crate) fn launch_keccak_base(
     Ok(())
 }
 
+/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`,
+/// `2i+1` (one Merkle path per FRI query). Writes `num_rows/2` leaves of 32
+/// bytes into `out_dev`. Base-field analog of the comp-poly ext3 path; matches
+/// the CPU `keccak_leaves_row_pair_bit_reversed`.
+pub(crate) fn launch_keccak_base_row_pair(
+    stream: &CudaStream,
+    cols_dev: &CudaSlice<u64>,
+    col_stride: u64,
+    num_cols: u64,
+    num_rows: u64,
+    out_dev: &mut CudaViewMut<'_, u8>,
+) -> Result<()> {
+    debug_assert!(num_rows >= 2, "keccak row-pair leaf kernel: num_rows must be >= 2");
+    let be = backend()?;
+    let log_num_rows = num_rows.trailing_zeros() as u64;
+    // One thread per leaf (= row pair).
+    let cfg = keccak_launch_cfg(num_rows >> 1);
+    unsafe {
+        stream
+            .launch_builder(&be.keccak256_leaves_base_row_pair_batched)
+            .arg(cols_dev)
+            .arg(&col_stride)
+            .arg(&num_cols)
+            .arg(&num_rows)
+            .arg(&log_num_rows)
+            .arg(out_dev)
+            .launch(cfg)?;
+    }
+    Ok(())
+}
+
+/// Row-pair ext3 leaf hashing for the aux trace: reuses the comp-poly kernel
+/// (`keccak_comp_poly_leaves_ext3`), which hashes bit-reversed rows `2i`, `2i+1`
+/// across all ext3 columns. Writes `num_rows/2` leaves of 32 bytes.
+pub(crate) fn launch_keccak_ext3_row_pair(
+    stream: &CudaStream,
+    cols_dev: &CudaSlice<u64>,
+    col_stride: u64,
+    num_cols: u64,
+    num_rows: u64,
+    out_dev: &mut CudaViewMut<'_, u8>,
+) -> Result<()> {
+    debug_assert!(num_rows >= 2, "keccak row-pair leaf kernel: num_rows must be >= 2");
+    let be = backend()?;
+    let log_num_rows = num_rows.trailing_zeros() as u64;
+    let cfg = keccak_launch_cfg(num_rows >> 1);
+    unsafe {
+        stream
+            .launch_builder(&be.keccak_comp_poly_leaves_ext3)
+            .arg(cols_dev)
+            .arg(&col_stride)
+            .arg(&num_cols)
+            .arg(&num_rows)
+            .arg(&log_num_rows)
+            .arg(out_dev)
+            .launch(cfg)?;
+    }
+    Ok(())
+}
+
 /// Given `hashed_leaves` of length `leaves_len * 32`, build the full Merkle
 /// tree on device and return the complete node buffer `(2*leaves_len - 1) *
 /// 32` bytes in the standard layout:
diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs
index d614e233d..526f47ac3 100644
--- a/crypto/math-cuda/tests/keccak_leaves.rs
+++ b/crypto/math-cuda/tests/keccak_leaves.rs
@@ -97,6 +97,87 @@ fn keccak_leaves_ext3_matches_cpu() {
     }
 }
 
+#[test]
+fn keccak_leaves_base_row_pair_matches_cpu() {
+    // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`.
+    // GPU `keccak_leaves_base_row_pair` must match the CPU prover helper
+    // `keccak_leaves_row_pair_bit_reversed` over base columns.
+    for log_n in [4u32, 6, 8, 10, 12] {
+        for num_cols in [1usize, 5, 17, 41] {
+            let n = 1 << log_n;
+            let mut rng = ChaCha8Rng::seed_from_u64(500 + log_n as u64 + num_cols as u64);
+            let columns: Vec<Vec<Fp>> = (0..num_cols)
+                .map(|_| (0..n).map(|_| Fp::from_raw(rng.r#gen::<u64>())).collect())
+                .collect();
+
+            let cpu = keccak_leaves_row_pair_bit_reversed(&columns);
+            assert_eq!(cpu.len(), n / 2);
+
+            let mut flat = vec![0u64; num_cols * n];
+            for (c, col) in columns.iter().enumerate() {
+                for (r, e) in col.iter().enumerate() {
+                    flat[c * n + r] = *e.value();
+                }
+            }
+            let gpu = math_cuda::merkle::keccak_leaves_base_row_pair(&flat, n, num_cols, n).unwrap();
+            assert_eq!(gpu.len(), (n / 2) * 32);
+            for i in 0..n / 2 {
+                assert_eq!(
+                    &gpu[i * 32..(i + 1) * 32],
+                    &cpu[i][..],
+                    "base row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})"
+                );
+            }
+        }
+    }
+}
+
+#[test]
+fn keccak_leaves_ext3_row_pair_matches_cpu() {
+    for log_n in [4u32, 6, 8, 10] {
+        for num_cols in [1usize, 3, 11, 20] {
+            let n = 1 << log_n;
+            let mut rng = ChaCha8Rng::seed_from_u64(600 + log_n as u64 + num_cols as u64);
+            let columns: Vec<Vec<Fp3>> = (0..num_cols)
+                .map(|_| {
+                    (0..n)
+                        .map(|_| {
+                            Fp3::new([
+                                Fp::from_raw(rng.r#gen::<u64>()),
+                                Fp::from_raw(rng.r#gen::<u64>()),
+                                Fp::from_raw(rng.r#gen::<u64>()),
+                            ])
+                        })
+                        .collect()
+                })
+                .collect();
+
+            let cpu = keccak_leaves_row_pair_bit_reversed(&columns);
+            assert_eq!(cpu.len(), n / 2);
+
+            // De-interleaved 3-slab layout per ext3 column (same as the 1-row
+            // ext3 leaf path): [col*3+k] each a contiguous slab of n u64s.
+            let mut flat = vec![0u64; num_cols * 3 * n];
+            for (c, col) in columns.iter().enumerate() {
+                for (r, e) in col.iter().enumerate() {
+                    flat[(c * 3) * n + r] = *e.value()[0].value();
+                    flat[(c * 3 + 1) * n + r] = *e.value()[1].value();
+                    flat[(c * 3 + 2) * n + r] = *e.value()[2].value();
+                }
+            }
+            let gpu = math_cuda::merkle::keccak_leaves_ext3_row_pair(&flat, n, num_cols, n).unwrap();
+            assert_eq!(gpu.len(), (n / 2) * 32);
+            for i in 0..n / 2 {
+                assert_eq!(
+                    &gpu[i * 32..(i + 1) * 32],
+                    &cpu[i][..],
+                    "ext3 row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})"
+                );
+            }
+        }
+    }
+}
+
 #[test]
 fn keccak_comp_poly_leaves_matches_cpu() {
     // Built tree's leaves live at byte offset `(num_leaves - 1) * 32` and
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index be6d5ccfe..5f09cd3b2 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -269,17 +269,17 @@ fn restore_columns_on_err<E: IsField>(columns: &mut [Vec<FieldElement<E>>], n: u
     }
 }
 
-/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves
+/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `num_leaves` leaves
 /// and return the node `Vec` (length-initialised, contents undefined) together
-/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if
-/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows
+/// with its node count `total_nodes` (`2 * num_leaves - 1`). Returns `None` if
+/// the layout would be invalid (`num_leaves < 2` or `total_nodes * 32` overflows
 /// `usize`). The caller builds the `&mut [u8]` byte view of length
 /// `total_nodes * 32` and must overwrite every byte via the GPU D2H.
-fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> {
-    if lde_size < 2 {
+fn alloc_merkle_nodes(num_leaves: usize) -> Option<(Vec<[u8; 32]>, usize)> {
+    if num_leaves < 2 {
         return None;
     }
-    let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?;
+    let total_nodes = 2usize.saturating_mul(num_leaves).checked_sub(1)?;
     let _byte_len = total_nodes.checked_mul(32)?;
     let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes);
     // SAFETY: every byte will be overwritten via the GPU D2H before the
@@ -471,7 +471,11 @@ where
         LayoutDispatch::Run { n, lde_size } => (n, lde_size),
     };
     let num_columns = columns.len();
-    let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?;
+    // Row-pair trace commit: one Merkle leaf per bit-reversed row pair, so the
+    // tree has `lde_size / ROWS_PER_LEAF` leaves (matches the CPU
+    // `commit_bit_reversed(.., ROWS_PER_LEAF)` and `verify_opening_pair`).
+    let num_leaves = lde_size / crate::commitment::ROWS_PER_LEAF;
+    let (mut nodes, total_nodes) = alloc_merkle_nodes(num_leaves)?;
     let node_byte_len = total_nodes
         .checked_mul(32)
         .expect("node byte length overflow");
@@ -528,7 +532,11 @@ where
         LayoutDispatch::Run { n, lde_size } => (n, lde_size),
     };
     let num_columns = columns.len();
-    let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?;
+    // Row-pair trace commit: one Merkle leaf per bit-reversed row pair, so the
+    // tree has `lde_size / ROWS_PER_LEAF` leaves (matches the CPU
+    // `commit_bit_reversed(.., ROWS_PER_LEAF)` and `verify_opening_pair`).
+    let num_leaves = lde_size / crate::commitment::ROWS_PER_LEAF;
+    let (mut nodes, total_nodes) = alloc_merkle_nodes(num_leaves)?;
     let node_byte_len = total_nodes
         .checked_mul(32)
         .expect("node byte length overflow");

From 7446a9b1b1ee4e5ba8df69ed7050f1216af11887 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Fri, 26 Jun 2026 11:09:44 -0300
Subject: [PATCH 11/19] refactor

---
 crypto/math-cuda/src/merkle.rs          | 104 ++++++------------------
 crypto/math-cuda/tests/keccak_leaves.rs |  12 +--
 2 files changed, 30 insertions(+), 86 deletions(-)

diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index f9ca74706..5e7e4dae5 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -25,15 +25,19 @@ use crate::lde::pack_ext3_to_pinned_slabs;
 /// Run GPU Keccak-256 leaf hashing on a base-field column buffer.
 ///
 /// `columns` must hold `num_cols * col_stride` u64s with column `c`'s data
-/// at `[c*col_stride .. c*col_stride + num_rows]`. Returns `num_rows * 32`
-/// hash bytes in natural (non-bit-reversed) row order.
+/// at `[c*col_stride .. c*col_stride + num_rows]`. `rows_per_leaf` selects the
+/// leaf layout: `1` = one leaf per bit-reversed row (`num_rows` leaves), `2` =
+/// one leaf per bit-reversed row pair `2i`,`2i+1` (`num_rows/2` leaves, the
+/// trace-commit layout). Returns `(num_rows / rows_per_leaf) * 32` hash bytes.
 pub fn keccak_leaves_base(
     columns: &[u64],
     col_stride: usize,
     num_cols: usize,
     num_rows: usize,
+    rows_per_leaf: usize,
 ) -> Result<Vec<u8>> {
     assert!(num_rows.is_power_of_two());
+    assert!(rows_per_leaf == 1 || rows_per_leaf == 2);
     assert!(
         col_stride >= num_rows,
         "col_stride must be >= num_rows to keep per-column reads in-bounds"
@@ -45,8 +49,13 @@ pub fn keccak_leaves_base(
     let be = backend()?;
     let stream = be.next_stream();
     let cols_dev = stream.clone_htod(&columns[..total])?;
-    let mut out_dev = stream.alloc_zeros::<u8>(num_rows * 32)?;
-    launch_keccak_base(
+    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / rows_per_leaf) * 32)?;
+    let launch = if rows_per_leaf == 2 {
+        launch_keccak_base_row_pair
+    } else {
+        launch_keccak_base
+    };
+    launch(
         stream.as_ref(),
         &cols_dev,
         col_stride as u64,
@@ -60,14 +69,17 @@ pub fn keccak_leaves_base(
 }
 
 /// Ext3 variant. Columns interleaved as three base slabs per ext3 column.
-/// `columns.len() >= num_cols * 3 * col_stride`.
+/// `columns.len() >= num_cols * 3 * col_stride`. `rows_per_leaf` as in
+/// [`keccak_leaves_base`].
 pub fn keccak_leaves_ext3(
     columns: &[u64],
     col_stride: usize,
     num_cols: usize,
     num_rows: usize,
+    rows_per_leaf: usize,
 ) -> Result<Vec<u8>> {
     assert!(num_rows.is_power_of_two());
+    assert!(rows_per_leaf == 1 || rows_per_leaf == 2);
     assert!(
         col_stride >= num_rows,
         "col_stride must be >= num_rows to keep per-column reads in-bounds"
@@ -80,81 +92,13 @@ pub fn keccak_leaves_ext3(
     let be = backend()?;
     let stream = be.next_stream();
     let cols_dev = stream.clone_htod(&columns[..total])?;
-    let mut out_dev = stream.alloc_zeros::<u8>(num_rows * 32)?;
-    launch_keccak_ext3(
-        stream.as_ref(),
-        &cols_dev,
-        col_stride as u64,
-        num_cols as u64,
-        num_rows as u64,
-        &mut out_dev.as_view_mut(),
-    )?;
-    let out = stream.clone_dtoh(&out_dev)?;
-    stream.synchronize()?;
-    Ok(out)
-}
-
-/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`,
-/// `2i+1`. Returns `num_rows/2 * 32` hash bytes. Parity entry point for the
-/// CPU `keccak_leaves_row_pair_bit_reversed` over base columns (the layout the
-/// trace commit uses).
-pub fn keccak_leaves_base_row_pair(
-    columns: &[u64],
-    col_stride: usize,
-    num_cols: usize,
-    num_rows: usize,
-) -> Result<Vec<u8>> {
-    assert!(num_rows.is_power_of_two());
-    assert!(num_rows >= 2);
-    assert!(
-        col_stride >= num_rows,
-        "col_stride must be >= num_rows to keep per-column reads in-bounds"
-    );
-    let total = num_cols
-        .checked_mul(col_stride)
-        .expect("num_cols * col_stride overflows usize");
-    assert!(columns.len() >= total);
-    let be = backend()?;
-    let stream = be.next_stream();
-    let cols_dev = stream.clone_htod(&columns[..total])?;
-    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / 2) * 32)?;
-    launch_keccak_base_row_pair(
-        stream.as_ref(),
-        &cols_dev,
-        col_stride as u64,
-        num_cols as u64,
-        num_rows as u64,
-        &mut out_dev.as_view_mut(),
-    )?;
-    let out = stream.clone_dtoh(&out_dev)?;
-    stream.synchronize()?;
-    Ok(out)
-}
-
-/// Row-pair ext3 variant. Returns `num_rows/2 * 32` hash bytes. Parity entry
-/// point for the CPU `keccak_leaves_row_pair_bit_reversed` over ext3 columns.
-pub fn keccak_leaves_ext3_row_pair(
-    columns: &[u64],
-    col_stride: usize,
-    num_cols: usize,
-    num_rows: usize,
-) -> Result<Vec<u8>> {
-    assert!(num_rows.is_power_of_two());
-    assert!(num_rows >= 2);
-    assert!(
-        col_stride >= num_rows,
-        "col_stride must be >= num_rows to keep per-column reads in-bounds"
-    );
-    let total = num_cols
-        .checked_mul(3)
-        .and_then(|v| v.checked_mul(col_stride))
-        .expect("num_cols * 3 * col_stride overflows usize");
-    assert!(columns.len() >= total);
-    let be = backend()?;
-    let stream = be.next_stream();
-    let cols_dev = stream.clone_htod(&columns[..total])?;
-    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / 2) * 32)?;
-    launch_keccak_ext3_row_pair(
+    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / rows_per_leaf) * 32)?;
+    let launch = if rows_per_leaf == 2 {
+        launch_keccak_ext3_row_pair
+    } else {
+        launch_keccak_ext3
+    };
+    launch(
         stream.as_ref(),
         &cols_dev,
         col_stride as u64,
diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs
index 526f47ac3..61a861f32 100644
--- a/crypto/math-cuda/tests/keccak_leaves.rs
+++ b/crypto/math-cuda/tests/keccak_leaves.rs
@@ -38,7 +38,7 @@ fn keccak_leaves_base_matches_cpu() {
                     flat[c * n + r] = *e.value();
                 }
             }
-            let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n).unwrap();
+            let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 1).unwrap();
             assert_eq!(gpu.len(), n * 32);
             for i in 0..n {
                 assert_eq!(
@@ -84,7 +84,7 @@ fn keccak_leaves_ext3_matches_cpu() {
                     flat[(c * 3 + 2) * n + r] = *e.value()[2].value();
                 }
             }
-            let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n).unwrap();
+            let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 1).unwrap();
             assert_eq!(gpu.len(), n * 32);
             for i in 0..n {
                 assert_eq!(
@@ -100,8 +100,8 @@ fn keccak_leaves_ext3_matches_cpu() {
 #[test]
 fn keccak_leaves_base_row_pair_matches_cpu() {
     // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`.
-    // GPU `keccak_leaves_base_row_pair` must match the CPU prover helper
-    // `keccak_leaves_row_pair_bit_reversed` over base columns.
+    // GPU `keccak_leaves_base(.., rows_per_leaf=2)` must match the CPU prover
+    // helper `keccak_leaves_row_pair_bit_reversed` over base columns.
     for log_n in [4u32, 6, 8, 10, 12] {
         for num_cols in [1usize, 5, 17, 41] {
             let n = 1 << log_n;
@@ -119,7 +119,7 @@ fn keccak_leaves_base_row_pair_matches_cpu() {
                     flat[c * n + r] = *e.value();
                 }
             }
-            let gpu = math_cuda::merkle::keccak_leaves_base_row_pair(&flat, n, num_cols, n).unwrap();
+            let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 2).unwrap();
             assert_eq!(gpu.len(), (n / 2) * 32);
             for i in 0..n / 2 {
                 assert_eq!(
@@ -165,7 +165,7 @@ fn keccak_leaves_ext3_row_pair_matches_cpu() {
                     flat[(c * 3 + 2) * n + r] = *e.value()[2].value();
                 }
             }
-            let gpu = math_cuda::merkle::keccak_leaves_ext3_row_pair(&flat, n, num_cols, n).unwrap();
+            let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 2).unwrap();
             assert_eq!(gpu.len(), (n / 2) * 32);
             for i in 0..n / 2 {
                 assert_eq!(

From ade38c1570439a9319540b120ba82ae3ee0554b1 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Fri, 26 Jun 2026 15:29:20 -0300
Subject: [PATCH 12/19] Fix CUDA LDE clippy lint

---
 AGENTS.md                   | 4 ++++
 crypto/math-cuda/src/lde.rs | 1 +
 2 files changed, 5 insertions(+)
 create mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 000000000..767d84610
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,4 @@
+# Agent Rules
+
+- Before pushing code or conflict-resolution commits to a PR branch, run `make lint` from the repository root. `cargo fmt --check` is not a substitute because CI's `Lint` job runs the full `make lint` target.
+- If `make lint` cannot complete, do not push unless the user explicitly accepts that risk, and report the incomplete lint result.
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index 59e01245f..8dc05ba01 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -675,6 +675,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep(
     Ok(handle)
 }
 
+#[allow(clippy::too_many_arguments)]
 fn coset_lde_batch_base_into_with_merkle_tree_inner(
     columns: &[&[u64]],
     blowup_factor: usize,

From 316dda9f6f9772c71f4783f776db5f2c8b809daf Mon Sep 17 00:00:00 2001
From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com>
Date: Fri, 26 Jun 2026 16:26:38 -0300
Subject: [PATCH 13/19] fix(cuda): align review cleanup with row-pair commits
 (#723)

---
 crypto/math-cuda/kernels/keccak.cu       |  4 +--
 crypto/math-cuda/src/lde.rs              | 45 ++++++++++++------------
 crypto/math-cuda/src/merkle.rs           | 18 +++++++++-
 crypto/math-cuda/tests/comp_poly_tree.rs |  4 +--
 4 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu
index 272b847dc..32dd75d47 100644
--- a/crypto/math-cuda/kernels/keccak.cu
+++ b/crypto/math-cuda/kernels/keccak.cu
@@ -159,8 +159,8 @@ extern "C" __global__ void keccak256_leaves_base_batched(
     uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
     if (tid >= num_rows) return;
 
-    // Bit-reverse the row index so we read columns at `br` but write the
-    // hashed leaf at `tid` — matching the CPU `commit_columns_bit_reversed`.
+    // Bit-reverse the row index so we read columns at `br` but write the hashed
+    // leaf at `tid` — matching the CPU per-row `commit_bit_reversed(.., 1)`.
     uint64_t br = __brevll(tid) >> (64 - log_num_rows);
 
     uint64_t st[25];
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index 8dc05ba01..c3f59fd90 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -37,11 +37,11 @@ fn assert_u32_domain(n: usize, what: &str) {
 /// Output shape requested from the fused LDE + Keccak entry points.
 #[derive(Copy, Clone, PartialEq, Eq)]
 enum KeccakCommit {
-    /// Only the `lde_size` keccak-256 leaves; no inner-tree build. Caller
-    /// receives `lde_size * 32` bytes.
+    /// Only the keccak-256 leaves; no inner-tree build. Caller receives
+    /// `num_leaves * 32` bytes.
     LeavesOnly,
     /// Full Merkle tree: leaves at the tail + inner nodes built on-device.
-    /// Caller receives `(2*lde_size - 1) * 32` bytes.
+    /// Caller receives `(2*num_leaves - 1) * 32` bytes.
     FullTree,
 }
 
@@ -595,12 +595,12 @@ pub fn coset_lde_batch_base_into(
     Ok(())
 }
 
-/// Fused LDE + Keccak-256 leaf hashing. Caller receives the `lde_size * 32`
-/// bytes of leaf hashes in `hashed_leaves_out` (one 32-byte digest per output
-/// row, in natural row order; leaves are computed reading columns at
-/// bit-reversed rows, matching `commit_columns_bit_reversed` on the CPU
-/// side). Thin wrapper over `coset_lde_batch_base_into_with_merkle_tree_inner`
-/// with `LeavesOnly` — no inner-tree build, no device handle.
+/// Fused LDE + row-pair Keccak-256 leaf hashing. Caller receives
+/// `(lde_size / 2) * 32` bytes of leaf hashes in `hashed_leaves_out` (one
+/// 32-byte digest per bit-reversed row pair, in natural leaf order, matching
+/// `commit_bit_reversed(.., 2)` on the CPU side). Thin wrapper over
+/// `coset_lde_batch_base_into_with_merkle_tree_inner` with `LeavesOnly` — no
+/// inner-tree build, no device handle.
 pub fn coset_lde_batch_base_into_with_leaf_hash(
     columns: &[&[u64]],
     blowup_factor: usize,
@@ -616,14 +616,15 @@ pub fn coset_lde_batch_base_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
-        1,
+        2,
     )
     .map(|_| ())
 }
 
 /// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full
-/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back
-/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`).
+/// row-pair Merkle tree on device and returns the `2*(lde_size/2) - 1` node
+/// buffer back to the caller in `merkle_nodes_out` (byte length
+/// `(2*(lde_size/2) - 1) * 32`).
 ///
 /// The leaf hashes are never exposed to the caller — they stay on device and
 /// feed straight into the pair-hash tree kernel, avoiding the
@@ -644,7 +645,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
-        1,
+        2,
     )
     .map(|_| ())
 }
@@ -799,8 +800,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     )?;
 
     // Allocate the device output buffer. In `LeavesOnly` mode this is just
-    // `lde_size * 32` bytes (the leaves themselves); in `FullTree` mode it's
-    // `(2*lde_size - 1) * 32` bytes (leaves in the tail + inner nodes filled
+    // `num_leaves * 32` bytes (the leaves themselves); in `FullTree` mode it's
+    // `(2*num_leaves - 1) * 32` bytes (leaves in the tail + inner nodes filled
     // below). `alloc` (not `alloc_zeros`) is safe because every byte is
     // written before any reader sees it: the keccak kernel fills the
     // leaves slab, the inner-tree pass (when present) fills the head.
@@ -857,8 +858,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     }
 }
 
-/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused
-/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over
+/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused LDE +
+/// row-pair Keccak-256 leaf hashing over ext3 columns. Thin wrapper over
 /// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`.
 pub fn coset_lde_batch_ext3_into_with_leaf_hash(
     columns: &[&[u64]],
@@ -877,14 +878,14 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
-        1,
+        2,
     )
     .map(|_| ())
 }
 
 /// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`.
 /// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE
-/// evaluations and the full `2*lde_size - 1` node buffer.
+/// evaluations and the full `2*(lde_size/2) - 1` row-pair node buffer.
 pub fn coset_lde_batch_ext3_into_with_merkle_tree(
     columns: &[&[u64]],
     n: usize,
@@ -902,7 +903,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
-        1,
+        2,
     )
     .map(|_| ())
 }
@@ -1055,8 +1056,8 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
         mb_u32,
     )?;
 
-    // Allocate device output buffer (LeavesOnly → lde_size*32; FullTree →
-    // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the
+    // Allocate device output buffer (LeavesOnly -> num_leaves*32; FullTree ->
+    // (2*num_leaves - 1)*32). Leaf kernel writes to the leaves slab; the
     // inner-tree pass (when present) fills the head.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_dev_bytes) }?;
     let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves);
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index ef84c6da4..27f38ce0a 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -3,7 +3,7 @@
 //! Matches `FieldElementVectorBackend<F, Keccak256, 32>::hash_data` in
 //! `crypto/crypto/src/merkle_tree/backends/field_element_vector.rs`, combined
 //! with the `reverse_index` row read pattern used in
-//! `commit_columns_bit_reversed` at `crypto/stark/src/prover.rs`.
+//! `commit_bit_reversed` at `crypto/stark/src/commitment.rs`.
 //!
 //! Caller supplies base-field column slabs already laid out as
 //! `[col * col_stride + row]` (the same layout `coset_lde_batch_base_into`
@@ -38,6 +38,14 @@ pub fn keccak_leaves_base(
 ) -> Result<Vec<u8>> {
     assert!(num_rows.is_power_of_two());
     assert!(rows_per_leaf == 1 || rows_per_leaf == 2);
+    assert!(
+        num_rows >= rows_per_leaf,
+        "num_rows must be at least rows_per_leaf"
+    );
+    assert!(
+        num_rows >= 2,
+        "num_rows must be at least 2 for bit-reversed GPU leaf hashing"
+    );
     assert!(
         col_stride >= num_rows,
         "col_stride must be >= num_rows to keep per-column reads in-bounds"
@@ -80,6 +88,14 @@ pub fn keccak_leaves_ext3(
 ) -> Result<Vec<u8>> {
     assert!(num_rows.is_power_of_two());
     assert!(rows_per_leaf == 1 || rows_per_leaf == 2);
+    assert!(
+        num_rows >= rows_per_leaf,
+        "num_rows must be at least rows_per_leaf"
+    );
+    assert!(
+        num_rows >= 2,
+        "num_rows must be at least 2 for bit-reversed GPU leaf hashing"
+    );
     assert!(
         col_stride >= num_rows,
         "col_stride must be >= num_rows to keep per-column reads in-bounds"
diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs
index 29e33b6fe..51b826dd1 100644
--- a/crypto/math-cuda/tests/comp_poly_tree.rs
+++ b/crypto/math-cuda/tests/comp_poly_tree.rs
@@ -1,6 +1,6 @@
 //! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree`
 //! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU
-//! pipeline produced by `commit_composition_polynomial`.
+//! row-pair commitment layout used by `commit_bit_reversed(.., 2)`.
 
 use math::field::element::FieldElement;
 use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
@@ -83,7 +83,7 @@ fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] {
     out
 }
 
-/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves.
+/// CPU: `commit_bit_reversed(.., 2)`-style tree root over num_rows/2 leaves.
 fn cpu_tree_nodes(parts: &[Vec<Fp3>]) -> Vec<[u8; 32]> {
     let num_rows = parts[0].len();
     let num_parts = parts.len();

From 8ddadf48e28fbc1d94c9301e3d7283dbf8a3ccca Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 29 Jun 2026 13:03:12 -0300
Subject: [PATCH 14/19] fix(stark): silence dead_code on par_for_each_mut
 (debug-checks-only after merge)

---
 crypto/stark/src/par.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs
index 051651fde..cee693e3f 100644
--- a/crypto/stark/src/par.rs
+++ b/crypto/stark/src/par.rs
@@ -58,6 +58,9 @@ pub(crate) fn par_map_collect<R: Send>(
 
 /// Run `f(&mut item)` for each element of `slice`. Parallel when
 /// `feature = "parallel"`, sequential otherwise (ordering is irrelevant).
+// Only called from the `debug-checks`-gated column-LDE reconstruct path
+// (production LDE is row-major); keep it available without warning otherwise.
+#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))]
 pub(crate) fn par_for_each_mut<T: Send>(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) {
     #[cfg(feature = "parallel")]
     {

From 9655852ae32e244852b3c2fc00a50f5ff05b9351 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 29 Jun 2026 13:04:58 -0300
Subject: [PATCH 15/19] style(stark): cargo fmt after merge resolution

---
 crypto/stark/src/prover.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 189d835ae..a3c64c4fb 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -509,7 +509,6 @@ fn columns_to_row_major<E: IsField>(
     (data, num_cols)
 }
 
-
 /// The functionality of a STARK prover providing methods to run the STARK Prove protocol
 /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html
 /// The default implementation is complete and is compatible with Stone prover
@@ -615,7 +614,6 @@ pub trait IsStarkProver<
         Some((tree, root))
     }
 
-
     /// Compute the LDE commitment for a subset of columns from a trace (for testing).
     ///
     /// This helper computes the same commitment the prover generates internally,
@@ -2120,8 +2118,9 @@ pub trait IsStarkProver<
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
                         #[allow(unused_mut)]
-                        let (mut tree, root) = Self::commit_rows_bit_reversed(&aux_data, total_cols)
-                            .ok_or(ProvingError::EmptyCommitment)?;
+                        let (mut tree, root) =
+                            Self::commit_rows_bit_reversed(&aux_data, total_cols)
+                                .ok_or(ProvingError::EmptyCommitment)?;
                         #[cfg(feature = "disk-spill")]
                         Self::spill_tree(&mut tree, storage_mode, "aux Merkle tree")?;
                         let commit = TableCommit::plain(tree, root);

From 04e1faed699388981d3d95fb5272e420e0622fcb Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 29 Jun 2026 13:10:28 -0300
Subject: [PATCH 16/19] test(cuda): focused GPU row-pair commitment
 prove+verify test

---
 prover/tests/cuda_path_integration.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index 0f7c1f3c7..583645379 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -66,3 +66,24 @@ fn gpu_path_fires_end_to_end() {
     let ok = verify(&proof, &elf).expect("verify");
     assert!(ok, "GPU-produced proof failed verification");
 }
+
+/// Focused validation of the GPU row-pair trace commitment: proves a large
+/// trace with the GPU path and verifies the resulting proof. Independent of the
+/// per-round counter assertions in `gpu_path_fires_end_to_end` (the R2 parts-LDE
+/// assertion bit-rotted on main and cuts off before the verify). A wrong GPU
+/// trace-commit leaf layout (1-row vs the new row-pair) would fail verification.
+#[test]
+#[ignore = "requires GPU; run with --ignored --nocapture"]
+fn gpu_proof_verifies_row_pair_commitment() {
+    let elf = asm_elf_bytes("fib_iterative_1M");
+    reset_all_gpu_call_counters();
+    let proof = prove(&elf).expect("prove");
+    assert!(
+        gpu_lde_calls() > 0,
+        "GPU LDE path did not fire (silent CPU fallback would not test the GPU commit)"
+    );
+    assert!(
+        verify(&proof, &elf).expect("verify"),
+        "GPU-produced proof (row-pair commitment) failed verification"
+    );
+}

From 05bb11ac4797fefae9f22dbd6ec9b7fdffbfc525 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Mon, 29 Jun 2026 13:15:44 -0300
Subject: [PATCH 17/19] fix(cuda): drop stale R2 parts-LDE asserts (#700 fused
 path), silence GPU column-LDE dead_code

The R2 parts-LDE / comp-poly-tree GPU dispatches no longer fire since #699/#700
route degree-3 tables through the 2-part fused coset_lde_full path (no AIR has
number_of_parts > 2). try_expand_columns_batched* are debug-checks-only after
the #650 row-major LDE became production.
---
 crypto/stark/src/gpu_lde.rs           |  2 ++
 prover/tests/cuda_path_integration.rs | 19 +++++++------------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 5f09cd3b2..5b9ad09f7 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -303,6 +303,7 @@ fn alloc_merkle_nodes(num_leaves: usize) -> Option<(Vec<[u8; 32]>, usize)> {
 /// Returns `Some(())` if the batch was handled on GPU and `columns` now holds
 /// the LDE evaluations, or if there were no columns to expand. Returns `None`
 /// to let the caller run the per-column CPU fallback.
+#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))]
 pub(crate) fn try_expand_columns_batched<F, E>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
@@ -582,6 +583,7 @@ where
 /// transform uses only base-field twiddles and coset weights, which act
 /// componentwise on ext3, so the per-component result equals the ext3 LDE the
 /// CPU path computes.
+#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))]
 fn try_expand_columns_batched_ext3<F, E>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index 583645379..0c0bb00c2 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -11,8 +11,8 @@
 use lambda_vm_prover::test_utils::asm_elf_bytes;
 use lambda_vm_prover::{prove, verify};
 use stark::gpu_lde::{
-    gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls,
-    gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls, reset_all_gpu_call_counters,
+    gpu_bary_calls, gpu_batch_invert_calls, gpu_deep_calls, gpu_fri_calls, gpu_lde_calls,
+    reset_all_gpu_call_counters,
 };
 
 #[test]
@@ -36,16 +36,11 @@ fn gpu_path_fires_end_to_end() {
     // path.
     assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire");
 
-    // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's
-    // `number_of_parts > 2`. The branch and shift tables have degree-3
-    // transition constraints, so this triggers on any non-trivial prove.
-    assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire");
-
-    // R2 comp-poly Merkle tree build, paired with the parts LDE above.
-    assert!(
-        gpu_comp_poly_tree_calls() > 0,
-        "R2 GPU comp-poly tree did not fire"
-    );
+    // The R2 parts-LDE and comp-poly-tree GPU dispatches are obsolete since
+    // #699/#700 routed degree-3 tables through the 2-part decompose_and_extend_d2
+    // + fused coset_lde_full path: no VM AIR has number_of_parts > 2 anymore, so
+    // try_evaluate_parts_on_lde_gpu_keep (and its paired tree build) no longer
+    // fire. (These assertions bit-rotted against the fused composition LDE.)
 
     // DEEP fires once per table that took the R1 GPU path.
     assert!(gpu_deep_calls() > 0, "R4 GPU DEEP composition did not fire");

From 275eadd1478cff31dc1be47b2137984de84be328 Mon Sep 17 00:00:00 2001
From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com>
Date: Mon, 29 Jun 2026 18:11:00 -0300
Subject: [PATCH 18/19] =?UTF-8?q?review(stark):=20address=20PR=20#735=20re?=
 =?UTF-8?q?view=20=E2=80=94=20coverage,=20dead=20code,=20cleanups=20(#740)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to the row-pair commitment PR. Excludes the intentional
proof-format break (proof_sym removal / leaf-count change), which is
by design.

Test coverage (in their own files under crypto/stark/src/tests/, per the
crate's test structure):
- tests/commitment_tests.rs: direct unit tests pinning the row-pair leaf
  layout (R=1 and R=2) against an independent reference, wrapper
  agreement, commit-root consistency, and empty-input short-circuit.
  Previously the leaf layout was only covered transitively via full
  prove->verify, and GPU parity tests compared against an inline reimpl
  rather than this module.
- tests/row_pair_opening_tests.rs: two negative tests for the row-pair
  verify_opening_pair — a tampered symmetric trace evaluation and a
  corrupted Merkle authentication path must both be rejected. Removing
  proof_sym deleted the old "symmetric opening mismatch" rejection class;
  these restore it (an impl ignoring evaluations_sym / the auth path
  would otherwise pass every existing test). Reuses the now pub(crate)
  make_valid_simple_proof helper.
- cuda_path_integration.rs: restore assert!(gpu_comp_poly_tree_calls() > 0).
  try_build_comp_poly_tree_gpu is dispatched unconditionally (round 2,
  after the parts-count branch), so it fires for the common
  number_of_parts == 2 (degree-3) case — it was NOT obsolete. Keep the
  genuinely-dead parts-LDE assertion dropped.

Cleanups:
- Delete dead fn commit_plain (zero callers; main/aux commit inline
  commit_rows_bit_reversed + spill_tree, which are row-major and
  incompatible with its column-major signature — the dedup never landed).
- Delete orphaned pub fn columns2rows (all callers removed by #735).
- Add ProvingError::Fft and map FFTError to it instead of WrongParameter
  (internal FFT failure is not a caller-supplied-parameter error).
- Fix stale profiler label commit_composition_poly -> commit_bit_reversed.
- Drop the rot-prone "// = 2" comment on the local ROWS_PER_LEAF alias.
---
 crypto/stark/src/prover.rs                    | 32 ++-----
 crypto/stark/src/tests/commitment_tests.rs    | 96 +++++++++++++++++++
 crypto/stark/src/tests/mod.rs                 |  2 +
 .../stark/src/tests/row_pair_opening_tests.rs | 73 ++++++++++++++
 crypto/stark/src/tests/small_trace_tests.rs   |  2 +-
 crypto/stark/src/trace.rs                     | 16 ----
 prover/src/instruments.rs                     |  2 +-
 prover/tests/cuda_path_integration.rs         | 23 +++--
 8 files changed, 198 insertions(+), 48 deletions(-)
 create mode 100644 crypto/stark/src/tests/commitment_tests.rs
 create mode 100644 crypto/stark/src/tests/row_pair_opening_tests.rs

diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index a3c64c4fb..288288c06 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -88,11 +88,16 @@ pub enum ProvingError {
     /// out of disk space, fd exhaustion, or mmap failure.
     #[cfg(feature = "disk-spill")]
     DiskSpill(String),
+    /// An internal FFT/LDE computation failed (e.g. domain size exceeds the
+    /// field's two-adicity, or a degenerate coset offset). Distinct from
+    /// `WrongParameter` because the cause is internal prover machinery, not a
+    /// caller-supplied parameter. Carries the underlying `FFTError`'s message.
+    Fft(String),
 }
 
 impl From<FFTError> for ProvingError {
     fn from(e: FFTError) -> Self {
-        ProvingError::WrongParameter(format!("{e}"))
+        ProvingError::Fft(format!("{e}"))
     }
 }
 
@@ -573,7 +578,8 @@ pub trait IsStarkProver<
             "num_rows must be a power of two for reverse_index"
         );
 
-        const ROWS_PER_LEAF: usize = crate::commitment::ROWS_PER_LEAF; // = 2
+        // Local alias for the canonical constant, used several times below.
+        const ROWS_PER_LEAF: usize = crate::commitment::ROWS_PER_LEAF;
         let num_leaves = num_rows / ROWS_PER_LEAF;
         let subset_cols = col_end - col_start;
         let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
@@ -884,28 +890,6 @@ pub trait IsStarkProver<
         Ok(())
     }
 
-    /// Commit an already-LDE-expanded plain column set: build the row-paired
-    /// Merkle tree, spill it to disk if requested, and wrap it as a plain
-    /// `TableCommit`. Shared by the main-trace (non-preprocessed) and aux-trace
-    /// commit paths.
-    fn commit_plain<C>(
-        columns: &[Vec<FieldElement<C>>],
-        #[cfg(feature = "disk-spill")] storage_mode: StorageMode,
-        #[cfg(feature = "disk-spill")] spill_label: &str,
-    ) -> Result<TableCommit<C>, ProvingError>
-    where
-        C: IsField,
-        FieldElement<C>: AsBytes + Sync + Send + math::traits::ByteConversion,
-    {
-        #[allow(unused_mut)]
-        let (mut tree, root) =
-            crate::commitment::commit_bit_reversed(columns, crate::commitment::ROWS_PER_LEAF)
-                .ok_or(ProvingError::EmptyCommitment)?;
-        #[cfg(feature = "disk-spill")]
-        Self::spill_tree(&mut tree, storage_mode, spill_label)?;
-        Ok(TableCommit::plain(tree, root))
-    }
-
     /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments.
     ///
     /// Only used by `run_debug_checks` — Phase D consumes the cached LDE
diff --git a/crypto/stark/src/tests/commitment_tests.rs b/crypto/stark/src/tests/commitment_tests.rs
new file mode 100644
index 000000000..f1684112b
--- /dev/null
+++ b/crypto/stark/src/tests/commitment_tests.rs
@@ -0,0 +1,96 @@
+//! Unit tests for the Merkle commitment layer (`crate::commitment`): they pin
+//! the bit-reversed, row-grouped leaf byte layout that the GPU kernels and the
+//! verifier's `verify_opening_pair` must match. Previously this layout was only
+//! covered transitively through full prove→verify.
+
+use crate::commitment::{
+    ROWS_PER_LEAF, commit_bit_reversed, keccak_leaves_bit_reversed,
+    keccak_leaves_bit_reversed_grouped, keccak_leaves_row_pair_bit_reversed,
+};
+use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
+use math::fft::bit_reversing::reverse_index;
+use math::field::{element::FieldElement, goldilocks::GoldilocksField};
+use math::traits::ByteConversion;
+
+type F = GoldilocksField;
+type Felt = FieldElement<F>;
+
+/// 3 columns × 8 rows of distinct, nonzero values.
+fn sample_columns() -> Vec<Vec<Felt>> {
+    (0..3u64)
+        .map(|c| (0..8u64).map(|r| Felt::from(100 * c + r + 1)).collect())
+        .collect()
+}
+
+/// Independent reference for one leaf, written straight from the module-doc
+/// layout (`rows_per_leaf` consecutive bit-reversed rows, column-major within
+/// each row, big-endian), hashed once with the same backend the prover uses.
+/// Structurally separate from the production `map_init` loop, so a transposed
+/// row/column order or a wrong bit-reversal in production fails this check.
+fn expected_leaf(columns: &[Vec<Felt>], rows_per_leaf: usize, leaf_idx: usize) -> Commitment {
+    let num_rows = columns[0].len();
+    let byte_len = <Felt as ByteConversion>::BYTE_LEN;
+    let mut buf = vec![0u8; rows_per_leaf * columns.len() * byte_len];
+    let mut offset = 0;
+    for k in 0..rows_per_leaf {
+        let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64);
+        for col in columns {
+            col[br].write_bytes_be(&mut buf[offset..offset + byte_len]);
+            offset += byte_len;
+        }
+    }
+    BatchedMerkleTreeBackend::<F>::hash_bytes(&buf)
+}
+
+#[test]
+fn grouped_leaves_match_documented_layout_for_r1_and_r2() {
+    let columns = sample_columns();
+    let num_rows = columns[0].len();
+    for &rows_per_leaf in &[1usize, 2usize] {
+        let leaves = keccak_leaves_bit_reversed_grouped(&columns, rows_per_leaf);
+        assert_eq!(
+            leaves.len(),
+            num_rows / rows_per_leaf,
+            "leaf count for rows_per_leaf={rows_per_leaf}"
+        );
+        for (i, leaf) in leaves.iter().enumerate() {
+            assert_eq!(
+                *leaf,
+                expected_leaf(&columns, rows_per_leaf, i),
+                "leaf {i} for rows_per_leaf={rows_per_leaf}"
+            );
+        }
+    }
+}
+
+#[test]
+fn wrappers_agree_with_grouped() {
+    let columns = sample_columns();
+    assert_eq!(
+        keccak_leaves_bit_reversed(&columns),
+        keccak_leaves_bit_reversed_grouped(&columns, 1)
+    );
+    assert_eq!(
+        keccak_leaves_row_pair_bit_reversed(&columns),
+        keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF)
+    );
+}
+
+#[test]
+fn commit_root_matches_tree_built_over_leaves() {
+    let columns = sample_columns();
+    let leaves = keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF);
+    let tree = BatchedMerkleTree::<F>::build_from_hashed_leaves(leaves).unwrap();
+    let (_, root) = commit_bit_reversed(&columns, ROWS_PER_LEAF).unwrap();
+    assert_eq!(root, tree.root);
+}
+
+#[test]
+fn empty_and_zero_row_inputs_short_circuit() {
+    let empty: Vec<Vec<Felt>> = vec![];
+    assert!(keccak_leaves_bit_reversed_grouped(&empty, ROWS_PER_LEAF).is_empty());
+    assert!(commit_bit_reversed(&empty, ROWS_PER_LEAF).is_none());
+    let zero_rows: Vec<Vec<Felt>> = vec![vec![]];
+    assert!(keccak_leaves_bit_reversed_grouped(&zero_rows, ROWS_PER_LEAF).is_empty());
+    assert!(commit_bit_reversed(&zero_rows, ROWS_PER_LEAF).is_none());
+}
diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs
index 8c0897ac1..7a3884832 100644
--- a/crypto/stark/src/tests/mod.rs
+++ b/crypto/stark/src/tests/mod.rs
@@ -2,12 +2,14 @@ pub mod air_tests;
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug_tests;
 pub mod bus_tests;
+pub mod commitment_tests;
 pub mod domain_cache_stats;
 pub mod fri_tests;
 pub mod grinding_tests;
 pub mod proof_options_tests;
 pub mod prove_verify_roundtrip_tests;
 pub mod prover_tests;
+pub mod row_pair_opening_tests;
 pub mod small_trace_tests;
 #[cfg(feature = "disk-spill")]
 pub mod table_disk_spill_tests;
diff --git a/crypto/stark/src/tests/row_pair_opening_tests.rs b/crypto/stark/src/tests/row_pair_opening_tests.rs
new file mode 100644
index 000000000..89e7dc8f1
--- /dev/null
+++ b/crypto/stark/src/tests/row_pair_opening_tests.rs
@@ -0,0 +1,73 @@
+//! Negative tests for the row-pair trace opening verification
+//! (`verifier::verify_opening_pair`). The row pair `(2·iota, 2·iota+1)` is
+//! committed as a single Merkle leaf, so one `proof` authenticates both
+//! `evaluations` and `evaluations_sym`. Removing the old separate `proof_sym`
+//! opening deleted the "symmetric opening mismatch" rejection class; these
+//! tests restore it — an implementation that ignored `evaluations_sym` or the
+//! authentication path would otherwise pass every other test.
+
+use super::small_trace_tests::make_valid_simple_proof;
+use crate::verifier::{IsStarkVerifier, Verifier};
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use math::field::{element::FieldElement, goldilocks::GoldilocksField};
+
+type Felt = FieldElement<GoldilocksField>;
+
+/// Tampering the value at the symmetric LDE position must break verification:
+/// the committed leaf hashed `evaluations ‖ evaluations_sym`, so a perturbed
+/// `evaluations_sym` no longer reconstructs the committed leaf.
+#[test_log::test]
+fn test_verify_rejects_tampered_main_trace_evaluations_sym() {
+    let (air, mut proof) = make_valid_simple_proof();
+
+    let opening = proof
+        .deep_poly_openings
+        .first_mut()
+        .expect("test precondition: a valid proof has at least one deep poly opening");
+    assert!(
+        !opening.main_trace_polys.evaluations_sym.is_empty(),
+        "test precondition: the main-trace opening has at least one symmetric evaluation",
+    );
+    // Perturb (not resize) the first symmetric evaluation.
+    opening.main_trace_polys.evaluations_sym[0] =
+        &opening.main_trace_polys.evaluations_sym[0] + Felt::one();
+
+    assert!(
+        !Verifier::verify(
+            &proof,
+            &air,
+            &mut DefaultTranscript::<GoldilocksField>::new(&[])
+        ),
+        "Verifier must reject a tampered symmetric trace evaluation"
+    );
+}
+
+/// The row-pair Merkle authentication path itself must be checked. Corrupting a
+/// node in `main_trace_polys.proof.merkle_path` is caught ONLY by
+/// `verify_opening_pair` (the deep-composition reconstruction does not touch the
+/// auth path), so this proves the single row-pair path is actually authenticated
+/// against the committed root rather than ignored.
+#[test_log::test]
+fn test_verify_rejects_tampered_main_trace_merkle_path() {
+    let (air, mut proof) = make_valid_simple_proof();
+
+    let opening = proof
+        .deep_poly_openings
+        .first_mut()
+        .expect("test precondition: a valid proof has at least one deep poly opening");
+    let path = &mut opening.main_trace_polys.proof.merkle_path;
+    assert!(
+        !path.is_empty(),
+        "test precondition: the row-pair trace tree has a non-trivial authentication path",
+    );
+    path[0][0] ^= 0x01;
+
+    assert!(
+        !Verifier::verify(
+            &proof,
+            &air,
+            &mut DefaultTranscript::<GoldilocksField>::new(&[])
+        ),
+        "Verifier must reject a corrupted main-trace Merkle authentication path"
+    );
+}
diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs
index 8373ae9d6..1e081910b 100644
--- a/crypto/stark/src/tests/small_trace_tests.rs
+++ b/crypto/stark/src/tests/small_trace_tests.rs
@@ -17,7 +17,7 @@ use crate::{
 
 type Felt = FieldElement<GoldilocksField>;
 
-fn make_valid_simple_proof() -> (
+pub(crate) fn make_valid_simple_proof() -> (
     SimpleAdditionAIR<GoldilocksField>,
     crate::proof::stark::StarkProof<
         GoldilocksField,
diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index da4a53f6e..72b77947a 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -645,22 +645,6 @@ where
     Table::new(table_data, table_width)
 }
 
-pub fn columns2rows<F>(columns: Vec<Vec<F>>) -> Vec<Vec<F>>
-where
-    F: Clone,
-{
-    let num_rows = columns[0].len();
-    let num_cols = columns.len();
-
-    (0..num_rows)
-        .map(|row_index| {
-            (0..num_cols)
-                .map(|col_index| columns[col_index][row_index].clone())
-                .collect()
-        })
-        .collect()
-}
-
 pub(crate) fn compute_frame_evaluation_points<F, E>(
     x: &FieldElement<E>,
     frame_offsets: &[usize],
diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs
index f15223e18..b787af0da 100644
--- a/prover/src/instruments.rs
+++ b/prover/src/instruments.rs
@@ -173,7 +173,7 @@ pub fn print_report(
             let mut sub_ops: Vec<(&str, Duration)> = vec![
                 ("R2  evaluate", total_constraints),
                 ("R2  decompose_and_extend_d2", total_comp_decompose),
-                ("R2  commit_composition_poly", total_comp_commit),
+                ("R2  commit_bit_reversed (comp-poly)", total_comp_commit),
                 ("R3  OOD evaluation", total_ood),
                 ("R4  deep_composition_poly_evals", total_deep_comp),
                 ("R4  interpolate+evaluate_fft", total_deep_extend),
diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index 0c0bb00c2..521ecd0d3 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -11,8 +11,8 @@
 use lambda_vm_prover::test_utils::asm_elf_bytes;
 use lambda_vm_prover::{prove, verify};
 use stark::gpu_lde::{
-    gpu_bary_calls, gpu_batch_invert_calls, gpu_deep_calls, gpu_fri_calls, gpu_lde_calls,
-    reset_all_gpu_call_counters,
+    gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls,
+    gpu_fri_calls, gpu_lde_calls, reset_all_gpu_call_counters,
 };
 
 #[test]
@@ -36,11 +36,22 @@ fn gpu_path_fires_end_to_end() {
     // path.
     assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire");
 
-    // The R2 parts-LDE and comp-poly-tree GPU dispatches are obsolete since
+    // R2 parts-LDE (try_evaluate_parts_on_lde_gpu_keep) is obsolete: since
     // #699/#700 routed degree-3 tables through the 2-part decompose_and_extend_d2
-    // + fused coset_lde_full path: no VM AIR has number_of_parts > 2 anymore, so
-    // try_evaluate_parts_on_lde_gpu_keep (and its paired tree build) no longer
-    // fire. (These assertions bit-rotted against the fused composition LDE.)
+    // + fused coset_lde_full path, no VM AIR has number_of_parts > 2, so that
+    // dispatch lives only in the dead `> 2` fallback. We intentionally do NOT
+    // assert gpu_parts_lde_calls() here.
+    //
+    // The comp-poly-tree build is NOT obsolete, though: try_build_comp_poly_tree_gpu
+    // is called unconditionally after the parts-count branch (prover.rs round 2),
+    // so it fires for the common number_of_parts == 2 (degree-3) case. A silent
+    // CPU fallback here would still verify, so the end-to-end check below cannot
+    // catch it — this counter assertion is what guards the GPU comp-poly-tree
+    // dispatch.
+    assert!(
+        gpu_comp_poly_tree_calls() > 0,
+        "R2 GPU comp-poly tree did not fire"
+    );
 
     // DEEP fires once per table that took the R1 GPU path.
     assert!(gpu_deep_calls() > 0, "R4 GPU DEEP composition did not fire");

From bd1ebbbeec2bf39633af00568b00d16e08af73b1 Mon Sep 17 00:00:00 2001
From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com>
Date: Mon, 29 Jun 2026 18:34:16 -0300
Subject: [PATCH 19/19] review(stark): tidy test layout + remove AGENTS.md
 (follow-up to #740) (#744)

Two items that landed too late for #740:

- Move the shared make_valid_simple_proof helper out of small_trace_tests.rs
  into tests/trace_test_helpers.rs, where the crate keeps shared test
  helpers (matching how prover_tests sources get_trace_evaluations).
  row_pair_opening_tests.rs and small_trace_tests.rs now both import it
  from there instead of one test file reaching sideways into another.
- Delete AGENTS.md (added by #735).
---
 AGENTS.md                                     |  4 ---
 .../stark/src/tests/row_pair_opening_tests.rs |  2 +-
 crypto/stark/src/tests/small_trace_tests.rs   | 26 +-------------
 crypto/stark/src/tests/trace_test_helpers.rs  | 36 +++++++++++++++++++
 4 files changed, 38 insertions(+), 30 deletions(-)
 delete mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
deleted file mode 100644
index 767d84610..000000000
--- a/AGENTS.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Agent Rules
-
-- Before pushing code or conflict-resolution commits to a PR branch, run `make lint` from the repository root. `cargo fmt --check` is not a substitute because CI's `Lint` job runs the full `make lint` target.
-- If `make lint` cannot complete, do not push unless the user explicitly accepts that risk, and report the incomplete lint result.
diff --git a/crypto/stark/src/tests/row_pair_opening_tests.rs b/crypto/stark/src/tests/row_pair_opening_tests.rs
index 89e7dc8f1..93423f49f 100644
--- a/crypto/stark/src/tests/row_pair_opening_tests.rs
+++ b/crypto/stark/src/tests/row_pair_opening_tests.rs
@@ -6,7 +6,7 @@
 //! tests restore it — an implementation that ignored `evaluations_sym` or the
 //! authentication path would otherwise pass every other test.
 
-use super::small_trace_tests::make_valid_simple_proof;
+use crate::tests::trace_test_helpers::make_valid_simple_proof;
 use crate::verifier::{IsStarkVerifier, Verifier};
 use crypto::fiat_shamir::default_transcript::DefaultTranscript;
 use math::field::{element::FieldElement, goldilocks::GoldilocksField};
diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs
index 1e081910b..96e04858d 100644
--- a/crypto/stark/src/tests/small_trace_tests.rs
+++ b/crypto/stark/src/tests/small_trace_tests.rs
@@ -11,37 +11,13 @@ use crate::{
     },
     proof::options::ProofOptions,
     prover::{IsStarkProver, Prover},
+    tests::trace_test_helpers::make_valid_simple_proof,
     traits::AIR,
     verifier::{IsStarkVerifier, Verifier},
 };
 
 type Felt = FieldElement<GoldilocksField>;
 
-pub(crate) fn make_valid_simple_proof() -> (
-    SimpleAdditionAIR<GoldilocksField>,
-    crate::proof::stark::StarkProof<
-        GoldilocksField,
-        GoldilocksField,
-        SimpleAdditionPublicInputs<GoldilocksField>,
-    >,
-) {
-    let mut trace = simple_addition_trace::<GoldilocksField>(2);
-    let proof_options = ProofOptions::default_test_options();
-    let pub_inputs = SimpleAdditionPublicInputs {
-        a: Felt::from(1u64),
-        b: Felt::from(2u64),
-    };
-    let air = SimpleAdditionAIR::<GoldilocksField>::new(&proof_options);
-    let proof = Prover::prove(
-        &air,
-        &mut trace,
-        &pub_inputs,
-        &mut DefaultTranscript::<GoldilocksField>::new(&[]),
-    )
-    .unwrap();
-    (air, proof)
-}
-
 /// Test STARK prove/verify with a single-row trace.
 /// This exercises the FRI protocol with 0 FRI layers (trace_length=1, number_layers=0).
 #[test_log::test]
diff --git a/crypto/stark/src/tests/trace_test_helpers.rs b/crypto/stark/src/tests/trace_test_helpers.rs
index e62d0d3ec..4ef6455b3 100644
--- a/crypto/stark/src/tests/trace_test_helpers.rs
+++ b/crypto/stark/src/tests/trace_test_helpers.rs
@@ -1,8 +1,16 @@
+use crate::examples::simple_addition::{
+    SimpleAdditionAIR, SimpleAdditionPublicInputs, simple_addition_trace,
+};
+use crate::proof::options::ProofOptions;
+use crate::prover::{IsStarkProver, Prover};
 use crate::table::Table;
 use crate::trace::{TraceTable, compute_frame_evaluation_points};
+use crate::traits::AIR;
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
 use itertools::Itertools;
 use math::field::{
     element::FieldElement,
+    goldilocks::GoldilocksField,
     traits::{IsField, IsSubFieldOf},
 };
 use math::polynomial::Polynomial;
@@ -10,6 +18,34 @@ use math::polynomial::Polynomial;
 #[cfg(feature = "parallel")]
 use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
 
+/// Builds a valid 2-row `SimpleAddition` proof. Shared base for the
+/// proof-tamper / rejection tests in `small_trace_tests` and
+/// `row_pair_opening_tests`.
+pub fn make_valid_simple_proof() -> (
+    SimpleAdditionAIR<GoldilocksField>,
+    crate::proof::stark::StarkProof<
+        GoldilocksField,
+        GoldilocksField,
+        SimpleAdditionPublicInputs<GoldilocksField>,
+    >,
+) {
+    let mut trace = simple_addition_trace::<GoldilocksField>(2);
+    let proof_options = ProofOptions::default_test_options();
+    let pub_inputs = SimpleAdditionPublicInputs {
+        a: FieldElement::from(1u64),
+        b: FieldElement::from(2u64),
+    };
+    let air = SimpleAdditionAIR::<GoldilocksField>::new(&proof_options);
+    let proof = Prover::prove(
+        &air,
+        &mut trace,
+        &pub_inputs,
+        &mut DefaultTranscript::<GoldilocksField>::new(&[]),
+    )
+    .unwrap();
+    (air, proof)
+}
+
 /// Reference Horner-based trace-evaluation used as an oracle by the prover
 /// tests (`tests::prover_tests`). The production prover uses the LDE-based
 /// barycentric `get_trace_evaluations_from_lde`; the two are