diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu
index 9937d7c6e..557b8dd43 100644
--- a/crypto/math-cuda/kernels/keccak.cu
+++ b/crypto/math-cuda/kernels/keccak.cu
@@ -159,8 +159,8 @@ extern "C" __global__ void keccak256_leaves_base_batched(
     uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
     if (tid >= num_rows) return;
 
-    // Bit-reverse the row index so we read columns at `br` but write the
-    // hashed leaf at `tid` — matching the CPU `commit_columns_bit_reversed`.
+    // Bit-reverse the row index so we read columns at `br` but write the hashed
+    // leaf at `tid` — matching the CPU per-row `commit_bit_reversed(.., 1)`.
     uint64_t br = __brevll(tid) >> (64 - log_num_rows);
 
     uint64_t st[25];
@@ -181,6 +181,51 @@ extern "C" __global__ void keccak256_leaves_base_batched(
     finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
 }
 
+// ---------------------------------------------------------------------------
+// Goldilocks BASE-FIELD row-pair leaf hashing.
+//
+// Leaf `leaf_idx` hashes TWO consecutive bit-reversed rows
+//   br_0 = reverse_index(2*leaf_idx),  br_1 = reverse_index(2*leaf_idx + 1)
+// each written column-by-column in canonical BE (same per-row byte layout as
+// `keccak256_leaves_base_batched`), in (br_0 row: col 0..K-1) then (br_1 row:
+// col 0..K-1) order. `num_leaves = num_rows / 2`; writes 32 bytes to
+// `hashed_leaves_out[leaf_idx * 32 ..]`. Matches the CPU
+// `keccak_leaves_row_pair_bit_reversed` (rows_per_leaf = 2) — the base-field
+// analog of `keccak_comp_poly_leaves_ext3`.
+// ---------------------------------------------------------------------------
+extern "C" __global__ void keccak256_leaves_base_row_pair_batched(
+    const uint64_t *columns_base_ptr,
+    uint64_t col_stride,
+    uint64_t num_cols,
+    uint64_t num_rows,
+    uint64_t log_num_rows,
+    uint8_t *hashed_leaves_out) {
+    uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    uint64_t num_leaves = num_rows >> 1;
+    if (tid >= num_leaves) return;
+
+    uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows);
+    uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows);
+
+    uint64_t st[25];
+    #pragma unroll
+    for (int i = 0; i < 25; ++i) st[i] = 0;
+
+    uint32_t rate_pos = 0;
+    // First row (br_0): col 0..K-1.
+    for (uint64_t c = 0; c < num_cols; ++c) {
+        uint64_t v = columns_base_ptr[c * col_stride + br_0];
+        absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v)));
+    }
+    // Second row (br_1): col 0..K-1.
+    for (uint64_t c = 0; c < num_cols; ++c) {
+        uint64_t v = columns_base_ptr[c * col_stride + br_1];
+        absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v)));
+    }
+
+    finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
+}
+
 // ---------------------------------------------------------------------------
 // Goldilocks EXT3 leaf hashing (3 base-field components per ext3 element).
 //
@@ -349,15 +394,22 @@ extern "C" __global__ void keccak_merkle_level(
 }
 
 // ---------------------------------------------------------------------------
-// Row-major base leaf hashing.
+// Row-major ROW-PAIR leaf hashing.
+//
+// Row-major analog of `keccak256_leaves_base_row_pair_batched` (which reads a
+// column-major slab): each leaf hashes TWO consecutive bit-reversed rows.
+// Leaf `tid` hashes row `reverse_index(2*tid)` followed by row
+// `reverse_index(2*tid + 1)`, each as `m` canonical big-endian lanes read from
+// the contiguous row-major buffer (`data + br * m`). `num_leaves = num_rows/2`;
+// writes 32 bytes to `hashed_leaves_out[tid*32 ..]`.
 //
-// Input layout: data[row * m + col] for `num_rows` rows and `m` columns.
-// For leaf `tid`, reads the bit-reversed row `br(tid)` — a contiguous slice
-// of `m` elements starting at data[br * m]. Coalesced when multiple threads
-// in the same warp process consecutive `tid` values (they read non-overlapping
-// rows, each a contiguous block of m u64s in order).
+// `m` is the row stride in u64s: base trace = num columns; ext3 trace = 3 *
+// num columns (an ext3 element's components c0,c1,c2 are consecutive, matching
+// the CPU `write_bytes_be`). Byte layout therefore equals the CPU
+// `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and the verifier's
+// `verify_opening_pair` (queried row ‖ its symmetric counterpart, one leaf).
 // ---------------------------------------------------------------------------
-extern "C" __global__ void keccak256_leaves_base_row_major(
+extern "C" __global__ void keccak256_leaves_base_row_major_row_pair(
     const uint64_t *data,
     uint64_t m,
     uint64_t num_rows,
@@ -365,19 +417,26 @@ extern "C" __global__ void keccak256_leaves_base_row_major(
     uint8_t *hashed_leaves_out)
 {
     uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid >= num_rows) return;
-    uint64_t br = __brevll(tid) >> (64 - log_num_rows);
-    const uint64_t *row = data + br * m;
+    uint64_t num_leaves = num_rows >> 1;
+    if (tid >= num_leaves) return;
+
+    uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows);
+    uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows);
+    const uint64_t *row_0 = data + br_0 * m;
+    const uint64_t *row_1 = data + br_1 * m;
 
     uint64_t st[25];
     #pragma unroll
     for (int i = 0; i < 25; ++i) st[i] = 0;
 
     uint32_t rate_pos = 0;
+    // First row (br_0): cols 0..m-1.
     for (uint64_t c = 0; c < m; ++c) {
-        uint64_t canon = goldilocks::canonical(row[c]);
-        uint64_t lane  = bswap64(canon);
-        absorb_lane(st, rate_pos, lane);
+        absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(row_0[c])));
+    }
+    // Second row (br_1): cols 0..m-1.
+    for (uint64_t c = 0; c < m; ++c) {
+        absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(row_1[c])));
     }
     finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
 }
diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index e9db7657e..4270e5da8 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -147,8 +147,9 @@ pub struct Backend {
     pub matrix_transpose_strided: CudaFunction,
 
     // keccak.ptx
-    pub keccak256_leaves_base_row_major: CudaFunction,
+    pub keccak256_leaves_base_row_major_row_pair: CudaFunction,
     pub keccak256_leaves_base_batched: CudaFunction,
+    pub keccak256_leaves_base_row_pair_batched: CudaFunction,
     pub keccak256_leaves_ext3_batched: CudaFunction,
     pub keccak_comp_poly_leaves_ext3: CudaFunction,
     pub keccak_fri_leaves_ext3: CudaFunction,
@@ -247,9 +248,11 @@ impl Backend {
             ntt_dit_level_row_major: ntt.load_function("ntt_dit_level_row_major")?,
             pointwise_mul_row_major: ntt.load_function("pointwise_mul_row_major")?,
             matrix_transpose_strided: ntt.load_function("matrix_transpose_strided")?,
-            keccak256_leaves_base_row_major: keccak
-                .load_function("keccak256_leaves_base_row_major")?,
+            keccak256_leaves_base_row_major_row_pair: keccak
+                .load_function("keccak256_leaves_base_row_major_row_pair")?,
             keccak256_leaves_base_batched: keccak.load_function("keccak256_leaves_base_batched")?,
+            keccak256_leaves_base_row_pair_batched: keccak
+                .load_function("keccak256_leaves_base_row_pair_batched")?,
             keccak256_leaves_ext3_batched: keccak.load_function("keccak256_leaves_ext3_batched")?,
             keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?,
             keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?,
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index 164267684..b08a9394a 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -16,7 +16,10 @@ use cudarc::driver::{CudaSlice, CudaStream, LaunchConfig, PushKernelArg};
 
 use crate::Result;
 use crate::device::{Backend, backend};
-use crate::merkle::{keccak_launch_cfg, launch_keccak_base, launch_keccak_ext3};
+use crate::merkle::{
+    keccak_launch_cfg, launch_keccak_base, launch_keccak_base_row_pair, launch_keccak_ext3,
+    launch_keccak_ext3_row_pair,
+};
 use crate::ntt::run_ntt_body;
 
 /// Goldilocks `TWO_ADICITY = 32` puts the theoretical domain ceiling at
@@ -34,26 +37,26 @@ fn assert_u32_domain(n: usize, what: &str) {
 /// Output shape requested from the fused LDE + Keccak entry points.
 #[derive(Copy, Clone, PartialEq, Eq)]
 enum KeccakCommit {
-    /// Only the `lde_size` keccak-256 leaves; no inner-tree build. Caller
-    /// receives `lde_size * 32` bytes.
+    /// Only the keccak-256 leaves; no inner-tree build. Caller receives
+    /// `num_leaves * 32` bytes.
     LeavesOnly,
     /// Full Merkle tree: leaves at the tail + inner nodes built on-device.
-    /// Caller receives `(2*lde_size - 1) * 32` bytes.
+    /// Caller receives `(2*num_leaves - 1) * 32` bytes.
     FullTree,
 }
 
 impl KeccakCommit {
-    fn total_nodes_bytes(self, lde_size: usize) -> usize {
+    fn total_nodes_bytes(self, num_leaves: usize) -> usize {
         match self {
-            KeccakCommit::LeavesOnly => lde_size * 32,
-            KeccakCommit::FullTree => (2 * lde_size - 1) * 32,
+            KeccakCommit::LeavesOnly => num_leaves * 32,
+            KeccakCommit::FullTree => (2 * num_leaves - 1) * 32,
         }
     }
 
-    fn leaves_offset_bytes(self, lde_size: usize) -> usize {
+    fn leaves_offset_bytes(self, num_leaves: usize) -> usize {
         match self {
             KeccakCommit::LeavesOnly => 0,
-            KeccakCommit::FullTree => (lde_size - 1) * 32,
+            KeccakCommit::FullTree => (num_leaves - 1) * 32,
         }
     }
 }
@@ -304,7 +307,12 @@ fn run_row_major_ntt_body(
     Ok(())
 }
 
-fn launch_keccak_base_row_major(
+/// Row-major ROW-PAIR leaf hashing: leaf `i` hashes the two consecutive
+/// bit-reversed rows `reverse_index(2i)`, `reverse_index(2i+1)` (each `m` lanes,
+/// read contiguously from the row-major `buf`), producing `num_rows / 2` leaves.
+/// Row-major analog of [`launch_keccak_base_row_pair`]; matches the CPU
+/// `commit_bit_reversed(.., 2)` and the verifier's `verify_opening_pair`.
+fn launch_keccak_base_row_major_row_pair(
     stream: &CudaStream,
     be: &Backend,
     buf: &CudaSlice<u64>,
@@ -313,20 +321,21 @@ fn launch_keccak_base_row_major(
     log_num_rows: u64,
     leaves_out: &mut cudarc::driver::CudaViewMut<'_, u8>,
 ) -> Result<()> {
-    // The keccak kernel is register-heavy (Keccak state `uint64_t st[25]`), so it
-    // must launch with the keccak-tuned block dim (128). `for_num_elems` uses 1024
-    // threads/block, which exceeds the per-block register budget and fails the
-    // launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES — silently dropping the whole
-    // R1 GPU path to the CPU fallback (no device handle for rounds 2-4).
-    //
-    // The kernel derives the bit-reversed row as `__brevll(tid) >> (64 - log_num_rows)`;
-    // a 64-bit shift is UB, so reject `num_rows < 2` (`log_num_rows == 0`), matching
-    // the `debug_assert!` guard in `launch_keccak_base`.
-    debug_assert!(num_rows >= 2, "row-major keccak requires num_rows >= 2");
-    let cfg = keccak_launch_cfg(num_rows);
+    // Register-heavy Keccak kernel: launch with the keccak-tuned block dim (128,
+    // via `keccak_launch_cfg`); a larger block exceeds the per-block register
+    // budget and fails the launch (CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES). The kernel
+    // derives rows as `__brevll(2*tid + k) >> (64 - log_num_rows)`; a 64-bit shift
+    // is UB at `log_num_rows == 0`, so require `num_rows >= 2` (also the minimum
+    // for a single row pair).
+    debug_assert!(
+        num_rows >= 2,
+        "row-major row-pair keccak requires num_rows >= 2"
+    );
+    // One thread per leaf (= one bit-reversed row pair).
+    let cfg = keccak_launch_cfg(num_rows >> 1);
     unsafe {
         stream
-            .launch_builder(&be.keccak256_leaves_base_row_major)
+            .launch_builder(&be.keccak256_leaves_base_row_major_row_pair)
             .arg(buf)
             .arg(&m)
             .arg(&num_rows)
@@ -399,7 +408,12 @@ fn coset_lde_row_major_inner(
     let lde_size = n * blowup_factor;
     assert_u32_domain(lde_size, what);
 
-    let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(lde_size);
+    // Row-pair trace commit: one Merkle leaf per bit-reversed row pair (rows 2i,
+    // 2i+1), matching the CPU `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and the
+    // verifier's `verify_opening_pair`. `lde_size` is a power of two >= 2, so it
+    // is always even.
+    let num_leaves = lde_size / 2;
+    let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(num_leaves);
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
     let n_u64 = n as u64;
@@ -445,12 +459,14 @@ fn coset_lde_row_major_inner(
         cols_u64,
     )?;
 
-    // Keccak + Merkle on-device. Each leaf reads `total_cols` consecutive u64s.
+    // Keccak + Merkle on-device. Each row-pair leaf reads two bit-reversed rows
+    // of `total_cols` consecutive u64s (`lde_u64` is the bit-reverse modulus; the
+    // kernel emits `lde_size / 2` leaves).
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_bytes) }?;
-    let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(lde_size);
+    let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(num_leaves);
     {
-        let mut leaves_view = nodes_dev.slice_mut(leaves_offset..leaves_offset + lde_size * 32);
-        launch_keccak_base_row_major(
+        let mut leaves_view = nodes_dev.slice_mut(leaves_offset..leaves_offset + num_leaves * 32);
+        launch_keccak_base_row_major_row_pair(
             stream.as_ref(),
             be,
             &buf,
@@ -460,7 +476,7 @@ fn coset_lde_row_major_inner(
             &mut leaves_view,
         )?;
     }
-    crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
+    crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
 
     // D2H the row-major LDE first (before the handle transpose). Release the
     // staging lock before the Merkle nodes transfer to minimise lock contention.
@@ -927,12 +943,12 @@ pub fn coset_lde_batch_base_into(
     Ok(())
 }
 
-/// Fused LDE + Keccak-256 leaf hashing. Caller receives the `lde_size * 32`
-/// bytes of leaf hashes in `hashed_leaves_out` (one 32-byte digest per output
-/// row, in natural row order; leaves are computed reading columns at
-/// bit-reversed rows, matching `commit_columns_bit_reversed` on the CPU
-/// side). Thin wrapper over `coset_lde_batch_base_into_with_merkle_tree_inner`
-/// with `LeavesOnly` — no inner-tree build, no device handle.
+/// Fused LDE + row-pair Keccak-256 leaf hashing. Caller receives
+/// `(lde_size / 2) * 32` bytes of leaf hashes in `hashed_leaves_out` (one
+/// 32-byte digest per bit-reversed row pair, in natural leaf order, matching
+/// `commit_bit_reversed(.., 2)` on the CPU side). Thin wrapper over
+/// `coset_lde_batch_base_into_with_merkle_tree_inner` with `LeavesOnly` — no
+/// inner-tree build, no device handle.
 pub fn coset_lde_batch_base_into_with_leaf_hash(
     columns: &[&[u64]],
     blowup_factor: usize,
@@ -948,13 +964,15 @@ pub fn coset_lde_batch_base_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
+        2,
     )
     .map(|_| ())
 }
 
 /// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full
-/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back
-/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`).
+/// row-pair Merkle tree on device and returns the `2*(lde_size/2) - 1` node
+/// buffer back to the caller in `merkle_nodes_out` (byte length
+/// `(2*(lde_size/2) - 1) * 32`).
 ///
 /// The leaf hashes are never exposed to the caller — they stay on device and
 /// feed straight into the pair-hash tree kernel, avoiding the
@@ -975,10 +993,12 @@ pub fn coset_lde_batch_base_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
+        2,
     )
     .map(|_| ())
 }
 
+#[allow(clippy::too_many_arguments)]
 fn coset_lde_batch_base_into_with_merkle_tree_inner(
     columns: &[&[u64]],
     blowup_factor: usize,
@@ -987,6 +1007,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     nodes_out: &mut [u8],
     commit: KeccakCommit,
     keep_device_buf: bool,
+    // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1),
+    // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit.
+    rows_per_leaf: usize,
 ) -> Result<Option<GpuLdeBase>> {
     if columns.is_empty() {
         assert_eq!(outputs.len(), 0);
@@ -1010,7 +1033,13 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     for o in outputs.iter() {
         assert_eq!(o.len(), lde_size);
     }
-    let nodes_dev_bytes = commit.total_nodes_bytes(lde_size);
+    assert!(
+        rows_per_leaf == 1 || rows_per_leaf == 2,
+        "rows_per_leaf must be 1 or 2"
+    );
+    assert_eq!(lde_size % rows_per_leaf, 0);
+    let num_leaves = lde_size / rows_per_leaf;
+    let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves);
     assert_eq!(nodes_out.len(), nodes_dev_bytes);
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
@@ -1093,28 +1122,39 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     )?;
 
     // Allocate the device output buffer. In `LeavesOnly` mode this is just
-    // `lde_size * 32` bytes (the leaves themselves); in `FullTree` mode it's
-    // `(2*lde_size - 1) * 32` bytes (leaves in the tail + inner nodes filled
+    // `num_leaves * 32` bytes (the leaves themselves); in `FullTree` mode it's
+    // `(2*num_leaves - 1) * 32` bytes (leaves in the tail + inner nodes filled
     // below). `alloc` (not `alloc_zeros`) is safe because every byte is
     // written before any reader sees it: the keccak kernel fills the
     // leaves slab, the inner-tree pass (when present) fills the head.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_dev_bytes) }?;
-    let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size);
+    let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves);
     {
         let mut leaves_view =
-            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32);
-        launch_keccak_base(
-            stream.as_ref(),
-            &buf,
-            col_stride_u64,
-            m as u64,
-            lde_u64,
-            &mut leaves_view,
-        )?;
+            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32);
+        if rows_per_leaf == 2 {
+            launch_keccak_base_row_pair(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        } else {
+            launch_keccak_base(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        }
     }
 
     if commit == KeccakCommit::FullTree {
-        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
+        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
     }
 
     // D2H the LDE and the tree/leaves nodes via pinned staging.
@@ -1140,8 +1180,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     }
 }
 
-/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused
-/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over
+/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused LDE +
+/// row-pair Keccak-256 leaf hashing over ext3 columns. Thin wrapper over
 /// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`.
 pub fn coset_lde_batch_ext3_into_with_leaf_hash(
     columns: &[&[u64]],
@@ -1160,13 +1200,14 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
+        2,
     )
     .map(|_| ())
 }
 
 /// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`.
 /// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE
-/// evaluations and the full `2*lde_size - 1` node buffer.
+/// evaluations and the full `2*(lde_size/2) - 1` row-pair node buffer.
 pub fn coset_lde_batch_ext3_into_with_merkle_tree(
     columns: &[&[u64]],
     n: usize,
@@ -1184,6 +1225,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
+        2,
     )
     .map(|_| ())
 }
@@ -1198,6 +1240,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
     nodes_out: &mut [u8],
     commit: KeccakCommit,
     keep_device_buf: bool,
+    // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1),
+    // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit.
+    rows_per_leaf: usize,
 ) -> Result<Option<GpuLdeExt3>> {
     if columns.is_empty() {
         assert_eq!(outputs.len(), 0);
@@ -1223,7 +1268,13 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
     for o in outputs.iter() {
         assert_eq!(o.len(), 3 * lde_size);
     }
-    let nodes_dev_bytes = commit.total_nodes_bytes(lde_size);
+    assert!(
+        rows_per_leaf == 1 || rows_per_leaf == 2,
+        "rows_per_leaf must be 1 or 2"
+    );
+    assert_eq!(lde_size % rows_per_leaf, 0);
+    let num_leaves = lde_size / rows_per_leaf;
+    let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves);
     assert_eq!(nodes_out.len(), nodes_dev_bytes);
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
@@ -1300,26 +1351,37 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
         mb_u32,
     )?;
 
-    // Allocate device output buffer (LeavesOnly → lde_size*32; FullTree →
-    // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the
+    // Allocate device output buffer (LeavesOnly -> num_leaves*32; FullTree ->
+    // (2*num_leaves - 1)*32). Leaf kernel writes to the leaves slab; the
     // inner-tree pass (when present) fills the head.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_dev_bytes) }?;
-    let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size);
+    let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves);
     {
         let mut leaves_view =
-            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32);
-        launch_keccak_ext3(
-            stream.as_ref(),
-            &buf,
-            col_stride_u64,
-            m as u64,
-            lde_u64,
-            &mut leaves_view,
-        )?;
+            nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32);
+        if rows_per_leaf == 2 {
+            launch_keccak_ext3_row_pair(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        } else {
+            launch_keccak_ext3(
+                stream.as_ref(),
+                &buf,
+                col_stride_u64,
+                m as u64,
+                lde_u64,
+                &mut leaves_view,
+            )?;
+        }
     }
 
     if commit == KeccakCommit::FullTree {
-        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
+        crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
     }
 
     // D2H LDE (mb * lde_size u64) and tree/leaves nodes.
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 932e81325..27f38ce0a 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -3,7 +3,7 @@
 //! Matches `FieldElementVectorBackend<F, Keccak256, 32>::hash_data` in
 //! `crypto/crypto/src/merkle_tree/backends/field_element_vector.rs`, combined
 //! with the `reverse_index` row read pattern used in
-//! `commit_columns_bit_reversed` at `crypto/stark/src/prover.rs`.
+//! `commit_bit_reversed` at `crypto/stark/src/commitment.rs`.
 //!
 //! Caller supplies base-field column slabs already laid out as
 //! `[col * col_stride + row]` (the same layout `coset_lde_batch_base_into`
@@ -25,15 +25,27 @@ use crate::lde::pack_ext3_to_pinned_slabs;
 /// Run GPU Keccak-256 leaf hashing on a base-field column buffer.
 ///
 /// `columns` must hold `num_cols * col_stride` u64s with column `c`'s data
-/// at `[c*col_stride .. c*col_stride + num_rows]`. Returns `num_rows * 32`
-/// hash bytes in natural (non-bit-reversed) row order.
+/// at `[c*col_stride .. c*col_stride + num_rows]`. `rows_per_leaf` selects the
+/// leaf layout: `1` = one leaf per bit-reversed row (`num_rows` leaves), `2` =
+/// one leaf per bit-reversed row pair `2i`,`2i+1` (`num_rows/2` leaves, the
+/// trace-commit layout). Returns `(num_rows / rows_per_leaf) * 32` hash bytes.
 pub fn keccak_leaves_base(
     columns: &[u64],
     col_stride: usize,
     num_cols: usize,
     num_rows: usize,
+    rows_per_leaf: usize,
 ) -> Result<Vec<u8>> {
     assert!(num_rows.is_power_of_two());
+    assert!(rows_per_leaf == 1 || rows_per_leaf == 2);
+    assert!(
+        num_rows >= rows_per_leaf,
+        "num_rows must be at least rows_per_leaf"
+    );
+    assert!(
+        num_rows >= 2,
+        "num_rows must be at least 2 for bit-reversed GPU leaf hashing"
+    );
     assert!(
         col_stride >= num_rows,
         "col_stride must be >= num_rows to keep per-column reads in-bounds"
@@ -45,8 +57,13 @@ pub fn keccak_leaves_base(
     let be = backend()?;
     let stream = be.next_stream();
     let cols_dev = stream.clone_htod(&columns[..total])?;
-    let mut out_dev = stream.alloc_zeros::<u8>(num_rows * 32)?;
-    launch_keccak_base(
+    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / rows_per_leaf) * 32)?;
+    let launch = if rows_per_leaf == 2 {
+        launch_keccak_base_row_pair
+    } else {
+        launch_keccak_base
+    };
+    launch(
         stream.as_ref(),
         &cols_dev,
         col_stride as u64,
@@ -60,14 +77,25 @@ pub fn keccak_leaves_base(
 }
 
 /// Ext3 variant. Columns interleaved as three base slabs per ext3 column.
-/// `columns.len() >= num_cols * 3 * col_stride`.
+/// `columns.len() >= num_cols * 3 * col_stride`. `rows_per_leaf` as in
+/// [`keccak_leaves_base`].
 pub fn keccak_leaves_ext3(
     columns: &[u64],
     col_stride: usize,
     num_cols: usize,
     num_rows: usize,
+    rows_per_leaf: usize,
 ) -> Result<Vec<u8>> {
     assert!(num_rows.is_power_of_two());
+    assert!(rows_per_leaf == 1 || rows_per_leaf == 2);
+    assert!(
+        num_rows >= rows_per_leaf,
+        "num_rows must be at least rows_per_leaf"
+    );
+    assert!(
+        num_rows >= 2,
+        "num_rows must be at least 2 for bit-reversed GPU leaf hashing"
+    );
     assert!(
         col_stride >= num_rows,
         "col_stride must be >= num_rows to keep per-column reads in-bounds"
@@ -80,8 +108,13 @@ pub fn keccak_leaves_ext3(
     let be = backend()?;
     let stream = be.next_stream();
     let cols_dev = stream.clone_htod(&columns[..total])?;
-    let mut out_dev = stream.alloc_zeros::<u8>(num_rows * 32)?;
-    launch_keccak_ext3(
+    let mut out_dev = stream.alloc_zeros::<u8>((num_rows / rows_per_leaf) * 32)?;
+    let launch = if rows_per_leaf == 2 {
+        launch_keccak_ext3_row_pair
+    } else {
+        launch_keccak_ext3
+    };
+    launch(
         stream.as_ref(),
         &cols_dev,
         col_stride as u64,
@@ -170,6 +203,72 @@ pub(crate) fn launch_keccak_base(
     Ok(())
 }
 
+/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`,
+/// `2i+1` (one Merkle path per FRI query). Writes `num_rows/2` leaves of 32
+/// bytes into `out_dev`. Base-field analog of the comp-poly ext3 path; matches
+/// the CPU `keccak_leaves_row_pair_bit_reversed`.
+pub(crate) fn launch_keccak_base_row_pair(
+    stream: &CudaStream,
+    cols_dev: &CudaSlice<u64>,
+    col_stride: u64,
+    num_cols: u64,
+    num_rows: u64,
+    out_dev: &mut CudaViewMut<'_, u8>,
+) -> Result<()> {
+    debug_assert!(
+        num_rows >= 2,
+        "keccak row-pair leaf kernel: num_rows must be >= 2"
+    );
+    let be = backend()?;
+    let log_num_rows = num_rows.trailing_zeros() as u64;
+    // One thread per leaf (= row pair).
+    let cfg = keccak_launch_cfg(num_rows >> 1);
+    unsafe {
+        stream
+            .launch_builder(&be.keccak256_leaves_base_row_pair_batched)
+            .arg(cols_dev)
+            .arg(&col_stride)
+            .arg(&num_cols)
+            .arg(&num_rows)
+            .arg(&log_num_rows)
+            .arg(out_dev)
+            .launch(cfg)?;
+    }
+    Ok(())
+}
+
+/// Row-pair ext3 leaf hashing for the aux trace: reuses the comp-poly kernel
+/// (`keccak_comp_poly_leaves_ext3`), which hashes bit-reversed rows `2i`, `2i+1`
+/// across all ext3 columns. Writes `num_rows/2` leaves of 32 bytes.
+pub(crate) fn launch_keccak_ext3_row_pair(
+    stream: &CudaStream,
+    cols_dev: &CudaSlice<u64>,
+    col_stride: u64,
+    num_cols: u64,
+    num_rows: u64,
+    out_dev: &mut CudaViewMut<'_, u8>,
+) -> Result<()> {
+    debug_assert!(
+        num_rows >= 2,
+        "keccak row-pair leaf kernel: num_rows must be >= 2"
+    );
+    let be = backend()?;
+    let log_num_rows = num_rows.trailing_zeros() as u64;
+    let cfg = keccak_launch_cfg(num_rows >> 1);
+    unsafe {
+        stream
+            .launch_builder(&be.keccak_comp_poly_leaves_ext3)
+            .arg(cols_dev)
+            .arg(&col_stride)
+            .arg(&num_cols)
+            .arg(&num_rows)
+            .arg(&log_num_rows)
+            .arg(out_dev)
+            .launch(cfg)?;
+    }
+    Ok(())
+}
+
 /// Given `hashed_leaves` of length `leaves_len * 32`, build the full Merkle
 /// tree on device and return the complete node buffer `(2*leaves_len - 1) *
 /// 32` bytes in the standard layout:
diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs
index 29e33b6fe..51b826dd1 100644
--- a/crypto/math-cuda/tests/comp_poly_tree.rs
+++ b/crypto/math-cuda/tests/comp_poly_tree.rs
@@ -1,6 +1,6 @@
 //! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree`
 //! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU
-//! pipeline produced by `commit_composition_polynomial`.
+//! row-pair commitment layout used by `commit_bit_reversed(.., 2)`.
 
 use math::field::element::FieldElement;
 use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
@@ -83,7 +83,7 @@ fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] {
     out
 }
 
-/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves.
+/// CPU: `commit_bit_reversed(.., 2)`-style tree root over num_rows/2 leaves.
 fn cpu_tree_nodes(parts: &[Vec<Fp3>]) -> Vec<[u8; 32]> {
     let num_rows = parts[0].len();
     let num_parts = parts.len();
diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs
index d614e233d..61a861f32 100644
--- a/crypto/math-cuda/tests/keccak_leaves.rs
+++ b/crypto/math-cuda/tests/keccak_leaves.rs
@@ -38,7 +38,7 @@ fn keccak_leaves_base_matches_cpu() {
                     flat[c * n + r] = *e.value();
                 }
             }
-            let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n).unwrap();
+            let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 1).unwrap();
             assert_eq!(gpu.len(), n * 32);
             for i in 0..n {
                 assert_eq!(
@@ -84,7 +84,7 @@ fn keccak_leaves_ext3_matches_cpu() {
                     flat[(c * 3 + 2) * n + r] = *e.value()[2].value();
                 }
             }
-            let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n).unwrap();
+            let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 1).unwrap();
             assert_eq!(gpu.len(), n * 32);
             for i in 0..n {
                 assert_eq!(
@@ -97,6 +97,87 @@ fn keccak_leaves_ext3_matches_cpu() {
     }
 }
 
+#[test]
+fn keccak_leaves_base_row_pair_matches_cpu() {
+    // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`.
+    // GPU `keccak_leaves_base(.., rows_per_leaf=2)` must match the CPU prover
+    // helper `keccak_leaves_row_pair_bit_reversed` over base columns.
+    for log_n in [4u32, 6, 8, 10, 12] {
+        for num_cols in [1usize, 5, 17, 41] {
+            let n = 1 << log_n;
+            let mut rng = ChaCha8Rng::seed_from_u64(500 + log_n as u64 + num_cols as u64);
+            let columns: Vec<Vec<Fp>> = (0..num_cols)
+                .map(|_| (0..n).map(|_| Fp::from_raw(rng.r#gen::<u64>())).collect())
+                .collect();
+
+            let cpu = keccak_leaves_row_pair_bit_reversed(&columns);
+            assert_eq!(cpu.len(), n / 2);
+
+            let mut flat = vec![0u64; num_cols * n];
+            for (c, col) in columns.iter().enumerate() {
+                for (r, e) in col.iter().enumerate() {
+                    flat[c * n + r] = *e.value();
+                }
+            }
+            let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 2).unwrap();
+            assert_eq!(gpu.len(), (n / 2) * 32);
+            for i in 0..n / 2 {
+                assert_eq!(
+                    &gpu[i * 32..(i + 1) * 32],
+                    &cpu[i][..],
+                    "base row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})"
+                );
+            }
+        }
+    }
+}
+
+#[test]
+fn keccak_leaves_ext3_row_pair_matches_cpu() {
+    for log_n in [4u32, 6, 8, 10] {
+        for num_cols in [1usize, 3, 11, 20] {
+            let n = 1 << log_n;
+            let mut rng = ChaCha8Rng::seed_from_u64(600 + log_n as u64 + num_cols as u64);
+            let columns: Vec<Vec<Fp3>> = (0..num_cols)
+                .map(|_| {
+                    (0..n)
+                        .map(|_| {
+                            Fp3::new([
+                                Fp::from_raw(rng.r#gen::<u64>()),
+                                Fp::from_raw(rng.r#gen::<u64>()),
+                                Fp::from_raw(rng.r#gen::<u64>()),
+                            ])
+                        })
+                        .collect()
+                })
+                .collect();
+
+            let cpu = keccak_leaves_row_pair_bit_reversed(&columns);
+            assert_eq!(cpu.len(), n / 2);
+
+            // De-interleaved 3-slab layout per ext3 column (same as the 1-row
+            // ext3 leaf path): [col*3+k] each a contiguous slab of n u64s.
+            let mut flat = vec![0u64; num_cols * 3 * n];
+            for (c, col) in columns.iter().enumerate() {
+                for (r, e) in col.iter().enumerate() {
+                    flat[(c * 3) * n + r] = *e.value()[0].value();
+                    flat[(c * 3 + 1) * n + r] = *e.value()[1].value();
+                    flat[(c * 3 + 2) * n + r] = *e.value()[2].value();
+                }
+            }
+            let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 2).unwrap();
+            assert_eq!(gpu.len(), (n / 2) * 32);
+            for i in 0..n / 2 {
+                assert_eq!(
+                    &gpu[i * 32..(i + 1) * 32],
+                    &cpu[i][..],
+                    "ext3 row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})"
+                );
+            }
+        }
+    }
+}
+
 #[test]
 fn keccak_comp_poly_leaves_matches_cpu() {
     // Built tree's leaves live at byte offset `(num_leaves - 1) * 32` and
diff --git a/crypto/math-cuda/tests/merkle_root_parity.rs b/crypto/math-cuda/tests/merkle_root_parity.rs
index 72e2aaea4..0cbe016b6 100644
--- a/crypto/math-cuda/tests/merkle_root_parity.rs
+++ b/crypto/math-cuda/tests/merkle_root_parity.rs
@@ -55,7 +55,9 @@ fn gpu_merkle_root(columns: &[Vec<u64>], blowup: usize, weights: &[u64]) -> [u8;
         }
     }
 
-    let gpu_leaves = math_cuda::merkle::keccak_leaves_base(&flat, n_lde, num_cols, n_lde)
+    // Per-row leaves (rows_per_leaf = 1): this parity test compares the generic
+    // keccak-leaves + Merkle primitives against a per-row CPU reference.
+    let gpu_leaves = math_cuda::merkle::keccak_leaves_base(&flat, n_lde, num_cols, n_lde, 1)
         .expect("GPU keccak leaves");
     let nodes =
         math_cuda::merkle::build_merkle_tree_on_device(&gpu_leaves).expect("GPU Merkle tree");
@@ -190,7 +192,7 @@ fn gpu_ext3_merkle_root(columns: &[Vec<Fp3>], blowup: usize, weights: &[u64]) ->
     }
 
     let gpu_leaves =
-        math_cuda::merkle::keccak_leaves_ext3(&flat_for_keccak, lde_size, num_cols, lde_size)
+        math_cuda::merkle::keccak_leaves_ext3(&flat_for_keccak, lde_size, num_cols, lde_size, 1)
             .expect("GPU ext3 keccak leaves");
     let nodes =
         math_cuda::merkle::build_merkle_tree_on_device(&gpu_leaves).expect("GPU Merkle tree");
diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs
new file mode 100644
index 000000000..d4a6dbdbe
--- /dev/null
+++ b/crypto/stark/src/commitment.rs
@@ -0,0 +1,155 @@
+//! Merkle-tree commitment to bit-reversed, column-major LDE evaluations.
+//!
+//! This is the commitment layer the prover uses for the main/aux trace LDEs and
+//! the composition-polynomial parts. It is decoupled from `IsStarkProver`: the
+//! prover only orchestrates *when* to commit; the *how* (leaf layout, bit-reverse
+//! permutation, Keccak hashing, tree build) lives here.
+//!
+//! ## Leaf layout
+//!
+//! For each leaf `i` we hash `rows_per_leaf` consecutive (bit-reversed) rows,
+//! big-endian-concatenated column-by-column:
+//!
+//! ```text
+//! leaf(i) = keccak( col_0[br(R·i)]‖col_1[br(R·i)]‖…  ‖  col_0[br(R·i+1)]‖…  ‖ … )
+//!   where R = rows_per_leaf and br(j) = reverse_index(j, num_rows)
+//! ```
+//!
+//! - `rows_per_leaf == 2` (`ROWS_PER_LEAF`): a row pair per leaf (leaf `i` hashes
+//!   rows `2i` and `2i+1`). Used by BOTH the main/aux trace LDE and the
+//!   composition-polynomial parts: a FRI query opens a value and its symmetric
+//!   counterpart — exactly this pair — so one Merkle path authenticates both.
+//! - `rows_per_leaf == 1`: one row per leaf. No longer used by the prover; kept
+//!   only so the GPU parity tests can compare against the per-row code path.
+//!
+//! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept
+//! exactly as before.
+
+use math::fft::bit_reversing::reverse_index;
+use math::field::element::FieldElement;
+use math::field::traits::IsField;
+use math::traits::{AsBytes, ByteConversion};
+
+#[cfg(feature = "parallel")]
+use rayon::prelude::{IntoParallelIterator, ParallelIterator};
+
+use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
+
+/// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the
+/// trace AND composition-polynomial commitments: the row-pair leaf the FRI
+/// openings rely on (leaf `i` hashes rows `2i` and `2i+1`, so one Merkle path
+/// authenticates both a value and its symmetric counterpart).
+pub const ROWS_PER_LEAF: usize = 2;
+
+/// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment,
+/// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf.
+///
+/// Returns one `Commitment` per leaf (`columns[0].len() / rows_per_leaf` leaves),
+/// or an empty `Vec` when there is nothing to hash. See the module docs for the
+/// exact leaf byte layout. This is the single code path behind both the per-row
+/// ([`keccak_leaves_bit_reversed`]) and per-row-pair
+/// ([`keccak_leaves_row_pair_bit_reversed`]) commitments.
+pub fn keccak_leaves_bit_reversed_grouped<E>(
+    columns: &[Vec<FieldElement<E>>],
+    rows_per_leaf: usize,
+) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return Vec::new();
+    }
+
+    let num_rows = columns[0].len();
+    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+
+    debug_assert!(
+        num_rows.is_power_of_two(),
+        "num_rows must be a power of two for reverse_index"
+    );
+    debug_assert!(
+        rows_per_leaf >= 1 && num_rows.is_multiple_of(rows_per_leaf),
+        "num_rows must be a multiple of rows_per_leaf"
+    );
+
+    let num_leaves = num_rows / rows_per_leaf;
+    let total_bytes = rows_per_leaf * columns.len() * byte_len;
+
+    // Leaf `i`: the `rows_per_leaf` bit-reversed rows starting at `R·i`, each row
+    // written column-by-column in big-endian, then hashed once.
+    let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment {
+        let mut offset = 0;
+        for k in 0..rows_per_leaf {
+            let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64);
+            for col in columns {
+                col[br].write_bytes_be(&mut buf[offset..offset + byte_len]);
+                offset += byte_len;
+            }
+        }
+        BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
+    };
+
+    // Per-thread buffer reuse (map_init) avoids millions of small allocations.
+    #[cfg(feature = "parallel")]
+    let result: Vec<Commitment> = (0..num_leaves)
+        .into_par_iter()
+        .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i))
+        .collect();
+
+    #[cfg(not(feature = "parallel"))]
+    let result: Vec<Commitment> = {
+        let mut buf = vec![0u8; total_bytes];
+        (0..num_leaves).map(|i| hash_leaf(&mut buf, i)).collect()
+    };
+
+    result
+}
+
+/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Thin wrapper
+/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`.
+///
+/// The prover no longer commits per-row (trace and composition both use the
+/// row-pair layout, `ROWS_PER_LEAF`); this stays a named public function only so
+/// the GPU parity tests in dependent crates can compare the per-row code path.
+pub fn keccak_leaves_bit_reversed<E>(columns: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    keccak_leaves_bit_reversed_grouped(columns, 1)
+}
+
+/// Per-row-pair Keccak-256 leaf hashes (leaf `i` hashes bit-reversed rows `2i`,
+/// `2i+1`). Used for the composition-polynomial-parts commitment. Thin wrapper
+/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 2`.
+pub fn keccak_leaves_row_pair_bit_reversed<E>(parts: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    keccak_leaves_bit_reversed_grouped(parts, 2)
+}
+
+/// Builds the Merkle tree committing to `columns`' bit-reversed, column-major LDE
+/// evaluations, grouping `rows_per_leaf` rows per leaf, and returns the tree and
+/// its root. `None` when there is nothing to commit.
+///
+/// Replaces the prover's former `commit_columns_bit_reversed` (`rows_per_leaf = 1`)
+/// and `commit_composition_polynomial` (`rows_per_leaf = 2`).
+pub fn commit_bit_reversed<E>(
+    columns: &[Vec<FieldElement<E>>],
+    rows_per_leaf: usize,
+) -> Option<(BatchedMerkleTree<E>, Commitment)>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
+{
+    if columns.is_empty() || columns[0].is_empty() {
+        return None;
+    }
+    let hashed_leaves = keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf);
+    let tree = BatchedMerkleTree::<E>::build_from_hashed_leaves(hashed_leaves)?;
+    let root = tree.root;
+    Some((tree, root))
+}
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 29e9b94e6..920bf937e 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -269,29 +269,6 @@ fn restore_columns_on_err<E: IsField>(columns: &mut [Vec<FieldElement<E>>], n: u
     }
 }
 
-/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves
-/// and return the node `Vec` (length-initialised, contents undefined) together
-/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if
-/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows
-/// `usize`). The caller builds the `&mut [u8]` byte view of length
-/// `total_nodes * 32` and must overwrite every byte via the GPU D2H.
-fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> {
-    if lde_size < 2 {
-        return None;
-    }
-    let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?;
-    let _byte_len = total_nodes.checked_mul(32)?;
-    let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes);
-    // SAFETY: every byte will be overwritten via the GPU D2H before the
-    // contents are read. The caller computes the byte-length view from the
-    // returned `nodes` Vec using `total_nodes.checked_mul(32)`.
-    #[allow(clippy::uninit_vec)]
-    unsafe {
-        nodes.set_len(total_nodes)
-    };
-    Some((nodes, total_nodes))
-}
-
 /// Try to GPU-batch all columns in one pass.
 ///
 /// Engaged for Goldilocks-base and ext3 tables whose LDE size is above the
@@ -303,6 +280,7 @@ fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> {
 /// Returns `Some(())` if the batch was handled on GPU and `columns` now holds
 /// the LDE evaluations, or if there were no columns to expand. Returns `None`
 /// to let the caller run the per-column CPU fallback.
+#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))]
 pub(crate) fn try_expand_columns_batched<F, E>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
@@ -598,6 +576,7 @@ where
 /// transform uses only base-field twiddles and coset weights, which act
 /// componentwise on ext3, so the per-component result equals the ext3 LDE the
 /// CPU path computes.
+#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))]
 fn try_expand_columns_batched_ext3<F, E>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
@@ -757,8 +736,8 @@ where
 /// host-side ext3 LDE eval Vecs produced by
 /// [`try_evaluate_parts_on_lde_gpu_keep`] (or the CPU path). Uses the same
 /// row-pair leaf pattern as the CPU
-/// `commit_composition_polynomial`: each leaf hashes 2 consecutive
-/// bit-reversed rows.
+/// `commit_bit_reversed` (composition-polynomial commit path): each leaf hashes
+/// 2 consecutive bit-reversed rows.
 ///
 /// Returns `None` to fall through to the CPU path when the type or size
 /// conditions don't hold; returns `None` on a math-cuda `Err` so the caller
diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs
index 16ff95082..aa5cc5436 100644
--- a/crypto/stark/src/instruments.rs
+++ b/crypto/stark/src/instruments.rs
@@ -33,7 +33,7 @@ pub struct TableSubOps {
     pub constraints: Duration,
     /// decompose_and_extend_d2
     pub comp_decompose: Duration,
-    /// commit_composition_polynomial
+    /// commit_bit_reversed (composition-polynomial commit step)
     pub comp_commit: Duration,
     /// Round 3: barycentric OOD evaluation
     pub ood: Duration,
@@ -52,11 +52,11 @@ pub struct TableSubOps {
 pub struct Round1SubOps {
     /// Main trace: expand_columns_to_lde (LDE/FFT)
     pub main_lde: Duration,
-    /// Main trace: commit_columns_bit_reversed (Merkle)
+    /// Main trace: commit_bit_reversed (Merkle)
     pub main_merkle: Duration,
     /// Aux trace: expand_columns_to_lde (LDE/FFT)
     pub aux_lde: Duration,
-    /// Aux trace: commit_columns_bit_reversed (Merkle)
+    /// Aux trace: commit_bit_reversed (Merkle)
     pub aux_merkle: Duration,
 }
 
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index e9f6a1cda..87236c5f9 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -5,6 +5,7 @@ compile_error!("the `disk-spill` feature requires memmap2, which does not compil
 
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug;
+pub mod commitment;
 pub mod constraints;
 pub mod context;
 pub mod debug;
diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs
index a20a452b6..cee693e3f 100644
--- a/crypto/stark/src/par.rs
+++ b/crypto/stark/src/par.rs
@@ -37,3 +37,58 @@ where
         (a(), b())
     }
 }
+
+/// Map `f(i)` over `range` and collect into a `Vec`, preserving index order.
+/// Parallel when `feature = "parallel"`, sequential otherwise. Rayon's
+/// `collect()` is index-ordered, so the result is identical either way.
+pub(crate) fn par_map_collect<R: Send>(
+    range: std::ops::Range<usize>,
+    f: impl Fn(usize) -> R + Sync + Send,
+) -> Vec<R> {
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        range.into_par_iter().map(f).collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        range.map(f).collect()
+    }
+}
+
+/// Run `f(&mut item)` for each element of `slice`. Parallel when
+/// `feature = "parallel"`, sequential otherwise (ordering is irrelevant).
+// Only called from the `debug-checks`-gated column-LDE reconstruct path
+// (production LDE is row-major); keep it available without warning otherwise.
+#[cfg_attr(not(feature = "debug-checks"), allow(dead_code))]
+pub(crate) fn par_for_each_mut<T: Send>(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) {
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        slice.par_iter_mut().for_each(f);
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        slice.iter_mut().for_each(f);
+    }
+}
+
+/// Run `f(&mut item)` for each element of `slice`, short-circuiting on the
+/// first `Err`. Parallel when `feature = "parallel"`, sequential otherwise.
+// Only called from `disk-spill`-gated paths; keep it available without warning
+// when that feature is off.
+#[cfg_attr(not(feature = "disk-spill"), allow(dead_code))]
+pub(crate) fn par_try_for_each_mut<T: Send, E: Send>(
+    slice: &mut [T],
+    f: impl Fn(&mut T) -> Result<(), E> + Sync + Send,
+) -> Result<(), E> {
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        slice.par_iter_mut().try_for_each(f)
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        slice.iter_mut().try_for_each(f)
+    }
+}
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 1751d60fe..851c0b37a 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -10,9 +10,14 @@ use crate::{
 
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[serde(bound = "")]
+/// Opening of a bit-reversed, row-paired commitment at one FRI query.
+///
+/// The queried row and its symmetric counterpart (LDE positions `2·iota`,
+/// `2·iota+1`) are committed together as a single leaf at position `iota`, so one
+/// Merkle `proof` authenticates both `evaluations` (the row) and
+/// `evaluations_sym` (its symmetric). Same layout used for trace and composition.
 pub struct PolynomialOpenings<F: IsField> {
     pub proof: Proof<Commitment>,
-    pub proof_sym: Proof<Commitment>,
     pub evaluations: Vec<FieldElement<F>>,
     pub evaluations_sym: Vec<FieldElement<F>>,
 }
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 30554c15e..2ce1cb855 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -12,7 +12,7 @@ use math::fft::two_half_fft::TwoHalfTwiddles;
 use log::info;
 use math::field::traits::{IsField, IsSubFieldOf};
 use math::spill_safe::SpillSafe;
-use math::traits::{AsBytes, ByteConversion};
+use math::traits::AsBytes;
 use math::{
     field::{element::FieldElement, traits::IsFFTField},
     polynomial::Polynomial,
@@ -44,6 +44,8 @@ use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof};
 use super::trace::TraceTable;
 use super::traits::AIR;
 
+pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed};
+
 /// A triple of (AIR, TraceTable, PublicInputs) for proving.
 type AirTracePair<'a, Field, FieldExtension, PI> = (
     &'a dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
@@ -86,6 +88,17 @@ pub enum ProvingError {
     /// out of disk space, fd exhaustion, or mmap failure.
     #[cfg(feature = "disk-spill")]
     DiskSpill(String),
+    /// An internal FFT/LDE computation failed (e.g. domain size exceeds the
+    /// field's two-adicity, or a degenerate coset offset). Distinct from
+    /// `WrongParameter` because the cause is internal prover machinery, not a
+    /// caller-supplied parameter. Carries the underlying `FFTError`'s message.
+    Fft(String),
+}
+
+impl From<FFTError> for ProvingError {
+    fn from(e: FFTError) -> Self {
+        ProvingError::Fft(format!("{e}"))
+    }
 }
 
 /// Commitment artifacts for one trace table (main or auxiliary). Used for both
@@ -432,7 +445,7 @@ where
 
 /// A container for the results of the third round of the STARK Prove protocol.
 pub(crate) struct Round3<F: IsField> {
-    /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge.
+    /// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge.
     trace_ood_evaluations: Table<F>,
     /// Evaluations of the composition polynomial parts at the out-of-domain challenge.
     composition_poly_parts_ood_evaluation: Vec<FieldElement<F>>,
@@ -475,128 +488,6 @@ where
     }
 }
 
-/// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one
-/// leaf per row, where each row is read at `reverse_index(row_idx)` and the
-/// columns are concatenated as big-endian bytes before hashing.
-///
-/// Returns `Vec<Commitment>` with the same length as `columns[0]`. Exposed
-/// (instead of being a closure inside `commit_columns_bit_reversed`) so
-/// parity tests in dependent crates can compare against the same code path
-/// the prover uses.
-pub fn keccak_leaves_bit_reversed<E>(columns: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
-where
-    E: IsField,
-    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
-{
-    if columns.is_empty() || columns[0].is_empty() {
-        return Vec::new();
-    }
-
-    let num_rows = columns[0].len();
-    let num_cols = columns.len();
-    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
-
-    debug_assert!(
-        num_rows.is_power_of_two(),
-        "num_rows must be a power of two for reverse_index"
-    );
-
-    let total_bytes = num_cols * byte_len;
-
-    let hash_leaf = |buf: &mut [u8], row_idx: usize| -> Commitment {
-        let br_idx = reverse_index(row_idx, num_rows as u64);
-        for col_idx in 0..num_cols {
-            columns[col_idx][br_idx]
-                .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]);
-        }
-        BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
-    };
-
-    #[cfg(feature = "parallel")]
-    let iter = (0..num_rows).into_par_iter();
-    #[cfg(not(feature = "parallel"))]
-    let iter = 0..num_rows;
-
-    // Per-thread buffer reuse: map_init allocates one buffer per Rayon thread,
-    // eliminating millions of small heap allocations under parallel contention.
-    #[cfg(feature = "parallel")]
-    let result: Vec<Commitment> = iter
-        .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i))
-        .collect();
-
-    #[cfg(not(feature = "parallel"))]
-    let result: Vec<Commitment> = {
-        let mut buf = vec![0u8; total_bytes];
-        iter.map(|i| hash_leaf(&mut buf, i)).collect()
-    };
-
-    result
-}
-
-/// Compute Keccak-256 leaf hashes for `commit_composition_polynomial`: one
-/// leaf per row-pair, where leaf `i` hashes the BE concatenation of
-/// `parts[..][br_0] ++ parts[..][br_1]` with
-/// `br_k = reverse_index(2*i + k, num_rows)`.
-///
-/// Returns `Vec<Commitment>` of length `parts[0].len() / 2`.
-pub fn keccak_leaves_row_pair_bit_reversed<E>(parts: &[Vec<FieldElement<E>>]) -> Vec<Commitment>
-where
-    E: IsField,
-    FieldElement<E>: AsBytes + Sync + Send + ByteConversion,
-{
-    let num_parts = parts.len();
-    if num_parts == 0 {
-        return Vec::new();
-    }
-    let num_rows = parts[0].len();
-    if num_rows == 0 {
-        return Vec::new();
-    }
-
-    let num_leaves = num_rows / 2;
-    debug_assert!(
-        num_rows.is_power_of_two(),
-        "num_rows must be a power of two for reverse_index"
-    );
-
-    let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
-
-    let total_bytes = 2 * num_parts * byte_len;
-
-    let hash_leaf_pair = |buf: &mut [u8], leaf_idx: usize| -> Commitment {
-        let br_0 = reverse_index(2 * leaf_idx, num_rows as u64);
-        let br_1 = reverse_index(2 * leaf_idx + 1, num_rows as u64);
-        let mut offset = 0;
-        for part in parts.iter() {
-            part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]);
-            offset += byte_len;
-        }
-        for part in parts.iter() {
-            part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]);
-            offset += byte_len;
-        }
-        BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
-    };
-
-    #[cfg(feature = "parallel")]
-    let iter = (0..num_leaves).into_par_iter();
-    #[cfg(not(feature = "parallel"))]
-    let iter = 0..num_leaves;
-
-    #[cfg(feature = "parallel")]
-    let result: Vec<Commitment> = iter
-        .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf_pair(buf, i))
-        .collect();
-
-    #[cfg(not(feature = "parallel"))]
-    let result: Vec<Commitment> = {
-        let mut buf = vec![0u8; total_bytes];
-        iter.map(|i| hash_leaf_pair(&mut buf, i)).collect()
-    };
-
-    result
-}
-
 /// The functionality of a STARK prover providing methods to run the STARK Prove protocol
 /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html
 /// The default implementation is complete and is compatible with Stone prover
@@ -615,34 +506,11 @@ pub trait IsStarkProver<
     FieldElement<Field>: math::traits::ByteConversion,
     FieldElement<FieldExtension>: math::traits::ByteConversion,
 {
-    /// Builds a Merkle tree commitment from column-major LDE evaluations with
-    /// bit-reverse permutation, without cloning the full evaluation matrix.
-    ///
-    /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...`
-    /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle
-    /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow,
-    /// but avoids allocating the cloned and transposed matrices entirely.
-    fn commit_columns_bit_reversed<E>(
-        columns: &[Vec<FieldElement<E>>],
-    ) -> Option<(BatchedMerkleTree<E>, Commitment)>
-    where
-        FieldElement<E>: AsBytes + Sync + Send + math::traits::ByteConversion,
-        E: IsField,
-    {
-        if columns.is_empty() || columns[0].is_empty() {
-            return None;
-        }
-        let hashed_leaves = keccak_leaves_bit_reversed(columns);
-        let tree = BatchedMerkleTree::<E>::build_from_hashed_leaves(hashed_leaves)?;
-        let root = tree.root;
-        Some((tree, root))
-    }
-
-    /// Row-major counterpart of [`commit_columns_bit_reversed`]: commit a
-    /// row-major flat buffer (`num_rows * num_cols`) by hashing each leaf from
-    /// the row at `reverse_index(row_idx)`. The leaf bytes are identical to the
-    /// column-major path (same row values), so the Merkle root is identical —
-    /// only the read pattern changes (contiguous row slice, no column gather).
+    /// Commit a row-major flat buffer (`num_rows * num_cols`) by hashing pairs
+    /// of consecutive bit-reversed rows into each Merkle leaf (`ROWS_PER_LEAF = 2`).
+    /// The byte layout per leaf matches `keccak_leaves_bit_reversed_grouped(columns, 2)`:
+    /// leaf i = hash( row[br(2i)] ++ row[br(2i+1)] ), read as contiguous slices from
+    /// the row-major buffer — no transpose needed.
     fn commit_rows_bit_reversed<E>(
         data: &[FieldElement<E>],
         num_cols: usize,
@@ -654,10 +522,10 @@ pub trait IsStarkProver<
         Self::commit_rows_bit_reversed_subset(data, num_cols, 0, num_cols)
     }
 
-    /// Subset variant of [`commit_rows_bit_reversed`]: hash only columns in the
-    /// contiguous range `[col_start..col_end)` of each row. Used for
-    /// preprocessed traces where precomputed cols and multiplicity cols commit
-    /// to separate Merkle trees from the same row-major buffer.
+    /// Subset variant of [`commit_rows_bit_reversed`]: hash pairs of bit-reversed rows
+    /// from the column range `[col_start..col_end)`. Used for preprocessed traces where
+    /// precomputed cols and multiplicity cols commit to separate Merkle trees from the
+    /// same row-major buffer, both using the row-pair (`ROWS_PER_LEAF = 2`) leaf layout.
     fn commit_rows_bit_reversed_subset<E>(
         data: &[FieldElement<E>],
         num_cols: usize,
@@ -679,44 +547,45 @@ pub trait IsStarkProver<
         if num_rows == 0 {
             return None;
         }
-        let subset_cols = col_end - col_start;
-        let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
-        let row_bytes = subset_cols * byte_len;
-
         debug_assert!(
             num_rows.is_power_of_two(),
             "num_rows must be a power of two for reverse_index"
         );
 
+        // Local alias for the canonical constant, used several times below.
+        const ROWS_PER_LEAF: usize = crate::commitment::ROWS_PER_LEAF;
+        let num_leaves = num_rows / ROWS_PER_LEAF;
+        let subset_cols = col_end - col_start;
+        let byte_len = <FieldElement<E> as ByteConversion>::BYTE_LEN;
+        let leaf_bytes = ROWS_PER_LEAF * subset_cols * byte_len;
+
+        let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment {
+            let mut offset = 0;
+            for k in 0..ROWS_PER_LEAF {
+                let br_idx = reverse_index(ROWS_PER_LEAF * leaf_idx + k, num_rows as u64);
+                let row_start = br_idx * num_cols;
+                let row = &data[row_start + col_start..row_start + col_end];
+                for elem in row.iter() {
+                    elem.write_bytes_be(&mut buf[offset..offset + byte_len]);
+                    offset += byte_len;
+                }
+            }
+            BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
+        };
+
         #[cfg(feature = "parallel")]
-        let hashed_leaves: Vec<Commitment> = (0..num_rows)
+        let hashed_leaves: Vec<Commitment> = (0..num_leaves)
             .into_par_iter()
             .map_init(
-                || vec![0u8; row_bytes],
-                |buf, row_idx| {
-                    let br_idx = reverse_index(row_idx, num_rows as u64);
-                    let row_start = br_idx * num_cols;
-                    let row = &data[row_start + col_start..row_start + col_end];
-                    for (i, elem) in row.iter().enumerate() {
-                        elem.write_bytes_be(&mut buf[i * byte_len..(i + 1) * byte_len]);
-                    }
-                    BatchedMerkleTreeBackend::<E>::hash_bytes(buf)
-                },
+                || vec![0u8; leaf_bytes],
+                |buf, leaf_idx| hash_leaf(buf, leaf_idx),
             )
             .collect();
         #[cfg(not(feature = "parallel"))]
         let hashed_leaves: Vec<Commitment> = {
-            let mut buf = vec![0u8; row_bytes];
-            (0..num_rows)
-                .map(|row_idx| {
-                    let br_idx = reverse_index(row_idx, num_rows as u64);
-                    let row_start = br_idx * num_cols;
-                    let row = &data[row_start + col_start..row_start + col_end];
-                    for (i, elem) in row.iter().enumerate() {
-                        elem.write_bytes_be(&mut buf[i * byte_len..(i + 1) * byte_len]);
-                    }
-                    BatchedMerkleTreeBackend::<E>::hash_bytes(&buf)
-                })
+            let mut buf = vec![0u8; leaf_bytes];
+            (0..num_leaves)
+                .map(|leaf_idx| hash_leaf(&mut buf, leaf_idx))
                 .collect()
         };
 
@@ -747,7 +616,8 @@ pub trait IsStarkProver<
         let twiddles = LdeTwiddles::new(&domain);
         let evals =
             Self::compute_lde_from_columns_cached::<Field>(&precomputed, &domain, &twiddles);
-        let (_, commitment) = Self::commit_columns_bit_reversed(&evals)?;
+        let (_, commitment) =
+            crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?;
         Some(commitment)
     }
 
@@ -773,23 +643,16 @@ pub trait IsStarkProver<
             return Vec::new();
         }
 
-        #[cfg(not(feature = "parallel"))]
-        let columns_iter = columns.iter();
-        #[cfg(feature = "parallel")]
-        let columns_iter = columns.par_iter();
-
-        columns_iter
-            .map(|col| {
-                Polynomial::coset_lde_full::<Field>(
-                    col,
-                    domain.blowup_factor,
-                    &twiddles.coset_weights,
-                    &twiddles.inv,
-                    &twiddles.fwd,
-                )
-            })
-            .collect::<Result<Vec<Vec<FieldElement<E>>>, _>>()
+        crate::par::par_map_collect(0..columns.len(), |i| {
+            Polynomial::coset_lde_full::<Field>(
+                &columns[i],
+                domain.blowup_factor,
+                &twiddles.coset_weights,
+                &twiddles.inv,
+                &twiddles.fwd,
+            )
             .expect("coset LDE computation")
+        })
     }
 
     /// Expand each column in-place from N evaluations to N×blowup LDE evaluations.
@@ -828,11 +691,7 @@ pub trait IsStarkProver<
             return;
         }
 
-        #[cfg(feature = "parallel")]
-        let iter = columns.par_iter_mut();
-        #[cfg(not(feature = "parallel"))]
-        let iter = columns.iter_mut();
-        iter.for_each(|buf| {
+        crate::par::par_for_each_mut(columns, |buf| {
             Polynomial::coset_lde_full_expand::<Field>(
                 buf,
                 domain.blowup_factor,
@@ -944,10 +803,7 @@ pub trait IsStarkProver<
                 let (mut tree, root) = Self::commit_rows_bit_reversed(&main_data, total_cols)
                     .ok_or(ProvingError::EmptyCommitment)?;
                 #[cfg(feature = "disk-spill")]
-                if storage_mode == StorageMode::Disk {
-                    tree.spill_nodes_to_disk()
-                        .map_err(|e| ProvingError::DiskSpill(format!("main Merkle tree: {e}")))?;
-                }
+                Self::spill_tree(&mut tree, storage_mode, "main Merkle tree")?;
                 TableCommit::plain(tree, root)
             }
             Some((expected_precomputed_root, num_precomputed)) => {
@@ -972,13 +828,13 @@ pub trait IsStarkProver<
                     return Err(ProvingError::PrecomputedCommitmentMismatch);
                 }
                 #[cfg(feature = "disk-spill")]
-                if storage_mode == StorageMode::Disk {
-                    precomputed_tree.spill_nodes_to_disk().map_err(|e| {
-                        ProvingError::DiskSpill(format!("precomputed Merkle tree: {e}"))
-                    })?;
-                    mult_tree
-                        .spill_nodes_to_disk()
-                        .map_err(|e| ProvingError::DiskSpill(format!("mult Merkle tree: {e}")))?;
+                {
+                    Self::spill_tree(
+                        &mut precomputed_tree,
+                        storage_mode,
+                        "precomputed Merkle tree",
+                    )?;
+                    Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?;
                 }
                 TableCommit::preprocessed(
                     mult_tree,
@@ -999,6 +855,26 @@ pub trait IsStarkProver<
         Ok((commit, (main_data, total_cols)))
     }
 
+    /// Spill a committed Merkle tree to disk when `storage_mode` is `Disk`,
+    /// tagging any I/O error with `label`. No-op otherwise. Shared by every commit
+    /// site (main / preprocessed split / aux).
+    #[cfg(feature = "disk-spill")]
+    fn spill_tree<C>(
+        tree: &mut BatchedMerkleTree<C>,
+        storage_mode: StorageMode,
+        label: &str,
+    ) -> Result<(), ProvingError>
+    where
+        C: IsField,
+        FieldElement<C>: AsBytes + Sync + Send,
+    {
+        if storage_mode == StorageMode::Disk {
+            tree.spill_nodes_to_disk()
+                .map_err(|e| ProvingError::DiskSpill(format!("{label}: {e}")))?;
+        }
+        Ok(())
+    }
+
     /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments.
     ///
     /// Only used by `run_debug_checks` — Phase D consumes the cached LDE
@@ -1124,30 +1000,6 @@ pub trait IsStarkProver<
         }
     }
 
-    /// Returns the Merkle tree and the commitment to the evaluations of the parts of the
-    /// composition polynomial.
-    fn commit_composition_polynomial(
-        lde_composition_poly_parts_evaluations: &[Vec<FieldElement<FieldExtension>>],
-    ) -> Option<(BatchedMerkleTree<FieldExtension>, Commitment)>
-    where
-        FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<FieldExtension>: AsBytes + Sync + Send + math::traits::ByteConversion,
-    {
-        let num_parts = lde_composition_poly_parts_evaluations.len();
-        if num_parts == 0 {
-            return None;
-        }
-        let num_rows = lde_composition_poly_parts_evaluations[0].len();
-        if num_rows == 0 {
-            return None;
-        }
-        let hashed_leaves =
-            keccak_leaves_row_pair_bit_reversed(lde_composition_poly_parts_evaluations);
-        let tree = BatchedMerkleTree::<FieldExtension>::build_from_hashed_leaves(hashed_leaves)?;
-        let root = tree.root;
-        Some((tree, root))
-    }
-
     /// Algebraically decompose H(x) = H₀(x²) + x·H₁(x²) on the LDE coset, then
     /// extend each half to the full LDE domain. This replaces the expensive
     /// iFFT(2N) + break_in_parts + FFT(2N)×2 pipeline with:
@@ -1294,11 +1146,10 @@ pub trait IsStarkProver<
         } else {
             // Fallback for any future AIR with d > 2.
             let composition_poly =
-                Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)
-                    .unwrap();
+                Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)?;
             let composition_poly_parts = composition_poly.break_in_parts(number_of_parts);
 
-            let cpu_eval = || -> Vec<Vec<FieldElement<FieldExtension>>> {
+            let cpu_eval = || -> Result<Vec<Vec<FieldElement<FieldExtension>>>, ProvingError> {
                 composition_poly_parts
                     .iter()
                     .map(|part| {
@@ -1308,7 +1159,7 @@ pub trait IsStarkProver<
                             domain.interpolation_domain_size,
                             &domain.coset_offset,
                         )
-                        .unwrap()
+                        .map_err(ProvingError::from)
                     })
                     .collect()
             };
@@ -1333,11 +1184,11 @@ pub trait IsStarkProver<
                         gpu_composition_parts = Some(handle);
                         evals
                     }
-                    None => cpu_eval(),
+                    None => cpu_eval()?,
                 }
             }
             #[cfg(not(feature = "cuda"))]
-            cpu_eval()
+            cpu_eval()?
         };
         #[cfg(feature = "instruments")]
         let fft_dur = t_sub.elapsed();
@@ -1359,8 +1210,11 @@ pub trait IsStarkProver<
                 let root = tree.root;
                 (tree, root)
             }
-            None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
-                .ok_or(ProvingError::EmptyCommitment)?,
+            None => crate::commitment::commit_bit_reversed(
+                &lde_composition_poly_parts_evaluations,
+                crate::commitment::ROWS_PER_LEAF,
+            )
+            .ok_or(ProvingError::EmptyCommitment)?,
         };
         #[cfg(feature = "instruments")]
         let merkle_dur = t_sub.elapsed();
@@ -1725,12 +1579,7 @@ pub trait IsStarkProver<
             })
             .collect();
 
-        #[cfg(feature = "parallel")]
-        let iter = (0..lde_size).into_par_iter();
-        #[cfg(not(feature = "parallel"))]
-        let iter = 0..lde_size;
-
-        iter.map(|i| {
+        crate::par::par_map_collect(0..lde_size, |i| {
             let mut result = FieldElement::<FieldExtension>::zero();
 
             // H terms
@@ -1756,7 +1605,6 @@ pub trait IsStarkProver<
 
             result
         })
-        .collect()
     }
 
     /// Computes values and validity proofs of the evaluations of the composition polynomial parts
@@ -1773,7 +1621,7 @@ pub trait IsStarkProver<
     {
         let proof = composition_poly_merkle_tree
             .get_proof_by_pos(index)
-            .unwrap();
+            .expect("FRI query index in bounds");
 
         let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations
             .iter()
@@ -1786,8 +1634,7 @@ pub trait IsStarkProver<
             .collect();
 
         PolynomialOpenings {
-            proof: proof.clone(),
-            proof_sym: proof,
+            proof,
             evaluations: lde_composition_poly_parts_evaluation
                 .clone()
                 .into_iter()
@@ -1817,13 +1664,15 @@ pub trait IsStarkProver<
         G: Fn(usize) -> Vec<FieldElement<C>>,
     {
         let domain_size = domain.lde_roots_of_unity_coset.len() as u64;
-        let index = challenge * 2;
-        let index_sym = challenge * 2 + 1;
+        // Rows `2·challenge` and `2·challenge+1` are committed together as the
+        // single leaf at position `challenge`; one Merkle path authenticates both
+        // the queried row and its symmetric counterpart.
         PolynomialOpenings {
-            proof: tree.get_proof_by_pos(index).unwrap(),
-            proof_sym: tree.get_proof_by_pos(index_sym).unwrap(),
-            evaluations: gather(reverse_index(index, domain_size)),
-            evaluations_sym: gather(reverse_index(index_sym, domain_size)),
+            proof: tree
+                .get_proof_by_pos(challenge)
+                .expect("FRI query index in bounds"),
+            evaluations: gather(reverse_index(challenge * 2, domain_size)),
+            evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)),
         }
     }
 
@@ -1889,7 +1738,7 @@ pub trait IsStarkProver<
         openings
     }
 
-    // TODO: propagate errors instead of unwrap() in commit_columns, reconstruct_round1, and expand_columns_to_lde
+    // TODO: propagate errors instead of unwrap() in commit_main_trace, reconstruct_round1, and expand_columns_to_lde
     /// Generates STARK proofs for one or more AIRs with a shared transcript.
     ///
     /// # Multi-Table Proving with LogUp
@@ -1988,11 +1837,7 @@ pub trait IsStarkProver<
         // Spill main traces to mmap before Round 1 LDE.
         #[cfg(feature = "disk-spill")]
         if storage_mode == StorageMode::Disk {
-            #[cfg(feature = "parallel")]
-            let spill_iter = air_trace_pairs.par_iter_mut();
-            #[cfg(not(feature = "parallel"))]
-            let mut spill_iter = air_trace_pairs.iter_mut();
-            spill_iter.try_for_each(|(_, trace, _)| {
+            crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(_, trace, _)| {
                 trace
                     .main_table
                     .spill_to_disk()
@@ -2029,13 +1874,8 @@ pub trait IsStarkProver<
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_range = chunk_start..chunk_end;
 
-            #[cfg(feature = "parallel")]
-            let iter = chunk_range.into_par_iter();
-            #[cfg(not(feature = "parallel"))]
-            let iter = chunk_range;
-
-            let chunk_results: Vec<Result<_, ProvingError>> = iter
-                .map(|idx| {
+            let chunk_results: Vec<Result<_, ProvingError>> =
+                crate::par::par_map_collect(chunk_range, |idx| {
                     let (air, trace, _) = &air_trace_pairs[idx];
                     let domain = &domains[idx];
                     let twiddles = &twiddle_caches[idx];
@@ -2051,8 +1891,7 @@ pub trait IsStarkProver<
                         #[cfg(feature = "disk-spill")]
                         storage_mode,
                     )
-                })
-                .collect();
+                });
 
             // Sequential: append roots to shared transcript (Fiat-Shamir ordering)
             for result in chunk_results {
@@ -2124,17 +1963,13 @@ pub trait IsStarkProver<
         // Spill all aux trace tables to mmap before any Round 1 aux LDE work.
         #[cfg(feature = "disk-spill")]
         if storage_mode == StorageMode::Disk {
-            #[cfg(feature = "parallel")]
-            let spill_iter = air_trace_pairs.par_iter_mut();
-            #[cfg(not(feature = "parallel"))]
-            let mut spill_iter = air_trace_pairs.iter_mut();
-            spill_iter.try_for_each(|(air, trace, _)| {
+            crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(air, trace, _)| {
                 if air.has_aux_trace() {
                     trace
                         .spill_aux_to_disk()
                         .map_err(|e| ProvingError::DiskSpill(format!("aux trace: {e}")))?;
                 }
-                Ok(())
+                Ok::<(), ProvingError>(())
             })?;
         }
 
@@ -2180,14 +2015,9 @@ pub trait IsStarkProver<
             let chunk_end = (chunk_start + k).min(num_airs);
             let chunk_range = chunk_start..chunk_end;
 
-            #[cfg(feature = "parallel")]
-            let iter = chunk_range.into_par_iter();
-            #[cfg(not(feature = "parallel"))]
-            let iter = chunk_range;
-
             #[allow(clippy::type_complexity)]
-            let chunk_aux: Vec<Result<AuxResult<FieldExtension>, ProvingError>> = iter
-                .map(|idx| {
+            let chunk_aux: Vec<Result<AuxResult<FieldExtension>, ProvingError>> =
+                crate::par::par_map_collect(chunk_range, |idx| {
                     let (air, trace, _) = &air_trace_pairs[idx];
                     let domain = &domains[idx];
                     let twiddles = &twiddle_caches[idx];
@@ -2200,7 +2030,11 @@ pub trait IsStarkProver<
                         #[cfg(feature = "cuda")]
                         {
                             let (trace_slice, num_cols) = trace.aux_data_row_major();
-                            let n = if num_cols > 0 { trace_slice.len() / num_cols } else { 0 };
+                            let n = if num_cols > 0 {
+                                trace_slice.len() / num_cols
+                            } else {
+                                0
+                            };
                             #[cfg(feature = "instruments")]
                             let t_sub = Instant::now();
                             if let Some((tree, handle, aux_data)) =
@@ -2209,7 +2043,10 @@ pub trait IsStarkProver<
                                     FieldExtension,
                                     BatchedMerkleTreeBackend<FieldExtension>,
                                 >(
-                                    trace_slice, n, num_cols, domain.blowup_factor,
+                                    trace_slice,
+                                    n,
+                                    num_cols,
+                                    domain.blowup_factor,
                                     &twiddles.coset_weights,
                                 )
                             {
@@ -2258,33 +2095,26 @@ pub trait IsStarkProver<
                         #[cfg(feature = "instruments")]
                         let t_sub = Instant::now();
                         #[allow(unused_mut)]
-                        let (mut tree, root) = Self::commit_rows_bit_reversed(&aux_data, total_cols)
-                            .ok_or(ProvingError::EmptyCommitment)?;
+                        let (mut tree, root) =
+                            Self::commit_rows_bit_reversed(&aux_data, total_cols)
+                                .ok_or(ProvingError::EmptyCommitment)?;
+                        #[cfg(feature = "disk-spill")]
+                        Self::spill_tree(&mut tree, storage_mode, "aux Merkle tree")?;
+                        let commit = TableCommit::plain(tree, root);
                         #[cfg(feature = "instruments")]
                         crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());
 
-                        #[cfg(feature = "disk-spill")]
-                        if storage_mode == StorageMode::Disk {
-                            tree.spill_nodes_to_disk().map_err(|e| {
-                                ProvingError::DiskSpill(format!("aux Merkle tree: {e}"))
-                            })?;
-                        }
                         #[cfg(feature = "cuda")]
-                        return Ok((
-                            Some(TableCommit::plain(tree, root)),
-                            (aux_data, total_cols),
-                            None,
-                        ));
+                        return Ok((Some(commit), (aux_data, total_cols), None));
                         #[cfg(not(feature = "cuda"))]
-                        Ok((Some(TableCommit::plain(tree, root)), (aux_data, total_cols)))
+                        Ok((Some(commit), (aux_data, total_cols)))
                     } else {
                         #[cfg(feature = "cuda")]
                         return Ok((None, (Vec::new(), 0), None));
                         #[cfg(not(feature = "cuda"))]
                         Ok((None, (Vec::new(), 0)))
                     }
-                })
-                .collect();
+                });
 
             // Sequential: append aux roots to forked transcripts.
             for (j, result) in chunk_aux.into_iter().enumerate() {
@@ -2500,7 +2330,7 @@ pub trait IsStarkProver<
 
     // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations
     /// Executes rounds 2-4 and generates a STARK proof for the trace `main_trace` with public inputs `pub_inputs`.
-    /// Warning: the transcript must be safely initializated before passing it to this method.
+    /// Warning: the transcript must be safely initialized before passing it to this method.
     fn prove_rounds_2_to_4(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         pub_inputs: &PI,
@@ -2514,7 +2344,7 @@ pub trait IsStarkProver<
         FieldElement<FieldExtension>: AsBytes,
         PI: Send + Sync + Clone,
     {
-        info!("Started proof generation...");
+        log::debug!("Started proof generation...");
 
         // ===================================
         // ==========|   Round 2   |==========
@@ -2628,7 +2458,7 @@ pub trait IsStarkProver<
             });
         }
 
-        info!("End proof generation");
+        log::debug!("End proof generation");
 
         Ok(StarkProof {
             // [t]
diff --git a/crypto/stark/src/tests/commitment_tests.rs b/crypto/stark/src/tests/commitment_tests.rs
new file mode 100644
index 000000000..f1684112b
--- /dev/null
+++ b/crypto/stark/src/tests/commitment_tests.rs
@@ -0,0 +1,96 @@
+//! Unit tests for the Merkle commitment layer (`crate::commitment`): they pin
+//! the bit-reversed, row-grouped leaf byte layout that the GPU kernels and the
+//! verifier's `verify_opening_pair` must match. Previously this layout was only
+//! covered transitively through full prove→verify.
+
+use crate::commitment::{
+    ROWS_PER_LEAF, commit_bit_reversed, keccak_leaves_bit_reversed,
+    keccak_leaves_bit_reversed_grouped, keccak_leaves_row_pair_bit_reversed,
+};
+use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
+use math::fft::bit_reversing::reverse_index;
+use math::field::{element::FieldElement, goldilocks::GoldilocksField};
+use math::traits::ByteConversion;
+
+type F = GoldilocksField;
+type Felt = FieldElement<F>;
+
+/// 3 columns × 8 rows of distinct, nonzero values.
+fn sample_columns() -> Vec<Vec<Felt>> {
+    (0..3u64)
+        .map(|c| (0..8u64).map(|r| Felt::from(100 * c + r + 1)).collect())
+        .collect()
+}
+
+/// Independent reference for one leaf, written straight from the module-doc
+/// layout (`rows_per_leaf` consecutive bit-reversed rows, column-major within
+/// each row, big-endian), hashed once with the same backend the prover uses.
+/// Structurally separate from the production `map_init` loop, so a transposed
+/// row/column order or a wrong bit-reversal in production fails this check.
+fn expected_leaf(columns: &[Vec<Felt>], rows_per_leaf: usize, leaf_idx: usize) -> Commitment {
+    let num_rows = columns[0].len();
+    let byte_len = <Felt as ByteConversion>::BYTE_LEN;
+    let mut buf = vec![0u8; rows_per_leaf * columns.len() * byte_len];
+    let mut offset = 0;
+    for k in 0..rows_per_leaf {
+        let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64);
+        for col in columns {
+            col[br].write_bytes_be(&mut buf[offset..offset + byte_len]);
+            offset += byte_len;
+        }
+    }
+    BatchedMerkleTreeBackend::<F>::hash_bytes(&buf)
+}
+
+#[test]
+fn grouped_leaves_match_documented_layout_for_r1_and_r2() {
+    let columns = sample_columns();
+    let num_rows = columns[0].len();
+    for &rows_per_leaf in &[1usize, 2usize] {
+        let leaves = keccak_leaves_bit_reversed_grouped(&columns, rows_per_leaf);
+        assert_eq!(
+            leaves.len(),
+            num_rows / rows_per_leaf,
+            "leaf count for rows_per_leaf={rows_per_leaf}"
+        );
+        for (i, leaf) in leaves.iter().enumerate() {
+            assert_eq!(
+                *leaf,
+                expected_leaf(&columns, rows_per_leaf, i),
+                "leaf {i} for rows_per_leaf={rows_per_leaf}"
+            );
+        }
+    }
+}
+
+#[test]
+fn wrappers_agree_with_grouped() {
+    let columns = sample_columns();
+    assert_eq!(
+        keccak_leaves_bit_reversed(&columns),
+        keccak_leaves_bit_reversed_grouped(&columns, 1)
+    );
+    assert_eq!(
+        keccak_leaves_row_pair_bit_reversed(&columns),
+        keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF)
+    );
+}
+
+#[test]
+fn commit_root_matches_tree_built_over_leaves() {
+    let columns = sample_columns();
+    let leaves = keccak_leaves_bit_reversed_grouped(&columns, ROWS_PER_LEAF);
+    let tree = BatchedMerkleTree::<F>::build_from_hashed_leaves(leaves).unwrap();
+    let (_, root) = commit_bit_reversed(&columns, ROWS_PER_LEAF).unwrap();
+    assert_eq!(root, tree.root);
+}
+
+#[test]
+fn empty_and_zero_row_inputs_short_circuit() {
+    let empty: Vec<Vec<Felt>> = vec![];
+    assert!(keccak_leaves_bit_reversed_grouped(&empty, ROWS_PER_LEAF).is_empty());
+    assert!(commit_bit_reversed(&empty, ROWS_PER_LEAF).is_none());
+    let zero_rows: Vec<Vec<Felt>> = vec![vec![]];
+    assert!(keccak_leaves_bit_reversed_grouped(&zero_rows, ROWS_PER_LEAF).is_empty());
+    assert!(commit_bit_reversed(&zero_rows, ROWS_PER_LEAF).is_none());
+}
diff --git a/crypto/stark/src/tests/mod.rs b/crypto/stark/src/tests/mod.rs
index 8c0897ac1..7a3884832 100644
--- a/crypto/stark/src/tests/mod.rs
+++ b/crypto/stark/src/tests/mod.rs
@@ -2,12 +2,14 @@ pub mod air_tests;
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug_tests;
 pub mod bus_tests;
+pub mod commitment_tests;
 pub mod domain_cache_stats;
 pub mod fri_tests;
 pub mod grinding_tests;
 pub mod proof_options_tests;
 pub mod prove_verify_roundtrip_tests;
 pub mod prover_tests;
+pub mod row_pair_opening_tests;
 pub mod small_trace_tests;
 #[cfg(feature = "disk-spill")]
 pub mod table_disk_spill_tests;
diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs
index ab3589702..cb7fb5c44 100644
--- a/crypto/stark/src/tests/prover_tests.rs
+++ b/crypto/stark/src/tests/prover_tests.rs
@@ -564,7 +564,7 @@ fn test_deep_poly_direct_2n_matches_interpolate_fft_extend() {
 }
 
 #[test]
-fn commit_rows_bit_reversed_matches_commit_columns_bit_reversed() {
+fn commit_rows_bit_reversed_matches_commit_bit_reversed() {
     type F = GoldilocksField;
     type FE = FieldElement<F>;
 
@@ -588,8 +588,12 @@ fn commit_rows_bit_reversed_matches_commit_columns_bit_reversed() {
                 }
             }
 
-            let (_, root_col) = Prover::<F, F, ()>::commit_columns_bit_reversed(&columns)
-                .expect("column-major commit must succeed");
+            // Both commits are row-pair (ROWS_PER_LEAF=2): the column-major
+            // `commitment` path and the row-major prover path must produce the
+            // same Merkle root (identical leaf bytes, only the read pattern differs).
+            let (_, root_col) =
+                crate::commitment::commit_bit_reversed(&columns, crate::commitment::ROWS_PER_LEAF)
+                    .expect("column-major commit must succeed");
             let (_, root_row) = Prover::<F, F, ()>::commit_rows_bit_reversed(&row_major, num_cols)
                 .expect("row-major commit must succeed");
 
diff --git a/crypto/stark/src/tests/row_pair_opening_tests.rs b/crypto/stark/src/tests/row_pair_opening_tests.rs
new file mode 100644
index 000000000..93423f49f
--- /dev/null
+++ b/crypto/stark/src/tests/row_pair_opening_tests.rs
@@ -0,0 +1,73 @@
+//! Negative tests for the row-pair trace opening verification
+//! (`verifier::verify_opening_pair`). The row pair `(2·iota, 2·iota+1)` is
+//! committed as a single Merkle leaf, so one `proof` authenticates both
+//! `evaluations` and `evaluations_sym`. Removing the old separate `proof_sym`
+//! opening deleted the "symmetric opening mismatch" rejection class; these
+//! tests restore it — an implementation that ignored `evaluations_sym` or the
+//! authentication path would otherwise pass every other test.
+
+use crate::tests::trace_test_helpers::make_valid_simple_proof;
+use crate::verifier::{IsStarkVerifier, Verifier};
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use math::field::{element::FieldElement, goldilocks::GoldilocksField};
+
+type Felt = FieldElement<GoldilocksField>;
+
+/// Tampering the value at the symmetric LDE position must break verification:
+/// the committed leaf hashed `evaluations ‖ evaluations_sym`, so a perturbed
+/// `evaluations_sym` no longer reconstructs the committed leaf.
+#[test_log::test]
+fn test_verify_rejects_tampered_main_trace_evaluations_sym() {
+    let (air, mut proof) = make_valid_simple_proof();
+
+    let opening = proof
+        .deep_poly_openings
+        .first_mut()
+        .expect("test precondition: a valid proof has at least one deep poly opening");
+    assert!(
+        !opening.main_trace_polys.evaluations_sym.is_empty(),
+        "test precondition: the main-trace opening has at least one symmetric evaluation",
+    );
+    // Perturb (not resize) the first symmetric evaluation.
+    opening.main_trace_polys.evaluations_sym[0] =
+        &opening.main_trace_polys.evaluations_sym[0] + Felt::one();
+
+    assert!(
+        !Verifier::verify(
+            &proof,
+            &air,
+            &mut DefaultTranscript::<GoldilocksField>::new(&[])
+        ),
+        "Verifier must reject a tampered symmetric trace evaluation"
+    );
+}
+
+/// The row-pair Merkle authentication path itself must be checked. Corrupting a
+/// node in `main_trace_polys.proof.merkle_path` is caught ONLY by
+/// `verify_opening_pair` (the deep-composition reconstruction does not touch the
+/// auth path), so this proves the single row-pair path is actually authenticated
+/// against the committed root rather than ignored.
+#[test_log::test]
+fn test_verify_rejects_tampered_main_trace_merkle_path() {
+    let (air, mut proof) = make_valid_simple_proof();
+
+    let opening = proof
+        .deep_poly_openings
+        .first_mut()
+        .expect("test precondition: a valid proof has at least one deep poly opening");
+    let path = &mut opening.main_trace_polys.proof.merkle_path;
+    assert!(
+        !path.is_empty(),
+        "test precondition: the row-pair trace tree has a non-trivial authentication path",
+    );
+    path[0][0] ^= 0x01;
+
+    assert!(
+        !Verifier::verify(
+            &proof,
+            &air,
+            &mut DefaultTranscript::<GoldilocksField>::new(&[])
+        ),
+        "Verifier must reject a corrupted main-trace Merkle authentication path"
+    );
+}
diff --git a/crypto/stark/src/tests/small_trace_tests.rs b/crypto/stark/src/tests/small_trace_tests.rs
index 8373ae9d6..96e04858d 100644
--- a/crypto/stark/src/tests/small_trace_tests.rs
+++ b/crypto/stark/src/tests/small_trace_tests.rs
@@ -11,37 +11,13 @@ use crate::{
     },
     proof::options::ProofOptions,
     prover::{IsStarkProver, Prover},
+    tests::trace_test_helpers::make_valid_simple_proof,
     traits::AIR,
     verifier::{IsStarkVerifier, Verifier},
 };
 
 type Felt = FieldElement<GoldilocksField>;
 
-fn make_valid_simple_proof() -> (
-    SimpleAdditionAIR<GoldilocksField>,
-    crate::proof::stark::StarkProof<
-        GoldilocksField,
-        GoldilocksField,
-        SimpleAdditionPublicInputs<GoldilocksField>,
-    >,
-) {
-    let mut trace = simple_addition_trace::<GoldilocksField>(2);
-    let proof_options = ProofOptions::default_test_options();
-    let pub_inputs = SimpleAdditionPublicInputs {
-        a: Felt::from(1u64),
-        b: Felt::from(2u64),
-    };
-    let air = SimpleAdditionAIR::<GoldilocksField>::new(&proof_options);
-    let proof = Prover::prove(
-        &air,
-        &mut trace,
-        &pub_inputs,
-        &mut DefaultTranscript::<GoldilocksField>::new(&[]),
-    )
-    .unwrap();
-    (air, proof)
-}
-
 /// Test STARK prove/verify with a single-row trace.
 /// This exercises the FRI protocol with 0 FRI layers (trace_length=1, number_layers=0).
 #[test_log::test]
diff --git a/crypto/stark/src/tests/trace_test_helpers.rs b/crypto/stark/src/tests/trace_test_helpers.rs
index e62d0d3ec..4ef6455b3 100644
--- a/crypto/stark/src/tests/trace_test_helpers.rs
+++ b/crypto/stark/src/tests/trace_test_helpers.rs
@@ -1,8 +1,16 @@
+use crate::examples::simple_addition::{
+    SimpleAdditionAIR, SimpleAdditionPublicInputs, simple_addition_trace,
+};
+use crate::proof::options::ProofOptions;
+use crate::prover::{IsStarkProver, Prover};
 use crate::table::Table;
 use crate::trace::{TraceTable, compute_frame_evaluation_points};
+use crate::traits::AIR;
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
 use itertools::Itertools;
 use math::field::{
     element::FieldElement,
+    goldilocks::GoldilocksField,
     traits::{IsField, IsSubFieldOf},
 };
 use math::polynomial::Polynomial;
@@ -10,6 +18,34 @@ use math::polynomial::Polynomial;
 #[cfg(feature = "parallel")]
 use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
 
+/// Builds a valid 2-row `SimpleAddition` proof. Shared base for the
+/// proof-tamper / rejection tests in `small_trace_tests` and
+/// `row_pair_opening_tests`.
+pub fn make_valid_simple_proof() -> (
+    SimpleAdditionAIR<GoldilocksField>,
+    crate::proof::stark::StarkProof<
+        GoldilocksField,
+        GoldilocksField,
+        SimpleAdditionPublicInputs<GoldilocksField>,
+    >,
+) {
+    let mut trace = simple_addition_trace::<GoldilocksField>(2);
+    let proof_options = ProofOptions::default_test_options();
+    let pub_inputs = SimpleAdditionPublicInputs {
+        a: FieldElement::from(1u64),
+        b: FieldElement::from(2u64),
+    };
+    let air = SimpleAdditionAIR::<GoldilocksField>::new(&proof_options);
+    let proof = Prover::prove(
+        &air,
+        &mut trace,
+        &pub_inputs,
+        &mut DefaultTranscript::<GoldilocksField>::new(&[]),
+    )
+    .unwrap();
+    (air, proof)
+}
+
 /// Reference Horner-based trace-evaluation used as an oracle by the prover
 /// tests (`tests::prover_tests`). The production prover uses the LDE-based
 /// barycentric `get_trace_evaluations_from_lde`; the two are
diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index da4a53f6e..72b77947a 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -645,22 +645,6 @@ where
     Table::new(table_data, table_width)
 }
 
-pub fn columns2rows<F>(columns: Vec<Vec<F>>) -> Vec<Vec<F>>
-where
-    F: Clone,
-{
-    let num_rows = columns[0].len();
-    let num_cols = columns.len();
-
-    (0..num_rows)
-        .map(|row_index| {
-            (0..num_cols)
-                .map(|col_index| columns[col_index][row_index].clone())
-                .collect()
-        })
-        .collect()
-}
-
 pub(crate) fn compute_frame_evaluation_points<F, E>(
     x: &FieldElement<E>,
     frame_offsets: &[usize],
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 68819c76b..03119f617 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -301,25 +301,11 @@ pub trait IsStarkVerifier<
         domain.lde_coset_element(reverse_index(raw, domain.lde_length as u64))
     }
 
-    /// Verifies the validity of the opening proof.
-    fn verify_opening<E>(
-        proof: &Proof<Commitment>,
-        root: &Commitment,
-        index: usize,
-        value: &[FieldElement<E>],
-    ) -> bool
-    where
-        FieldElement<Field>: AsBytes + Sync + Send,
-        FieldElement<E>: AsBytes + Sync + Send,
-        E: IsField,
-        Field: IsSubFieldOf<E>,
-    {
-        proof.verify::<BatchedMerkleTreeBackend<E>>(root, index, &value.to_owned())
-    }
-
-    /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings
-    /// of a `PolynomialOpenings` against the given `root` at iota positions
-    /// `iota*2` and `iota*2 + 1`.
+    /// Verify a row-paired `PolynomialOpenings` against `root`. The row pair
+    /// (`2·iota`, `2·iota+1`) is committed as the single leaf at position `iota`,
+    /// so one Merkle path authenticates both rows: reconstruct the leaf from
+    /// `evaluations ‖ evaluations_sym` and verify once. (Same as the composition
+    /// opening check.)
     fn verify_opening_pair<E>(
         opening: &PolynomialOpenings<E>,
         root: &Commitment,
@@ -331,13 +317,11 @@ pub trait IsStarkVerifier<
         E: IsField,
         Field: IsSubFieldOf<E>,
     {
-        Self::verify_opening::<E>(&opening.proof, root, iota * 2, &opening.evaluations)
-            && Self::verify_opening::<E>(
-                &opening.proof_sym,
-                root,
-                iota * 2 + 1,
-                &opening.evaluations_sym,
-            )
+        let mut value = opening.evaluations.clone();
+        value.extend_from_slice(&opening.evaluations_sym);
+        opening
+            .proof
+            .verify::<BatchedMerkleTreeBackend<E>>(root, iota, &value)
     }
 
     /// Verify opening Open(tⱼ(D_LDE), 𝜐) and Open(tⱼ(D_LDE), -𝜐) for all trace polynomials tⱼ,
diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs
index a33fd3dad..aa5d1caa4 100644
--- a/prover/src/instruments.rs
+++ b/prover/src/instruments.rs
@@ -181,7 +181,7 @@ pub fn print_report(
             let mut sub_ops: Vec<(&str, Duration)> = vec![
                 ("R2  evaluate", total_constraints),
                 ("R2  decompose_and_extend_d2", total_comp_decompose),
-                ("R2  commit_composition_poly", total_comp_commit),
+                ("R2  commit_bit_reversed (comp-poly)", total_comp_commit),
                 ("R3  OOD evaluation", total_ood),
                 ("R4  deep_composition_poly_evals", total_deep_comp),
                 ("R4  interpolate+evaluate_fft", total_deep_extend),
diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs
index 468e2a5b2..c4871765f 100644
--- a/prover/src/tables/bitwise.rs
+++ b/prover/src/tables/bitwise.rs
@@ -25,13 +25,13 @@
 //! All lookups are provided as receivers with negative multiplicity,
 //! meaning other tables send to this table.
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
@@ -195,19 +195,19 @@ pub const fn is_preprocessed() -> bool {
 fn static_commitment(blowup_factor: u8) -> Option<Commitment> {
     match blowup_factor {
         2 => Some([
-            0xfb, 0x46, 0xff, 0x1c, 0xed, 0x4c, 0x97, 0xfb, 0xb2, 0x17, 0x55, 0x24, 0x08, 0x04,
-            0x15, 0xee, 0xbe, 0xa6, 0xee, 0x86, 0x69, 0xaf, 0x3a, 0x4f, 0x9e, 0x2a, 0x44, 0x81,
-            0xf9, 0xb0, 0xf3, 0xff,
+            0xfa, 0x3e, 0xcf, 0x80, 0xfd, 0x95, 0xe5, 0x09, 0x74, 0xd4, 0x55, 0x23, 0xf6, 0x42,
+            0xb6, 0x4b, 0x05, 0xc4, 0xf9, 0x66, 0xc2, 0x4d, 0xff, 0xda, 0x31, 0x47, 0xab, 0x7b,
+            0x0c, 0x6d, 0xc4, 0xcf,
         ]),
         4 => Some([
-            0xb5, 0xc4, 0xc0, 0x80, 0x03, 0x5b, 0xb6, 0x12, 0x78, 0x8c, 0x4d, 0xd4, 0x9e, 0x3d,
-            0xc4, 0xe2, 0xef, 0x95, 0xf0, 0xbf, 0xe8, 0x1d, 0x98, 0xec, 0x7f, 0x58, 0x3a, 0x47,
-            0x18, 0x03, 0x7e, 0xa5,
+            0xff, 0x76, 0x8e, 0x85, 0x4b, 0xdc, 0x32, 0x61, 0x96, 0x16, 0x15, 0x19, 0x73, 0x70,
+            0xf0, 0x64, 0x81, 0xfd, 0x4f, 0x5c, 0xbd, 0x9c, 0x30, 0x26, 0xd5, 0xc0, 0x81, 0xf3,
+            0xce, 0x38, 0x50, 0x3e,
         ]),
         8 => Some([
-            0x8a, 0x18, 0x70, 0x51, 0x34, 0x1a, 0x65, 0xaa, 0x79, 0x17, 0x07, 0x9a, 0xf3, 0x0b,
-            0xcb, 0xd0, 0x7c, 0xe3, 0x2a, 0xce, 0x89, 0x9a, 0xfd, 0xc8, 0x0d, 0x6b, 0x48, 0x43,
-            0x83, 0x5d, 0x18, 0xb8,
+            0x0e, 0x1b, 0xc1, 0x0d, 0xae, 0x64, 0xe7, 0xca, 0xe0, 0x2a, 0x3b, 0xab, 0xd7, 0xd2,
+            0xbb, 0x80, 0xd5, 0x24, 0x5a, 0xce, 0x25, 0xb6, 0x84, 0x77, 0x9c, 0xb5, 0xeb, 0x67,
+            0x61, 0x82, 0x78, 0x3d,
         ]),
         _ => None,
     }
@@ -283,7 +283,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
     let coset_offset = FE::from(options.coset_offset);
 
     #[cfg(feature = "parallel")]
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .par_iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset)
@@ -292,7 +292,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
         .collect();
 
     #[cfg(not(feature = "parallel"))]
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset)
@@ -300,25 +300,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
         })
         .collect();
 
-    // Step 4: Bit-reverse permute (parallel)
-    #[cfg(feature = "parallel")]
-    lde_columns.par_iter_mut().for_each(|col| {
-        in_place_bit_reverse_permute(col);
-    });
-
-    #[cfg(not(feature = "parallel"))]
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    // Step 5: Convert columns to rows for Merkle tree
-    let lde_rows = columns2rows(lde_columns);
-
-    // Step 6: Build Merkle tree over LDE (N * blowup leaves)
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for bitwise LDE");
-
-    tree.root
+    root
 }
 
 /// Returns the preprocessed commitment for the bitwise table.
diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs
index 7bc3c9106..509f86991 100644
--- a/prover/src/tables/decode.rs
+++ b/prover/src/tables/decode.rs
@@ -34,13 +34,13 @@
 use executor::elf::Elf;
 use executor::vm::instruction::decoding::{Instruction, InstructionError};
 use executor::vm::memory::U64HashMap;
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable};
 
@@ -285,7 +285,7 @@ pub fn compute_precomputed_commitment(
     // Step 4: Evaluate polynomials on LDE domain (N * blowup_factor points)
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset)
@@ -293,19 +293,9 @@ pub fn compute_precomputed_commitment(
         })
         .collect();
 
-    // Step 5: Bit-reverse permute (same as prover)
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    // Step 6: Convert columns to rows for Merkle tree
-    let lde_rows = columns2rows(lde_columns);
-
-    // Step 7: Build Merkle tree over LDE (N * blowup leaves)
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for decode LDE");
-
-    tree.root
+    root
 }
 
 // =========================================================================
diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs
index 3575c8ba1..f9f0d1cc4 100644
--- a/prover/src/tables/keccak_rc.rs
+++ b/prover/src/tables/keccak_rc.rs
@@ -8,13 +8,13 @@
 //! committed via a static lookup table (with recompute as fallback for
 //! `ProofOptions` not covered by the static table).
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use executor::vm::instruction::execution::KECCAK_RC;
 
@@ -97,19 +97,19 @@ pub const fn generate_row(round: usize) -> [u64; NUM_PRECOMPUTED_COLS] {
 fn static_commitment(blowup_factor: u8) -> Option<Commitment> {
     match blowup_factor {
         2 => Some([
-            0xe8, 0x06, 0x8b, 0xb2, 0xbd, 0x3d, 0x80, 0xf3, 0x92, 0x95, 0x31, 0x1a, 0xfd, 0x55,
-            0xba, 0x12, 0x3f, 0x76, 0xeb, 0x44, 0x32, 0x57, 0x9d, 0xb7, 0x7f, 0x1e, 0x63, 0xb4,
-            0x98, 0xb5, 0xb0, 0xb7,
+            0xab, 0x7a, 0xad, 0xf5, 0xbf, 0xa2, 0xd5, 0x5c, 0x29, 0x83, 0x83, 0xe6, 0x2e, 0x47,
+            0xa0, 0xa5, 0x22, 0xf9, 0x57, 0x89, 0x5a, 0x5c, 0xbb, 0x1f, 0x34, 0xbc, 0x21, 0x72,
+            0xa9, 0x2c, 0x85, 0xe3,
         ]),
         4 => Some([
-            0xa9, 0xfb, 0xc9, 0x15, 0x1c, 0x22, 0x75, 0xe7, 0x56, 0xeb, 0x6d, 0xf9, 0xfe, 0x83,
-            0x2a, 0xb1, 0xa7, 0x1a, 0x20, 0x71, 0x9b, 0x0c, 0xff, 0x6b, 0x3f, 0x57, 0xc6, 0x84,
-            0x3e, 0xbf, 0xc8, 0xaa,
+            0xfb, 0x42, 0x58, 0x76, 0xf4, 0x30, 0x98, 0x04, 0xef, 0x8c, 0x4e, 0x65, 0xf7, 0x1a,
+            0x29, 0x03, 0xd2, 0xc6, 0x12, 0x0d, 0x18, 0xe1, 0x28, 0x6e, 0x70, 0xeb, 0xa8, 0x11,
+            0x3c, 0x3e, 0xe1, 0xdd,
         ]),
         8 => Some([
-            0x5c, 0x30, 0xf6, 0xa0, 0xcf, 0x78, 0x43, 0x15, 0x5b, 0x5d, 0x18, 0x34, 0x44, 0xba,
-            0x81, 0x9a, 0x64, 0x05, 0x5c, 0x79, 0x26, 0x18, 0x09, 0x24, 0x6b, 0xa2, 0x3f, 0x5f,
-            0x77, 0x09, 0xd5, 0xfc,
+            0x9f, 0x91, 0xaf, 0xb6, 0x5b, 0x75, 0x1e, 0xfb, 0x73, 0x93, 0x2c, 0xc4, 0xa8, 0xe1,
+            0xb5, 0x21, 0x91, 0x5d, 0x6a, 0x19, 0x2e, 0x1d, 0xa8, 0x80, 0x21, 0x1f, 0x36, 0x76,
+            0x9b, 0x8e, 0x3d, 0xb6,
         ]),
         _ => None,
     }
@@ -144,7 +144,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
     // Evaluate on LDE domain
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset)
@@ -152,17 +152,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment {
         })
         .collect();
 
-    // Bit-reverse permute
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    // Build Merkle tree
-    let lde_rows = columns2rows(lde_columns);
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for keccak_rc LDE");
-
-    tree.root
+    root
 }
 
 /// Returns the preprocessed commitment for the keccak_rc table.
diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs
index 174225ffa..2d1059bcc 100644
--- a/prover/src/tables/page.rs
+++ b/prover/src/tables/page.rs
@@ -32,13 +32,13 @@
 
 use std::collections::HashMap;
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable};
 
@@ -250,19 +250,19 @@ pub fn generate_page_trace(
 pub(crate) fn static_zero_page_commitment(blowup_factor: u8) -> Option<Commitment> {
     match blowup_factor {
         2 => Some([
-            0xf9, 0x80, 0x0e, 0x45, 0x72, 0x5a, 0x8e, 0x8e, 0x5e, 0xd7, 0x5b, 0x60, 0xce, 0xd0,
-            0x8e, 0xa3, 0x27, 0x3b, 0x8a, 0xb5, 0x98, 0xc0, 0xe3, 0x16, 0xf6, 0x86, 0x75, 0x39,
-            0x4c, 0xe5, 0x88, 0x5e,
+            0x7d, 0x74, 0x85, 0xf0, 0x2b, 0x74, 0xe0, 0x3f, 0x14, 0x99, 0xb3, 0xa0, 0x5f, 0x1d,
+            0x6e, 0xf2, 0x21, 0xff, 0xaf, 0x24, 0x7e, 0x30, 0xb0, 0xda, 0x48, 0x79, 0xe1, 0x43,
+            0xee, 0xea, 0x6a, 0x0f,
         ]),
         4 => Some([
-            0x0f, 0xb5, 0x0c, 0xa8, 0x3b, 0x69, 0x4f, 0x91, 0x60, 0xbf, 0x0d, 0x0d, 0xd3, 0x33,
-            0x25, 0x38, 0x11, 0xbb, 0xf8, 0xfd, 0x54, 0xbd, 0x06, 0x7d, 0xd1, 0xeb, 0xa3, 0x58,
-            0xe8, 0x37, 0x45, 0x56,
+            0x5c, 0xcc, 0x5b, 0xb1, 0xe8, 0x11, 0x91, 0x81, 0xbd, 0xdd, 0x39, 0x40, 0x77, 0x87,
+            0xdc, 0x98, 0x06, 0x06, 0x8c, 0x63, 0xcd, 0xfd, 0xf1, 0xda, 0x4a, 0x55, 0x31, 0x4d,
+            0x6a, 0x16, 0x18, 0xd0,
         ]),
         8 => Some([
-            0x4a, 0xfb, 0xc9, 0x6d, 0x46, 0x29, 0xa3, 0xc2, 0x36, 0x14, 0xd8, 0x24, 0x3e, 0xef,
-            0x97, 0x3f, 0xe1, 0xda, 0x2b, 0xf7, 0x87, 0xb6, 0x54, 0xe1, 0xc6, 0x46, 0xc0, 0x85,
-            0x96, 0x7f, 0x7f, 0x48,
+            0xf0, 0xc0, 0x69, 0xed, 0xf8, 0x59, 0xd6, 0x56, 0x15, 0x3c, 0x2f, 0x93, 0x65, 0xd6,
+            0xe9, 0xe9, 0x8e, 0xd1, 0x83, 0x94, 0xf9, 0x75, 0x59, 0xd1, 0xec, 0x16, 0xe1, 0x37,
+            0xd5, 0x32, 0xd6, 0xd9,
         ]),
         _ => None,
     }
@@ -315,7 +315,7 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption
 
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset)
@@ -323,14 +323,9 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption
         })
         .collect();
 
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    let lde_rows = columns2rows(lde_columns);
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for page LDE");
-    tree.root
+    root
 }
 
 /// Returns the zero-init PAGE preprocessed commitment.
diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs
index 09485595a..46c675b65 100644
--- a/prover/src/tables/register.rs
+++ b/prover/src/tables/register.rs
@@ -20,13 +20,13 @@
 
 use std::collections::HashMap;
 
-use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
-use stark::config::{BatchedMerkleTree, Commitment};
+use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed};
+use stark::config::Commitment;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::proof::options::ProofOptions;
 use stark::prover::evaluate_polynomial_on_lde_domain;
-use stark::trace::{TraceTable, columns2rows};
+use stark::trace::TraceTable;
 
 #[cfg(test)]
 use executor::vm::registers::Registers;
@@ -336,7 +336,7 @@ fn commit_register_columns(options: &ProofOptions, columns: Vec<Vec<FE>>) -> Com
 
     let blowup_factor = options.blowup_factor as usize;
     let coset_offset = FE::from(options.coset_offset);
-    let mut lde_columns: Vec<Vec<FE>> = polys
+    let lde_columns: Vec<Vec<FE>> = polys
         .iter()
         .map(|poly| {
             evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset)
@@ -344,14 +344,9 @@ fn commit_register_columns(options: &ProofOptions, columns: Vec<Vec<FE>>) -> Com
         })
         .collect();
 
-    for col in lde_columns.iter_mut() {
-        in_place_bit_reverse_permute(col);
-    }
-
-    let lde_rows = columns2rows(lde_columns);
-    let tree = BatchedMerkleTree::<GoldilocksField>::build(&lde_rows)
+    let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF)
         .expect("Failed to build Merkle tree for register LDE");
-    tree.root
+    root
 }
 
 /// Returns the preprocessed commitment for the REGISTER table.
diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs
index 43e6991cf..a761ac929 100644
--- a/prover/src/tests/decode_tests.rs
+++ b/prover/src/tests/decode_tests.rs
@@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() {
 /// AIR or FFT pipeline changes, this drifts and the test fails —
 /// regenerate via the `print_decode_commitment_for_sub` helper below.
 const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [
-    0x60, 0x66, 0x0b, 0x18, 0x0d, 0x41, 0x08, 0xb3, 0x3a, 0x03, 0x99, 0x03, 0x8c, 0x9d, 0x12, 0x57,
-    0x68, 0x8d, 0xed, 0x13, 0x60, 0xeb, 0x1d, 0x2b, 0xa8, 0xea, 0x1c, 0x76, 0xc9, 0xdd, 0x25, 0xaf,
+    0xe9, 0x71, 0x68, 0xd6, 0x2e, 0xb1, 0xf6, 0x56, 0x61, 0x9d, 0x04, 0x6e, 0x65, 0xed, 0x63, 0x4a,
+    0x27, 0xa3, 0x4d, 0xcb, 0x6c, 0x02, 0x11, 0xd7, 0x65, 0xc9, 0xc9, 0xfd, 0x59, 0x34, 0x41, 0x5f,
 ];
 
 #[test]
diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index cf9bc742c..8033828bf 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -47,7 +47,10 @@ fn gpu_path_fires_end_to_end() {
         "R2 GPU composition LDE did not fire (neither two-halves d2 nor parts>2 path)"
     );
 
-    // R2 comp-poly Merkle tree build, paired with the parts LDE above.
+    // R2 comp-poly Merkle tree build. Dispatched unconditionally (independent of
+    // the parts-count branch above), so it fires for the common degree-2 case
+    // too; a silent CPU fallback would still verify, so this counter is what
+    // guards the GPU comp-poly-tree dispatch.
     assert!(
         gpu_comp_poly_tree_calls() > 0,
         "R2 GPU comp-poly tree did not fire"
@@ -72,3 +75,24 @@ fn gpu_path_fires_end_to_end() {
     let ok = verify(&proof, &elf).expect("verify");
     assert!(ok, "GPU-produced proof failed verification");
 }
+
+/// Focused validation of the GPU row-pair trace commitment: proves a large
+/// trace with the GPU path and verifies the resulting proof. Independent of the
+/// per-round counter assertions in `gpu_path_fires_end_to_end` (the R2 parts-LDE
+/// assertion bit-rotted on main and cuts off before the verify). A wrong GPU
+/// trace-commit leaf layout (1-row vs the new row-pair) would fail verification.
+#[test]
+#[ignore = "requires GPU; run with --ignored --nocapture"]
+fn gpu_proof_verifies_row_pair_commitment() {
+    let elf = asm_elf_bytes("fib_iterative_1M");
+    reset_all_gpu_call_counters();
+    let proof = prove(&elf).expect("prove");
+    assert!(
+        gpu_lde_calls() > 0,
+        "GPU LDE path did not fire (silent CPU fallback would not test the GPU commit)"
+    );
+    assert!(
+        verify(&proof, &elf).expect("verify"),
+        "GPU-produced proof (row-pair commitment) failed verification"
+    );
+}