From faffc88aefedf2aa1a8ba18717f2027953ee2403 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 29 Jun 2026 17:45:01 -0300
Subject: [PATCH 1/5] Revert unrelated FxHashMap op-dedup change (out of scope
 for #715)

The FxHasher/FxHashMap op-dedup micro-optimization is unrelated to the row-major GPU LDE rework and was only applied to 4 of 6 dedup tables. Revert the table maps to std HashMap and drop the hasher; it can land as its own focused PR.
---
 prover/src/tables/branch.rs |  8 +++---
 prover/src/tables/dvrm.rs   |  8 ++++--
 prover/src/tables/lt.rs     |  8 +++---
 prover/src/tables/mul.rs    |  8 ++++--
 prover/src/tables/types.rs  | 55 -------------------------------------
 5 files changed, 18 insertions(+), 69 deletions(-)

diff --git a/prover/src/tables/branch.rs b/prover/src/tables/branch.rs
index 6ecd42e78..9443a81a1 100644
--- a/prover/src/tables/branch.rs
+++ b/prover/src/tables/branch.rs
@@ -33,9 +33,9 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}
 use stark::table::TableView;
 use stark::trace::TraceTable;
 
-use super::types::{
-    BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op,
-};
+use std::collections::HashMap;
+
+use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op};
 
 // =========================================================================
 // Column indices for BRANCH table
@@ -161,7 +161,7 @@ pub fn generate_branch_trace(
     operations: &[BranchOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
     // Deduplicate operations: (pc, offset, register, jalr) -> multiplicity
-    let mut op_map: FxHashMap<BranchOperation, u64> = FxHashMap::default();
+    let mut op_map: HashMap<BranchOperation, u64> = HashMap::new();
     for op in operations {
         *op_map.entry(op.clone()).or_insert(0) += 1;
     }
diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs
index 2dcaae453..3da78dff5 100644
--- a/prover/src/tables/dvrm.rs
+++ b/prover/src/tables/dvrm.rs
@@ -36,9 +36,11 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}
 use stark::table::TableView;
 use stark::trace::TraceTable;
 
+use std::collections::HashMap;
+
 use super::types::{
-    BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, NEG_INV_2_16, NEG_INV_2_32,
-    NEG_INV_2_48, NEG_INV_2_64, SHIFT_16, VmTable, alu_op,
+    BusId, FE, GoldilocksExtension, GoldilocksField, NEG_INV_2_16, NEG_INV_2_32, NEG_INV_2_48,
+    NEG_INV_2_64, SHIFT_16, VmTable, alu_op,
 };
 
 // =========================================================================
@@ -286,7 +288,7 @@ pub fn generate_dvrm_trace(
     operations: &[(DvrmOperation, bool)],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
     // Deduplicate: (n, d, signed) -> (mu_q, mu_r)
-    let mut op_map: FxHashMap<DvrmOperation, DvrmMultiplicities> = FxHashMap::default();
+    let mut op_map: HashMap<DvrmOperation, DvrmMultiplicities> = HashMap::new();
 
     for (op, wants_remainder) in operations {
         let entry = op_map.entry(op.clone()).or_default();
diff --git a/prover/src/tables/lt.rs b/prover/src/tables/lt.rs
index be0b60773..02ed029bd 100644
--- a/prover/src/tables/lt.rs
+++ b/prover/src/tables/lt.rs
@@ -33,9 +33,9 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}
 use stark::table::TableView;
 use stark::trace::TraceTable;
 
-use super::types::{
-    BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op,
-};
+use std::collections::HashMap;
+
+use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, SHIFT_16, VmTable, alu_op};
 
 // =========================================================================
 // Column indices for LT table
@@ -164,7 +164,7 @@ pub fn generate_lt_trace(
     operations: &[LtOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
     // Deduplicate operations: (lhs, rhs, signed) -> multiplicity
-    let mut op_map: FxHashMap<LtOperation, u64> = FxHashMap::default();
+    let mut op_map: HashMap<LtOperation, u64> = HashMap::new();
     for op in operations {
         *op_map.entry(op.clone()).or_insert(0) += 1;
     }
diff --git a/prover/src/tables/mul.rs b/prover/src/tables/mul.rs
index 197a0d334..33679211c 100644
--- a/prover/src/tables/mul.rs
+++ b/prover/src/tables/mul.rs
@@ -37,9 +37,11 @@ use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}
 use stark::table::TableView;
 use stark::trace::TraceTable;
 
+use std::collections::HashMap;
+
 use super::types::{
-    BusId, FE, FxHashMap, GoldilocksExtension, GoldilocksField, INV_2_32, INV_2_64, INV_2_96,
-    INV_2_128, NEG_INV_2_16, NEG_INV_2_32, NEG_INV_2_48, NEG_INV_2_64, NEG_INV_2_80, NEG_INV_2_96,
+    BusId, FE, GoldilocksExtension, GoldilocksField, INV_2_32, INV_2_64, INV_2_96, INV_2_128,
+    NEG_INV_2_16, NEG_INV_2_32, NEG_INV_2_48, NEG_INV_2_64, NEG_INV_2_80, NEG_INV_2_96,
     NEG_INV_2_112, NEG_INV_2_128, SHIFT_16, VmTable, alu_op,
 };
 
@@ -294,7 +296,7 @@ pub fn generate_mul_trace(
     operations: &[(MulOperation, bool)],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
     // Deduplicate: (lhs, lhs_signed, rhs, rhs_signed) -> (mu_lo, mu_hi)
-    let mut op_map: FxHashMap<MulOperation, MulMultiplicities> = FxHashMap::default();
+    let mut op_map: HashMap<MulOperation, MulMultiplicities> = HashMap::new();
 
     for (op, wants_hi) in operations {
         let entry = op_map.entry(op.clone()).or_default();
diff --git a/prover/src/tables/types.rs b/prover/src/tables/types.rs
index 2cd5db0f0..d6091d0fd 100644
--- a/prover/src/tables/types.rs
+++ b/prover/src/tables/types.rs
@@ -968,61 +968,6 @@ impl DecodeEntry {
     }
 }
 
-// =========================================================================
-// Fast hashing for op-dedup multiplicity maps
-// =========================================================================
-
-/// Fast non-cryptographic hash for the op-dedup hot path. Skipping SipHash is
-/// safe here: the maps are per-chunk (bounded ≤ `max_rows`), keyed by the
-/// prover's own trace, and collisions only cost probes, never soundness.
-#[derive(Default)]
-pub struct FxHasher(u64);
-
-impl FxHasher {
-    const SEED: u64 = 0x51_7c_c1_b7_27_22_0a_95;
-
-    #[inline]
-    fn add(&mut self, word: u64) {
-        self.0 = (self.0.rotate_left(5) ^ word).wrapping_mul(Self::SEED);
-    }
-}
-
-impl std::hash::Hasher for FxHasher {
-    #[inline]
-    fn write(&mut self, bytes: &[u8]) {
-        for &b in bytes {
-            self.add(b as u64);
-        }
-    }
-    #[inline]
-    fn write_u8(&mut self, i: u8) {
-        self.add(i as u64);
-    }
-    #[inline]
-    fn write_u16(&mut self, i: u16) {
-        self.add(i as u64);
-    }
-    #[inline]
-    fn write_u32(&mut self, i: u32) {
-        self.add(i as u64);
-    }
-    #[inline]
-    fn write_u64(&mut self, i: u64) {
-        self.add(i);
-    }
-    #[inline]
-    fn write_usize(&mut self, i: usize) {
-        self.add(i as u64);
-    }
-    #[inline]
-    fn finish(&self) -> u64 {
-        self.0
-    }
-}
-
-/// `HashMap` keyed with [`FxHasher`].
-pub type FxHashMap<K, V> = std::collections::HashMap<K, V, std::hash::BuildHasherDefault<FxHasher>>;
-
 /// The fully sign-extended 64-bit immediate for an instruction (0 when none).
 fn imm_from_instruction(instruction: Instruction) -> u64 {
     match instruction {

From d90958997bbe8459fb7df77eb2c01958db9296b2 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 29 Jun 2026 17:45:01 -0300
Subject: [PATCH 2/5] Remove redundant gpu_and_cpu_proofs_both_verify test

The GPU full path is covered by the normal prove/verify suite built with --features cuda (plus gpu_path_fires_end_to_end), the CPU path by the non-cuda suite, and GPU/CPU equivalence by the merkle/barycentric parity tests. Its force-CPU leg also never ran on CPU: gpu_lde_threshold() only re-read the env var under cfg(test), but from the prover integration crate stark compiles without cfg(test), so the OnceLock cached the first value. Simplify gpu_lde_threshold() to a single cached impl now that the per-call re-read has no consumer.
---
 crypto/stark/src/gpu_lde.rs           | 18 +++---------------
 prover/tests/cuda_path_integration.rs | 22 ----------------------
 2 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 60e8dacc6..29e9b94e6 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -41,25 +41,13 @@ use crate::trace::LDETraceTable;
 const DEFAULT_GPU_LDE_THRESHOLD: usize = 1 << 19;
 
 fn gpu_lde_threshold() -> usize {
-    // In test builds re-read the env var on every call so tests can switch
-    // between GPU and CPU paths in the same process (OnceLock can't be reset).
-    #[cfg(test)]
-    {
+    static CACHED: OnceLock<usize> = OnceLock::new();
+    *CACHED.get_or_init(|| {
         std::env::var("LAMBDA_VM_GPU_LDE_THRESHOLD")
             .ok()
             .and_then(|s| s.parse().ok())
             .unwrap_or(DEFAULT_GPU_LDE_THRESHOLD)
-    }
-    #[cfg(not(test))]
-    {
-        static CACHED: OnceLock<usize> = OnceLock::new();
-        *CACHED.get_or_init(|| {
-            std::env::var("LAMBDA_VM_GPU_LDE_THRESHOLD")
-                .ok()
-                .and_then(|s| s.parse().ok())
-                .unwrap_or(DEFAULT_GPU_LDE_THRESHOLD)
-        })
-    }
+    })
 }
 
 /// Incremented by the `try_expand_*` functions per base-field column handed to
diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index 6de671cf1..0f7c1f3c7 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -66,25 +66,3 @@ fn gpu_path_fires_end_to_end() {
     let ok = verify(&proof, &elf).expect("verify");
     assert!(ok, "GPU-produced proof failed verification");
 }
-
-#[test]
-#[ignore = "requires GPU; run with --ignored --nocapture"]
-fn gpu_and_cpu_proofs_both_verify() {
-    let elf = asm_elf_bytes("fib_iterative_1M");
-
-    let proof_gpu = prove(&elf).expect("GPU prove");
-    assert!(
-        verify(&proof_gpu, &elf).expect("GPU verify"),
-        "GPU proof failed"
-    );
-
-    // Force CPU path by pushing the GPU threshold above any real table size.
-    // SAFETY: no other thread reads this env var during the test.
-    unsafe { std::env::set_var("LAMBDA_VM_GPU_LDE_THRESHOLD", "999999999") };
-    let proof_cpu = prove(&elf).expect("CPU prove");
-    unsafe { std::env::remove_var("LAMBDA_VM_GPU_LDE_THRESHOLD") };
-    assert!(
-        verify(&proof_cpu, &elf).expect("CPU verify"),
-        "CPU proof failed"
-    );
-}

From 8c4ebf5bae52435ab73a18e3395e7eede1c8a438 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 29 Jun 2026 17:45:01 -0300
Subject: [PATCH 3/5] Fix stale docs and remove dead code

keccak.cu: move keccak256_leaves_base_row_major out of keccak_merkle_level's doc block so the child-pair->parent doc rejoins its kernel. prover.rs: delete columns_to_row_major, which has no callers after the row-major GPU path stopped materializing GPU-expanded columns.
---
 crypto/math-cuda/kernels/keccak.cu | 62 +++++++++++++++---------------
 crypto/stark/src/prover.rs         | 26 -------------
 2 files changed, 31 insertions(+), 57 deletions(-)

diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu
index 86c0e519e..9937d7c6e 100644
--- a/crypto/math-cuda/kernels/keccak.cu
+++ b/crypto/math-cuda/kernels/keccak.cu
@@ -317,6 +317,37 @@ extern "C" __global__ void keccak_fri_leaves_ext3(
 //     children: nodes[parent_begin + n_pairs .. parent_begin + 3 * n_pairs]
 //     parents:  nodes[parent_begin .. parent_begin + n_pairs]
 //
+// Each thread hashes one child pair → one parent. Keccak-256 of the
+// concatenation of two 32-byte siblings, identical to
+// `FieldElementVectorBackend::hash_new_parent` on host.
+// ---------------------------------------------------------------------------
+extern "C" __global__ void keccak_merkle_level(
+    uint8_t *nodes,
+    uint64_t parent_begin,     // node index (counted in 32-byte nodes)
+    uint64_t n_pairs) {
+    uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n_pairs) return;
+
+    uint64_t st[25];
+    #pragma unroll
+    for (int i = 0; i < 25; ++i) st[i] = 0;
+
+    uint32_t rate_pos = 0;
+    // `nodes` comes from cuMemAlloc (256-byte aligned); each 32-byte node
+    // sits at a 32-byte-aligned offset, so the u64 cast is safe.
+    const uint64_t *left = reinterpret_cast<const uint64_t *>(
+        nodes + (parent_begin + n_pairs + 2 * tid) * 32);
+    #pragma unroll
+    for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, left[i]);
+
+    const uint64_t *right = reinterpret_cast<const uint64_t *>(
+        nodes + (parent_begin + n_pairs + 2 * tid + 1) * 32);
+    #pragma unroll
+    for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, right[i]);
+
+    finalize_keccak256(st, rate_pos, nodes + (parent_begin + tid) * 32);
+}
+
 // ---------------------------------------------------------------------------
 // Row-major base leaf hashing.
 //
@@ -350,34 +381,3 @@ extern "C" __global__ void keccak256_leaves_base_row_major(
     }
     finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32);
 }
-
-// Each thread hashes one child pair → one parent. Keccak-256 of the
-// concatenation of two 32-byte siblings, identical to
-// `FieldElementVectorBackend::hash_new_parent` on host.
-// ---------------------------------------------------------------------------
-extern "C" __global__ void keccak_merkle_level(
-    uint8_t *nodes,
-    uint64_t parent_begin,     // node index (counted in 32-byte nodes)
-    uint64_t n_pairs) {
-    uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid >= n_pairs) return;
-
-    uint64_t st[25];
-    #pragma unroll
-    for (int i = 0; i < 25; ++i) st[i] = 0;
-
-    uint32_t rate_pos = 0;
-    // `nodes` comes from cuMemAlloc (256-byte aligned); each 32-byte node
-    // sits at a 32-byte-aligned offset, so the u64 cast is safe.
-    const uint64_t *left = reinterpret_cast<const uint64_t *>(
-        nodes + (parent_begin + n_pairs + 2 * tid) * 32);
-    #pragma unroll
-    for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, left[i]);
-
-    const uint64_t *right = reinterpret_cast<const uint64_t *>(
-        nodes + (parent_begin + n_pairs + 2 * tid + 1) * 32);
-    #pragma unroll
-    for (int i = 0; i < 4; ++i) absorb_lane(st, rate_pos, right[i]);
-
-    finalize_keccak256(st, rate_pos, nodes + (parent_begin + tid) * 32);
-}
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 9b716ff13..30554c15e 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -475,32 +475,6 @@ where
     }
 }
 
-/// Interleave column-major data into a flat row-major buffer + its column
-/// count. Used only by the cuda fast path to materialize the GPU-expanded
-/// columns in the row-major layout the table expects (CPU paths read the
-/// already-row-major trace directly, with no transpose).
-#[cfg(feature = "cuda")]
-fn columns_to_row_major<E: IsField>(
-    columns: &[Vec<FieldElement<E>>],
-) -> (Vec<FieldElement<E>>, usize) {
-    let num_cols = columns.len();
-    let n = if num_cols > 0 { columns[0].len() } else { 0 };
-    // All columns must be the same length; otherwise `col[row]` below indexes
-    // out of bounds. The producers (CPU/GPU LDE) always emit uniform columns —
-    // this guards against a future regression cheaply (debug builds only).
-    debug_assert!(
-        columns.iter().all(|c| c.len() == n),
-        "columns_to_row_major requires all columns to have equal length"
-    );
-    let mut data = Vec::with_capacity(n * num_cols);
-    for row in 0..n {
-        for col in columns {
-            data.push(col[row].clone());
-        }
-    }
-    (data, num_cols)
-}
-
 /// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one
 /// leaf per row, where each row is read at `reverse_index(row_idx)` and the
 /// columns are concatenated as big-endian bytes before hashing.

From 91383bf1a11d27889ccd260c9c6a1e7c34acbb1d Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 29 Jun 2026 17:45:01 -0300
Subject: [PATCH 4/5] Consolidate row-major LDE pipeline; guard keccak num_rows

Extract coset_lde_row_major_inner shared by the base and ext3 _keep entry points (they differed only by m vs m*3 and the handle type), removing ~110 lines of drift-prone duplication. Add debug_assert!(num_rows >= 2) to launch_keccak_base_row_major: the kernel shifts by (64 - log_num_rows), UB at num_rows==1, matching the guard in launch_keccak_base.
---
 crypto/math-cuda/src/lde.rs | 181 ++++++++++++++----------------------
 1 file changed, 69 insertions(+), 112 deletions(-)

diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index 8c319f764..164267684 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -318,6 +318,11 @@ fn launch_keccak_base_row_major(
     // threads/block, which exceeds the per-block register budget and fails the
     // launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES — silently dropping the whole
     // R1 GPU path to the CPU fallback (no device handle for rounds 2-4).
+    //
+    // The kernel derives the bit-reversed row as `__brevll(tid) >> (64 - log_num_rows)`;
+    // a 64-bit shift is UB, so reject `num_rows < 2` (`log_num_rows == 0`), matching
+    // the `debug_assert!` guard in `launch_keccak_base`.
+    debug_assert!(num_rows >= 2, "row-major keccak requires num_rows >= 2");
     let cfg = keccak_launch_cfg(num_rows);
     unsafe {
         stream
@@ -368,48 +373,53 @@ fn launch_row_to_col_major(
     Ok(dst)
 }
 
-/// Row-major LDE + Keccak + Merkle, all on-device.
+/// Shared row-major LDE + Keccak + Merkle pipeline for the base and ext3 paths.
+///
+/// `total_cols` is the number of base-field columns in the row-major layout:
+/// `m` for base, `m * 3` for ext3. Because `Fp3 = [u64; 3]`, the three ext3
+/// components are just three adjacent base-field columns, so the same row-major
+/// NTT and Keccak kernels process all of them simultaneously — no de-interleave.
 ///
-/// Input: `row_major` is a flat `n * m` slice in row-major order.
-/// Returns (merkle_nodes, GpuLdeBase handle, row-major LDE Vec).
 /// Single H2D, row-major NTT, single D2H — no CPU-side extract or transpose.
-/// The returned handle is column-major (as required by downstream GPU kernels):
-/// after D2H, `buf` is transposed on-device to column-major for the handle.
-pub fn coset_lde_row_major_with_merkle_tree_keep(
+/// Returns (merkle_nodes, column-major device buffer, row-major LDE Vec). The
+/// buffer is transposed to column-major (as required by the downstream GPU
+/// kernels DEEP/barycentric); callers wrap it in the appropriate LDE handle.
+fn coset_lde_row_major_inner(
     row_major: &[u64],
     n: usize,
-    m: usize,
+    total_cols: usize,
     blowup_factor: usize,
     weights: &[u64],
-) -> Result<(Vec<u8>, GpuLdeBase, Vec<u64>)> {
-    assert_eq!(row_major.len(), n * m);
+    what: &str,
+) -> Result<(Vec<u8>, CudaSlice<u64>, Vec<u64>)> {
+    assert_eq!(row_major.len(), n * total_cols);
     assert!(n.is_power_of_two());
     assert_eq!(weights.len(), n);
     assert!(blowup_factor.is_power_of_two());
     let lde_size = n * blowup_factor;
-    assert_u32_domain(lde_size, "coset_lde_row_major lde_size");
+    assert_u32_domain(lde_size, what);
 
     let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(lde_size);
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
     let n_u64 = n as u64;
     let lde_u64 = lde_size as u64;
-    let m_u64 = m as u64;
+    let cols_u64 = total_cols as u64;
 
     let be = backend()?;
     let stream = be.next_stream();
 
-    // H2D into a zeroed lde_size*m buffer; only the first n*m rows carry data,
-    // the remainder are already zero (zero-padding for LDE expansion).
-    let mut buf = stream.alloc_zeros::<u64>(lde_size * m)?;
-    stream.memcpy_htod(row_major, &mut buf.slice_mut(0..n * m))?;
+    // H2D into a zeroed lde_size*total_cols buffer; only the first n*total_cols
+    // rows carry data, the remainder are already zero (zero-padding for LDE).
+    let mut buf = stream.alloc_zeros::<u64>(lde_size * total_cols)?;
+    stream.memcpy_htod(row_major, &mut buf.slice_mut(0..n * total_cols))?;
 
     let inv_tw = be.inv_twiddles_for(log_n)?;
     let fwd_tw = be.fwd_twiddles_for(log_lde)?;
     let weights_dev = stream.clone_htod(weights)?;
 
     // iNTT: bit-reverse rows → per-level DIT.
-    launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, n_u64, log_n, m_u64)?;
+    launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, n_u64, log_n, cols_u64)?;
     run_row_major_ntt_body(
         stream.as_ref(),
         be,
@@ -417,14 +427,14 @@ pub fn coset_lde_row_major_with_merkle_tree_keep(
         inv_tw.as_ref(),
         n_u64,
         log_n,
-        m_u64,
+        cols_u64,
     )?;
 
-    // Coset weights: one weight per row, broadcast across all m columns.
-    launch_pointwise_mul_row_major(stream.as_ref(), be, &mut buf, &weights_dev, n_u64, m_u64)?;
+    // Coset weights: one weight per row, broadcast across all columns.
+    launch_pointwise_mul_row_major(stream.as_ref(), be, &mut buf, &weights_dev, n_u64, cols_u64)?;
 
     // Forward NTT at lde_size.
-    launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, lde_u64, log_lde, m_u64)?;
+    launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, lde_u64, log_lde, cols_u64)?;
     run_row_major_ntt_body(
         stream.as_ref(),
         be,
@@ -432,10 +442,10 @@ pub fn coset_lde_row_major_with_merkle_tree_keep(
         fwd_tw.as_ref(),
         lde_u64,
         log_lde,
-        m_u64,
+        cols_u64,
     )?;
 
-    // Keccak + Merkle on-device.
+    // Keccak + Merkle on-device. Each leaf reads `total_cols` consecutive u64s.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_bytes) }?;
     let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(lde_size);
     {
@@ -444,7 +454,7 @@ pub fn coset_lde_row_major_with_merkle_tree_keep(
             stream.as_ref(),
             be,
             &buf,
-            m_u64,
+            cols_u64,
             lde_u64,
             log_lde,
             &mut leaves_view,
@@ -457,11 +467,11 @@ pub fn coset_lde_row_major_with_merkle_tree_keep(
     let lde_out = {
         let staging_slot = be.pinned_staging();
         let mut staging = staging_slot.lock().unwrap();
-        staging.ensure_capacity(lde_size * m, &be.ctx)?;
-        let pinned = unsafe { staging.as_mut_slice(lde_size * m) };
+        staging.ensure_capacity(lde_size * total_cols, &be.ctx)?;
+        let pinned = unsafe { staging.as_mut_slice(lde_size * total_cols) };
         stream.memcpy_dtoh(&buf, pinned)?;
         stream.synchronize()?;
-        let out = pinned[..lde_size * m].to_vec();
+        let out = pinned[..lde_size * total_cols].to_vec();
         drop(staging);
         out
     };
@@ -471,16 +481,39 @@ pub fn coset_lde_row_major_with_merkle_tree_keep(
 
     // Transpose row-major buf → column-major for the handle. Downstream kernels
     // (DEEP, barycentric) expect buf[c * lde_size + r] (column-major).
-    let col_major_dev = launch_row_to_col_major(&stream, be, &buf, lde_size, m, lde_u64)?;
+    let col_major_dev = launch_row_to_col_major(&stream, be, &buf, lde_size, total_cols, lde_u64)?;
     // Synchronize before returning: the handle crosses stream boundaries — downstream
     // consumers call be.next_stream() and read handle.buf on a different stream.
     // Without this, a barycentric or DEEP kernel can start before the transpose finishes.
     stream.synchronize()?;
 
+    Ok((nodes_out, col_major_dev, lde_out))
+}
+
+/// Row-major LDE + Keccak + Merkle, all on-device.
+///
+/// Input: `row_major` is a flat `n * m` slice in row-major order.
+/// Returns (merkle_nodes, GpuLdeBase handle, row-major LDE Vec).
+/// The returned handle is column-major (as required by downstream GPU kernels).
+pub fn coset_lde_row_major_with_merkle_tree_keep(
+    row_major: &[u64],
+    n: usize,
+    m: usize,
+    blowup_factor: usize,
+    weights: &[u64],
+) -> Result<(Vec<u8>, GpuLdeBase, Vec<u64>)> {
+    let (nodes_out, col_major_dev, lde_out) = coset_lde_row_major_inner(
+        row_major,
+        n,
+        m,
+        blowup_factor,
+        weights,
+        "coset_lde_row_major lde_size",
+    )?;
     let handle = GpuLdeBase {
         buf: Arc::new(col_major_dev),
         m,
-        lde_size,
+        lde_size: n * blowup_factor,
     };
     Ok((nodes_out, handle, lde_out))
 }
@@ -502,94 +535,18 @@ pub fn coset_lde_ext3_row_major_with_merkle_tree_keep(
     blowup_factor: usize,
     weights: &[u64],
 ) -> Result<(Vec<u8>, GpuLdeExt3, Vec<u64>)> {
-    let m3 = m * 3;
-    assert_eq!(row_major.len(), n * m3);
-    assert!(n.is_power_of_two());
-    assert_eq!(weights.len(), n);
-    assert!(blowup_factor.is_power_of_two());
-    let lde_size = n * blowup_factor;
-    assert_u32_domain(lde_size, "coset_lde_ext3_row_major lde_size");
-
-    let nodes_bytes = KeccakCommit::FullTree.total_nodes_bytes(lde_size);
-    let log_n = n.trailing_zeros() as u64;
-    let log_lde = lde_size.trailing_zeros() as u64;
-    let n_u64 = n as u64;
-    let lde_u64 = lde_size as u64;
-    let m3_u64 = m3 as u64;
-
-    let be = backend()?;
-    let stream = be.next_stream();
-
-    let mut buf = stream.alloc_zeros::<u64>(lde_size * m3)?;
-    stream.memcpy_htod(row_major, &mut buf.slice_mut(0..n * m3))?;
-
-    let inv_tw = be.inv_twiddles_for(log_n)?;
-    let fwd_tw = be.fwd_twiddles_for(log_lde)?;
-    let weights_dev = stream.clone_htod(weights)?;
-
-    // iNTT + coset weights + forward NTT — same row-major kernels as base-field
-    // but with m3 = m*3 (all 3 components processed simultaneously).
-    launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, n_u64, log_n, m3_u64)?;
-    run_row_major_ntt_body(
-        stream.as_ref(),
-        be,
-        &mut buf,
-        inv_tw.as_ref(),
-        n_u64,
-        log_n,
-        m3_u64,
-    )?;
-    launch_pointwise_mul_row_major(stream.as_ref(), be, &mut buf, &weights_dev, n_u64, m3_u64)?;
-    launch_bit_reverse_row_major(stream.as_ref(), be, &mut buf, lde_u64, log_lde, m3_u64)?;
-    run_row_major_ntt_body(
-        stream.as_ref(),
-        be,
-        &mut buf,
-        fwd_tw.as_ref(),
-        lde_u64,
-        log_lde,
-        m3_u64,
+    let (nodes_out, col_major_dev, lde_out) = coset_lde_row_major_inner(
+        row_major,
+        n,
+        m * 3,
+        blowup_factor,
+        weights,
+        "coset_lde_ext3_row_major lde_size",
     )?;
-
-    // Keccak: same row-major kernel — each leaf reads m3 consecutive u64s (= m ext3 elements).
-    let mut nodes_dev = unsafe { stream.alloc::<u8>(nodes_bytes) }?;
-    let leaves_offset = KeccakCommit::FullTree.leaves_offset_bytes(lde_size);
-    {
-        let mut leaves_view = nodes_dev.slice_mut(leaves_offset..leaves_offset + lde_size * 32);
-        launch_keccak_base_row_major(
-            stream.as_ref(),
-            be,
-            &buf,
-            m3_u64,
-            lde_u64,
-            log_lde,
-            &mut leaves_view,
-        )?;
-    }
-    crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
-
-    let lde_out = {
-        let staging_slot = be.pinned_staging();
-        let mut staging = staging_slot.lock().unwrap();
-        staging.ensure_capacity(lde_size * m3, &be.ctx)?;
-        let pinned = unsafe { staging.as_mut_slice(lde_size * m3) };
-        stream.memcpy_dtoh(&buf, pinned)?;
-        stream.synchronize()?;
-        let out = pinned[..lde_size * m3].to_vec();
-        drop(staging);
-        out
-    };
-
-    let mut nodes_out = vec![0u8; nodes_bytes];
-    d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, &mut nodes_out)?;
-
-    let col_major_dev = launch_row_to_col_major(&stream, be, &buf, lde_size, m3, lde_u64)?;
-    stream.synchronize()?;
-
     let handle = GpuLdeExt3 {
         buf: Arc::new(col_major_dev),
         m,
-        lde_size,
+        lde_size: n * blowup_factor,
     };
     Ok((nodes_out, handle, lde_out))
 }

From b0a38f9ec19727fc63e5affdb908d13889c52542 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 29 Jun 2026 18:02:06 -0300
Subject: [PATCH 5/5] Fix stale R2 composition-LDE assertion in
 gpu_path_fires_end_to_end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The assert checked gpu_parts_lde_calls() > 0 with a comment claiming branch/shift tables are degree-3 — both false: fib_iterative_1M tables all have number_of_parts <= 2, and the common degree-2 case fires the fused two-halves path (gpu_extend_halves_calls), counted separately from the parts>2 path (gpu_parts_lde_calls) since #700. Assert on the sum so either composition-LDE path satisfies it. Validated on RTX 5090 / CUDA 13.1: make test-math-cuda 78/78, make test-cuda-integration green, proof verifies.
---
 prover/tests/cuda_path_integration.rs | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index 0f7c1f3c7..cf9bc742c 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -12,7 +12,8 @@ use lambda_vm_prover::test_utils::asm_elf_bytes;
 use lambda_vm_prover::{prove, verify};
 use stark::gpu_lde::{
     gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls,
-    gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls, reset_all_gpu_call_counters,
+    gpu_extend_halves_calls, gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls,
+    reset_all_gpu_call_counters,
 };
 
 #[test]
@@ -36,10 +37,15 @@ fn gpu_path_fires_end_to_end() {
     // path.
     assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire");
 
-    // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's
-    // `number_of_parts > 2`. The branch and shift tables have degree-3
-    // transition constraints, so this triggers on any non-trivial prove.
-    assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire");
+    // R2 GPU composition-poly LDE. Fires via one of two paths depending on the
+    // AIR's `number_of_parts`: the fused two-halves quotient decomposition for
+    // the common degree-2 case (`== 2`, counted by `gpu_extend_halves_calls`),
+    // or the batched parts LDE for `> 2` (counted by `gpu_parts_lde_calls`).
+    // fib_iterative_1M only exercises the degree-2 path, so assert on either.
+    assert!(
+        gpu_extend_halves_calls() + gpu_parts_lde_calls() > 0,
+        "R2 GPU composition LDE did not fire (neither two-halves d2 nor parts>2 path)"
+    );
 
     // R2 comp-poly Merkle tree build, paired with the parts LDE above.
     assert!(