diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index ee5dc3fce..e98a42691 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -1296,15 +1296,40 @@ pub fn coset_lde_batch_ext3_into(
     weights: &[u64],
     outputs: &mut [&mut [u64]],
 ) -> Result<()> {
+    coset_lde_batch_ext3_into_inner(columns, n, blowup_factor, weights, outputs, false).map(|_| ())
+}
+
+/// Same as [`coset_lde_batch_ext3_into`] but RETAINS the de-interleaved device
+/// LDE buffer as a [`GpuLdeExt3`] handle for downstream on-device reuse (e.g. R4
+/// DEEP), instead of freeing it. No Merkle tree is built. Returns `None` when
+/// the input is empty (`columns.is_empty()` or `n == 0`).
+pub fn coset_lde_batch_ext3_into_keep(
+    columns: &[&[u64]],
+    n: usize,
+    blowup_factor: usize,
+    weights: &[u64],
+    outputs: &mut [&mut [u64]],
+) -> Result<Option<GpuLdeExt3>> {
+    coset_lde_batch_ext3_into_inner(columns, n, blowup_factor, weights, outputs, true)
+}
+
+fn coset_lde_batch_ext3_into_inner(
+    columns: &[&[u64]],
+    n: usize,
+    blowup_factor: usize,
+    weights: &[u64],
+    outputs: &mut [&mut [u64]],
+    keep_device_buf: bool,
+) -> Result<Option<GpuLdeExt3>> {
     if columns.is_empty() {
-        return Ok(());
+        return Ok(None);
     }
     let m = columns.len();
     assert_eq!(outputs.len(), m, "outputs must match columns count");
     // Empty domain must short-circuit before the power-of-two assert
     // (is_power_of_two returns false for 0).
     if n == 0 {
-        return Ok(());
+        return Ok(None);
     }
     assert!(n.is_power_of_two(), "n must be a power of two");
     assert_eq!(weights.len(), n, "weights length must match n");
@@ -1408,7 +1433,16 @@ pub fn coset_lde_batch_ext3_into(
     // ext3-per-element layout.
     unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size);
     drop(staging);
-    Ok(())
+    if keep_device_buf {
+        Ok(Some(GpuLdeExt3 {
+            buf: std::sync::Arc::new(buf),
+            m,
+            lde_size,
+        }))
+    } else {
+        drop(buf);
+        Ok(None)
+    }
 }
 
 /// Run the DIT butterfly body of a bit-reversed-input NTT over `m` batched
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 36756b40b..ff986e795 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -349,101 +349,101 @@ where
     Some(())
 }
 
-/// GPU path for `Prover::extend_half_to_lde`.
+/// GPU fused `_keep` path for `Prover::decompose_and_extend_d2` (R2 quotient
+/// decomposition).
 ///
-/// Inside `decompose_and_extend_d2` (R2 quotient decomposition) the prover
-/// does `rayon::join` of two calls: `iFFT(N on g²-coset) → FFT(2N on g-coset)`
-/// over ext3 halves H0 and H1. They share the same domain/offset and sizes,
-/// so we batch them into a single GPU call with M=2 ext3 columns.
+/// Extends both ext3 halves H0/H1 from `N` evals on the g²-coset to `2N` on the
+/// g-coset, writes the result back into `columns`, AND retains the
+/// de-interleaved device buffer as a `GpuLdeExt3` handle — all in one GPU call.
+/// The kept handle lets R4 DEEP read the composition LDE straight from device
+/// memory (no `2 * 3 * lde_size * 8` byte re-H2D). No Merkle tree is built
+/// here: the composition commitment is still produced separately by
+/// `try_build_comp_poly_tree_gpu`.
 ///
-/// Weights = `[1/N, g^(-1)/N, g^(-2)/N, …, g^(-(N-1))/N]`. This bakes the
-/// `(g²)^(-k)` input-coset-undo from `interpolate_offset_fft` together with
-/// the `g^k` forward-coset-shift from `evaluate_polynomial_on_lde_domain` —
-/// net is `g^(-k)` — plus the `1/N` iFFT normalisation.
+/// `columns` must be `[H0, H1]`, each `N` ext3 evals on the g²-coset; on success
+/// they are expanded in place to `lde_size`. Weights are built here as
+/// `[1/N, g^(-1)/N, …, g^(-(N-1))/N]` (the same construction the prior D2H
+/// `try_extend_two_halves_gpu` used, hence known-correct on CUDA): this bakes the
+/// `(g²)^(-k)` input-coset-undo from `interpolate_offset_fft`, the `g^k` forward
+/// coset-shift from `evaluate_polynomial_on_lde_domain` (net `g^(-k)`), and the
+/// `1/N` iFFT normalisation. They are NOT reused from `CompositionLdeTwiddles`
+/// because the CPU `coset_lde_full` and the GPU `coset_lde_batch` need not share
+/// a weight-application convention.
 ///
-/// Returns `None` when the GPU path doesn't apply (too small, or CPU path
-/// should be used); in that case the caller runs its existing rayon::join.
-#[allow(clippy::type_complexity)]
-pub(crate) fn try_extend_two_halves_gpu<F, E>(
-    h0: &[FieldElement<E>],
-    h1: &[FieldElement<E>],
+/// Returns `None` when the GPU path doesn't apply (too small / non-ext3 / non-
+/// Goldilocks — all checked up front by `check_ext3_layout`) or when the GPU
+/// call fails; in the failure case `columns` is restored to its original
+/// `N`-length contents. The caller then runs the CPU `rayon::join`.
+pub(crate) fn try_extend_two_halves_gpu_keep<F, E>(
+    columns: &mut [Vec<FieldElement<E>>],
     domain: &Domain<F>,
-) -> Option<(Vec<FieldElement<E>>, Vec<FieldElement<E>>)>
+) -> Option<math_cuda::lde::GpuLdeExt3>
 where
     F: IsFFTField + IsField + 'static,
     E: IsField + 'static,
     F: IsSubFieldOf<E>,
 {
-    if h0.len() != h1.len() {
-        return None;
-    }
-    let n = h0.len();
-    let blowup = 2; // extend_half_to_lde extends N → 2N always
-    let lde_size = n * blowup;
-    if lde_size < gpu_lde_threshold() {
-        return None;
-    }
-    if TypeId::of::<E>() != TypeId::of::<Degree3GoldilocksExtensionField>() {
-        return None;
-    }
-    if TypeId::of::<F>() != TypeId::of::<GoldilocksField>() {
-        return None;
-    }
-    GPU_EXTEND_HALVES_CALLS.fetch_add(1, Ordering::Relaxed);
-    // Weights are built from `g = domain.coset_offset` directly: the
-    // CPU caller previously passed `g²` redundantly. See the
-    // `g^(-k) / N` weight loop below.
-
-    // Flatten ext3 slices to raw 3*n u64 buffers.
-    let to_u64 = |col: &[FieldElement<E>]| -> Vec<u64> {
-        let len = col.len() * 3;
-        let ptr = col.as_ptr() as *const u64;
-        unsafe { from_raw_parts(ptr, len) }.to_vec()
+    // Self-guards size / threshold / Goldilocks-ext3 / equal-length. blowup = 2
+    // (extend_half_to_lde always extends N -> 2N).
+    let (n, lde_size) = match check_ext3_layout::<F, E>(columns, 2) {
+        LayoutDispatch::Empty | LayoutDispatch::Skip => return None,
+        LayoutDispatch::Run { n, lde_size } => (n, lde_size),
     };
-    let h0_raw = to_u64(h0);
-    let h1_raw = to_u64(h1);
 
-    // weights[k] = g^(-k) / N as a u64.
+    // weights[k] = g^(-k) / N (g = domain.coset_offset): bakes the (g^2)^(-k)
+    // input-coset-undo, the g^k output-coset-shift (net g^(-k)) and the 1/N iFFT
+    // normalisation. Same construction as the prior D2H path (known-correct on
+    // CUDA); deliberately built here, not reused from CompositionLdeTwiddles.
     let inv_n = FieldElement::<F>::from(n as u64).inv().expect("N nonzero");
-    let g = &domain.coset_offset;
-    let g_inv = g.inv().expect("g nonzero");
-    let mut weights_u64 = Vec::with_capacity(n);
-    let mut w = inv_n.clone();
+    let g_inv = domain.coset_offset.inv().expect("g nonzero");
+    let mut weights = Vec::with_capacity(n);
+    let mut w = inv_n;
     for _ in 0..n {
-        // F == GoldilocksField by TypeId check above, so value is u64.
-        let v: u64 = unsafe { *(w.value() as *const _ as *const u64) };
-        weights_u64.push(v);
+        weights.push(w.clone());
         w *= &g_inv;
     }
 
-    // Pre-allocate outputs.
-    let mut lde_h0 = vec![FieldElement::<E>::zero(); lde_size];
-    let mut lde_h1 = vec![FieldElement::<E>::zero(); lde_size];
+    // SAFETY: layout checked above. `columns_to_u64_ext3` copies the N input
+    // values out before we presize, so input and output don't alias.
+    let raw_columns = unsafe { columns_to_u64_ext3::<E>(columns) };
+    let weights_u64 = unsafe { weights_to_u64::<F>(&weights) };
+    let slices: Vec<&[u64]> = raw_columns.iter().map(|c| c.as_slice()).collect();
 
-    // Two ext3 columns (h0 + h1), each composed of 3 base-field components.
-    const NUM_COLS: usize = 2;
-    GPU_LDE_CALLS.fetch_add((NUM_COLS * 3) as u64, Ordering::Relaxed);
-    {
-        let inputs: [&[u64]; 2] = [&h0_raw, &h1_raw];
-        // View each output Vec<FieldElement<E>> as &mut [u64] of length 3*lde_size.
-        let out0_ptr = lde_h0.as_mut_ptr() as *mut u64;
-        let out1_ptr = lde_h1.as_mut_ptr() as *mut u64;
-        // SAFETY: ext3 FieldElement is [u64; 3] in memory, and the Vec has len
-        // = lde_size so the backing is 3*lde_size u64s.
-        let ext3_len = lde_size
-            .checked_mul(3)
-            .expect("ext3 output length overflow");
-        let out0_slice = unsafe { from_raw_parts_mut(out0_ptr, ext3_len) };
-        let out1_slice = unsafe { from_raw_parts_mut(out1_ptr, ext3_len) };
-        let mut outputs: [&mut [u64]; 2] = [out0_slice, out1_slice];
-        if math_cuda::lde::coset_lde_batch_ext3_into(&inputs, n, blowup, &weights_u64, &mut outputs)
-            .is_err()
-        {
-            return None;
+    // `presize_and_view_ext3` does `set_len(lde_size)` in place, so each column
+    // needs `capacity >= lde_size`. H0/H1 come from `map_unzip` with capacity N.
+    for col in columns.iter_mut() {
+        if col.capacity() < lde_size {
+            col.reserve_exact(lde_size - col.len());
         }
     }
 
-    Some((lde_h0, lde_h1))
+    GPU_LDE_CALLS.fetch_add((columns.len() * 3) as u64, Ordering::Relaxed);
+
+    // No-tree keep variant: writes the LDE into `columns` (presized in place) and
+    // retains the de-interleaved device buffer as a `GpuLdeExt3` handle for R4
+    // DEEP. The composition Merkle tree is built separately by
+    // `try_build_comp_poly_tree_gpu` (the keep pipeline's on-device tree is in the
+    // wrong order for the composition's commitment).
+    let handle = {
+        let mut raw_outputs = unsafe { presize_and_view_ext3::<E>(columns, lde_size) };
+        math_cuda::lde::coset_lde_batch_ext3_into_keep(
+            &slices,
+            n,
+            2,
+            &weights_u64,
+            &mut raw_outputs,
+        )
+    };
+    match handle {
+        Ok(Some(h)) => {
+            GPU_EXTEND_HALVES_CALLS.fetch_add(1, Ordering::Relaxed);
+            Some(h)
+        }
+        Ok(None) | Err(_) => {
+            restore_columns_on_err(columns, n);
+            None
+        }
+    }
 }
 
 pub(crate) static GPU_LEAF_HASH_CALLS: AtomicU64 = AtomicU64::new(0);
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index eed0e512a..76a8aaa9d 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -421,15 +421,29 @@ where
     pub(crate) composition_poly_merkle_tree: BatchedMerkleTree<F>,
     /// The commitment to the composition polynomial parts.
     pub(crate) composition_poly_root: Commitment,
-    /// Device-resident de-interleaved LDE handle from the R2 fused GPU path
-    /// (`try_evaluate_parts_on_lde_gpu_keep`). When present, R4 DEEP skips
-    /// the `num_parts * 3 * lde_size * 8` byte H2D and reads parts on
-    /// device. `None` when the GPU R2 path didn't run (number_of_parts <= 2,
-    /// below threshold, or any CPU fallback).
+    /// Device-resident de-interleaved LDE handle from an R2 fused GPU `_keep`
+    /// path: the 2-part `try_extend_two_halves_gpu_keep` (the common case, after
+    /// the degree-2 quotient decomposition) or the >2-part
+    /// `try_evaluate_parts_on_lde_gpu_keep`. When present, R4 DEEP skips the
+    /// `num_parts * 3 * lde_size * 8` byte H2D and reads parts on device. `None`
+    /// when the GPU R2 path didn't run (single-part AIR, below threshold, or any
+    /// CPU fallback).
     #[cfg(feature = "cuda")]
     pub(crate) gpu_composition_parts: Option<math_cuda::lde::GpuLdeExt3>,
 }
 
+/// Output of [`Prover::decompose_and_extend_d2`]: the two composition-poly part
+/// LDE evaluation vectors, plus (under `cuda`) the retained `GpuLdeExt3` device
+/// handle when the GPU no-tree keep path ran (consumed by R4 DEEP to avoid the
+/// composition-LDE re-H2D). `gpu_keep` is `None` on the CPU fallback. The
+/// composition Merkle tree is built separately (the keep pipeline's on-device
+/// tree is in the wrong order for the composition commitment).
+pub(crate) struct D2Result<E: IsField> {
+    pub(crate) evals: Vec<Vec<FieldElement<E>>>,
+    #[cfg(feature = "cuda")]
+    pub(crate) gpu_keep: Option<math_cuda::lde::GpuLdeExt3>,
+}
+
 /// A container for the results of the third round of the STARK Prove protocol.
 pub(crate) struct Round3<F: IsField> {
     /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge.
@@ -1178,7 +1192,7 @@ pub trait IsStarkProver<
         constraint_evaluations: &[FieldElement<FieldExtension>],
         domain: &Domain<Field>,
         twiddles: &LdeTwiddles<Field>,
-    ) -> Vec<Vec<FieldElement<FieldExtension>>>
+    ) -> D2Result<FieldExtension>
     where
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
@@ -1211,21 +1225,48 @@ pub trait IsStarkProver<
         // Step 3: Extend each part from n evals on the g²-coset to 2n evals on the
         // g-coset (the full LDE domain).
 
-        // GPU fast path: batch both halves into one ext3 LDE call. Requires
-        // `cuda` feature and a qualifying size. Falls through to CPU when not.
+        // GPU fast path (no-tree `_keep`): one on-device call does the LDE of BOTH
+        // halves and retains the de-interleaved device buffer as a `GpuLdeExt3`
+        // handle, which feeds R4 DEEP and eliminates the composition-LDE re-H2D.
+        // (The composition Merkle tree is built separately below — the keep
+        // pipeline's on-device tree is in the wrong order for the composition.)
+        // Falls through to the CPU `rayon::join` when the GPU path doesn't apply
+        // (`try_extend_two_halves_gpu_keep` restores `cols` to [H0, H1] on `None`).
         #[cfg(feature = "cuda")]
-        if let Some((lde_h0, lde_h1)) =
-            crate::gpu_lde::try_extend_two_halves_gpu(&h0_evals, &h1_evals, domain)
         {
-            return vec![lde_h0, lde_h1];
+            let mut cols = vec![h0_evals, h1_evals];
+            if let Some(handle) = crate::gpu_lde::try_extend_two_halves_gpu_keep::<
+                Field,
+                FieldExtension,
+            >(&mut cols, domain)
+            {
+                return D2Result {
+                    evals: cols,
+                    gpu_keep: Some(handle),
+                };
+            }
+            let composition_twiddles = twiddles.composition(domain);
+            let (lde_h0, lde_h1) = crate::par::join(
+                || Self::extend_half_to_lde(&cols[0], composition_twiddles),
+                || Self::extend_half_to_lde(&cols[1], composition_twiddles),
+            );
+            return D2Result {
+                evals: vec![lde_h0, lde_h1],
+                gpu_keep: None,
+            };
         }
 
-        let composition_twiddles = twiddles.composition(domain);
-        let (lde_h0, lde_h1) = crate::par::join(
-            || Self::extend_half_to_lde(&h0_evals, composition_twiddles),
-            || Self::extend_half_to_lde(&h1_evals, composition_twiddles),
-        );
-        vec![lde_h0, lde_h1]
+        #[cfg(not(feature = "cuda"))]
+        {
+            let composition_twiddles = twiddles.composition(domain);
+            let (lde_h0, lde_h1) = crate::par::join(
+                || Self::extend_half_to_lde(&h0_evals, composition_twiddles),
+                || Self::extend_half_to_lde(&h1_evals, composition_twiddles),
+            );
+            D2Result {
+                evals: vec![lde_h0, lde_h1],
+            }
+        }
     }
 
     /// Extend `half_evals` — `n = lde_size/2` evaluations of a degree-`<n` polynomial
@@ -1302,7 +1343,14 @@ pub trait IsStarkProver<
             //   H₀(x²) = (H(x) + H(-x)) / 2
             //   H₁(x²) = (H(x) - H(-x)) / (2x)
             // On the LDE coset {g·ω^i}, we have -g·ω^i = g·ω^{i+N} since ω^N = -1.
-            Self::decompose_and_extend_d2(&constraint_evaluations, domain, twiddles)
+            let d2 = Self::decompose_and_extend_d2(&constraint_evaluations, domain, twiddles);
+            #[cfg(feature = "cuda")]
+            if let Some(handle) = d2.gpu_keep {
+                // Kept composition-LDE device buffer: R4 DEEP reads it on-device
+                // instead of re-H2D'ing the composition parts.
+                gpu_composition_parts = Some(handle);
+            }
+            d2.evals
         } else if number_of_parts == 1 {
             // Degree bound equals trace length: constraint evals are the LDE directly.
             vec![constraint_evaluations]
diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs
index 318dacb81..07036232c 100644
--- a/crypto/stark/src/tests/prover_tests.rs
+++ b/crypto/stark/src/tests/prover_tests.rs
@@ -274,7 +274,8 @@ fn test_decompose_and_extend_d2_matches_original() {
         &constraint_evaluations,
         &domain,
         &twiddles,
-    );
+    )
+    .evals;
     #[cfg(not(feature = "cuda"))]
     assert!(twiddles.has_composition_cache());
 
diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
index 0f7c1f3c7..be64d34ec 100644
--- a/prover/tests/cuda_path_integration.rs
+++ b/prover/tests/cuda_path_integration.rs
@@ -11,8 +11,8 @@
 use lambda_vm_prover::test_utils::asm_elf_bytes;
 use lambda_vm_prover::{prove, verify};
 use stark::gpu_lde::{
-    gpu_bary_calls, gpu_batch_invert_calls, gpu_comp_poly_tree_calls, gpu_deep_calls,
-    gpu_fri_calls, gpu_lde_calls, gpu_parts_lde_calls, reset_all_gpu_call_counters,
+    gpu_bary_calls, gpu_batch_invert_calls, gpu_deep_calls, gpu_extend_halves_calls, gpu_fri_calls,
+    gpu_lde_calls, reset_all_gpu_call_counters,
 };
 
 #[test]
@@ -36,15 +36,23 @@ fn gpu_path_fires_end_to_end() {
     // path.
     assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire");
 
-    // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's
-    // `number_of_parts > 2`. The branch and shift tables have degree-3
-    // transition constraints, so this triggers on any non-trivial prove.
-    assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire");
-
-    // R2 comp-poly Merkle tree build, paired with the parts LDE above.
+    // R2 fused composition LDE + no-tree keep.
+    //
+    // After #699/#700 every VM AIR's composition poly has `number_of_parts == 2`,
+    // so the degree-2 quotient decomposition routes through
+    // `try_extend_two_halves_gpu_keep`: one call does the LDE of both halves and
+    // retains the device handle for R4 DEEP. That call builds no Merkle tree; the
+    // composition tree is built separately.
+    //
+    // The counter is only bumped when the keep call hands back a device handle,
+    // so a silent fallback to the CPU `extend_half_to_lde` would drop it to zero
+    // and fail this assertion.
+    //
+    // (Replaces the `gpu_parts_lde_calls` / `gpu_comp_poly_tree_calls` >2-part
+    // assertions, dead since no AIR has number_of_parts > 2.)
     assert!(
-        gpu_comp_poly_tree_calls() > 0,
-        "R2 GPU comp-poly tree did not fire"
+        gpu_extend_halves_calls() > 0,
+        "R2 fused composition LDE keep path did not fire"
     );
 
     // DEEP fires once per table that took the R1 GPU path.