diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index ee5dc3fce..e98a42691 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -1296,15 +1296,40 @@ pub fn coset_lde_batch_ext3_into(
weights: &[u64],
outputs: &mut [&mut [u64]],
) -> Result<()> {
+ coset_lde_batch_ext3_into_inner(columns, n, blowup_factor, weights, outputs, false).map(|_| ())
+}
+
+/// Same as [`coset_lde_batch_ext3_into`] but RETAINS the de-interleaved device
+/// LDE buffer as a [`GpuLdeExt3`] handle for downstream on-device reuse (e.g. R4
+/// DEEP), instead of freeing it. No Merkle tree is built. Returns `None` when
+/// the input is empty (`columns.is_empty()` or `n == 0`).
+pub fn coset_lde_batch_ext3_into_keep(
+ columns: &[&[u64]],
+ n: usize,
+ blowup_factor: usize,
+ weights: &[u64],
+ outputs: &mut [&mut [u64]],
+) -> Result