diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index ee5dc3fce..e98a42691 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -1296,15 +1296,40 @@ pub fn coset_lde_batch_ext3_into( weights: &[u64], outputs: &mut [&mut [u64]], ) -> Result<()> { + coset_lde_batch_ext3_into_inner(columns, n, blowup_factor, weights, outputs, false).map(|_| ()) +} + +/// Same as [`coset_lde_batch_ext3_into`] but RETAINS the de-interleaved device +/// LDE buffer as a [`GpuLdeExt3`] handle for downstream on-device reuse (e.g. R4 +/// DEEP), instead of freeing it. No Merkle tree is built. Returns `None` when +/// the input is empty (`columns.is_empty()` or `n == 0`). +pub fn coset_lde_batch_ext3_into_keep( + columns: &[&[u64]], + n: usize, + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], +) -> Result> { + coset_lde_batch_ext3_into_inner(columns, n, blowup_factor, weights, outputs, true) +} + +fn coset_lde_batch_ext3_into_inner( + columns: &[&[u64]], + n: usize, + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], + keep_device_buf: bool, +) -> Result> { if columns.is_empty() { - return Ok(()); + return Ok(None); } let m = columns.len(); assert_eq!(outputs.len(), m, "outputs must match columns count"); // Empty domain must short-circuit before the power-of-two assert // (is_power_of_two returns false for 0). if n == 0 { - return Ok(()); + return Ok(None); } assert!(n.is_power_of_two(), "n must be a power of two"); assert_eq!(weights.len(), n, "weights length must match n"); @@ -1408,7 +1433,16 @@ pub fn coset_lde_batch_ext3_into( // ext3-per-element layout. 
unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size); drop(staging); - Ok(()) + if keep_device_buf { + Ok(Some(GpuLdeExt3 { + buf: std::sync::Arc::new(buf), + m, + lde_size, + })) + } else { + drop(buf); + Ok(None) + } } /// Run the DIT butterfly body of a bit-reversed-input NTT over `m` batched diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 36756b40b..ff986e795 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -349,101 +349,101 @@ where Some(()) } -/// GPU path for `Prover::extend_half_to_lde`. +/// GPU fused `_keep` path for `Prover::decompose_and_extend_d2` (R2 quotient +/// decomposition). /// -/// Inside `decompose_and_extend_d2` (R2 quotient decomposition) the prover -/// does `rayon::join` of two calls: `iFFT(N on g²-coset) → FFT(2N on g-coset)` -/// over ext3 halves H0 and H1. They share the same domain/offset and sizes, -/// so we batch them into a single GPU call with M=2 ext3 columns. +/// Extends both ext3 halves H0/H1 from `N` evals on the g²-coset to `2N` on the +/// g-coset and retains the de-interleaved device buffer as a `GpuLdeExt3` handle, +/// all in one GPU call. No Merkle tree is built by this call. +/// The kept handle lets R4 DEEP read the composition LDE +/// straight from device memory (no `2 * 3 * lde_size * 8` byte re-H2D); the +/// composition commitment tree is still built separately by +/// `try_build_comp_poly_tree_gpu`. /// -/// Weights = `[1/N, g^(-1)/N, g^(-2)/N, …, g^(-(N-1))/N]`. This bakes the -/// `(g²)^(-k)` input-coset-undo from `interpolate_offset_fft` together with -/// the `g^k` forward-coset-shift from `evaluate_polynomial_on_lde_domain` — -/// net is `g^(-k)` — plus the `1/N` iFFT normalisation. +/// `columns` must be `[H0, H1]`, each `N` ext3 evals on the g²-coset; on success +/// they are expanded in place to `lde_size`. 
Weights are built here as +/// `[1/N, g^(-1)/N, …, g^(-(N-1))/N]` (the same construction the prior D2H +/// `try_extend_two_halves_gpu` used, hence known-correct on CUDA): this bakes the +/// `(g²)^(-k)` input-coset-undo from `interpolate_offset_fft`, the `g^k` forward +/// coset-shift from `evaluate_polynomial_on_lde_domain` (net `g^(-k)`), and the +/// `1/N` iFFT normalisation. They are NOT reused from `CompositionLdeTwiddles` +/// because the CPU `coset_lde_full` and the GPU `coset_lde_batch` need not share +/// a weight-application convention. /// -/// Returns `None` when the GPU path doesn't apply (too small, or CPU path -/// should be used); in that case the caller runs its existing rayon::join. -#[allow(clippy::type_complexity)] -pub(crate) fn try_extend_two_halves_gpu( - h0: &[FieldElement], - h1: &[FieldElement], +/// Returns `None` when the GPU path doesn't apply (too small / non-ext3 / non- +/// Goldilocks — all checked by `check_ext3_layout` inside the delegate, which +/// also restores `columns` to their original `N`-length contents on any failure); +/// the caller then runs the CPU `rayon::join`. +pub(crate) fn try_extend_two_halves_gpu_keep( + columns: &mut [Vec>], domain: &Domain, -) -> Option<(Vec>, Vec>)> +) -> Option where F: IsFFTField + IsField + 'static, E: IsField + 'static, F: IsSubFieldOf, { - if h0.len() != h1.len() { - return None; - } - let n = h0.len(); - let blowup = 2; // extend_half_to_lde extends N → 2N always - let lde_size = n * blowup; - if lde_size < gpu_lde_threshold() { - return None; - } - if TypeId::of::() != TypeId::of::() { - return None; - } - if TypeId::of::() != TypeId::of::() { - return None; - } - GPU_EXTEND_HALVES_CALLS.fetch_add(1, Ordering::Relaxed); - // Weights are built from `g = domain.coset_offset` directly: the - // CPU caller previously passed `g²` redundantly. See the - // `g^(-k) / N` weight loop below. - - // Flatten ext3 slices to raw 3*n u64 buffers. 
- let to_u64 = |col: &[FieldElement]| -> Vec { - let len = col.len() * 3; - let ptr = col.as_ptr() as *const u64; - unsafe { from_raw_parts(ptr, len) }.to_vec() + // Self-guards size / threshold / Goldilocks-ext3 / equal-length. blowup = 2 + // (extend_half_to_lde always extends N -> 2N). + let (n, lde_size) = match check_ext3_layout::(columns, 2) { + LayoutDispatch::Empty | LayoutDispatch::Skip => return None, + LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; - let h0_raw = to_u64(h0); - let h1_raw = to_u64(h1); - // weights[k] = g^(-k) / N as a u64. + // weights[k] = g^(-k) / N (g = domain.coset_offset): bakes the (g^2)^(-k) + // input-coset-undo, the g^k output-coset-shift (net g^(-k)) and the 1/N iFFT + // normalisation. Identical to CompositionLdeTwiddles::weights and to the + // prior D2H path (known-correct on CUDA). let inv_n = FieldElement::::from(n as u64).inv().expect("N nonzero"); - let g = &domain.coset_offset; - let g_inv = g.inv().expect("g nonzero"); - let mut weights_u64 = Vec::with_capacity(n); - let mut w = inv_n.clone(); + let g_inv = domain.coset_offset.inv().expect("g nonzero"); + let mut weights = Vec::with_capacity(n); + let mut w = inv_n; for _ in 0..n { - // F == GoldilocksField by TypeId check above, so value is u64. - let v: u64 = unsafe { *(w.value() as *const _ as *const u64) }; - weights_u64.push(v); + weights.push(w.clone()); w *= &g_inv; } - // Pre-allocate outputs. - let mut lde_h0 = vec![FieldElement::::zero(); lde_size]; - let mut lde_h1 = vec![FieldElement::::zero(); lde_size]; + // SAFETY: layout checked above. `columns_to_u64_ext3` copies the N input + // values out before we presize, so input and output don't alias. + let raw_columns = unsafe { columns_to_u64_ext3::(columns) }; + let weights_u64 = unsafe { weights_to_u64::(&weights) }; + let slices: Vec<&[u64]> = raw_columns.iter().map(|c| c.as_slice()).collect(); - // Two ext3 columns (h0 + h1), each composed of 3 base-field components. 
- const NUM_COLS: usize = 2; - GPU_LDE_CALLS.fetch_add((NUM_COLS * 3) as u64, Ordering::Relaxed); - { - let inputs: [&[u64]; 2] = [&h0_raw, &h1_raw]; - // View each output Vec> as &mut [u64] of length 3*lde_size. - let out0_ptr = lde_h0.as_mut_ptr() as *mut u64; - let out1_ptr = lde_h1.as_mut_ptr() as *mut u64; - // SAFETY: ext3 FieldElement is [u64; 3] in memory, and the Vec has len - // = lde_size so the backing is 3*lde_size u64s. - let ext3_len = lde_size - .checked_mul(3) - .expect("ext3 output length overflow"); - let out0_slice = unsafe { from_raw_parts_mut(out0_ptr, ext3_len) }; - let out1_slice = unsafe { from_raw_parts_mut(out1_ptr, ext3_len) }; - let mut outputs: [&mut [u64]; 2] = [out0_slice, out1_slice]; - if math_cuda::lde::coset_lde_batch_ext3_into(&inputs, n, blowup, &weights_u64, &mut outputs) - .is_err() - { - return None; + // `presize_and_view_ext3` does `set_len(lde_size)` in place, so each column + // needs `capacity >= lde_size`. H0/H1 come from `map_unzip` with capacity N. + for col in columns.iter_mut() { + if col.capacity() < lde_size { + col.reserve_exact(lde_size - col.len()); } } - Some((lde_h0, lde_h1)) + GPU_LDE_CALLS.fetch_add((columns.len() * 3) as u64, Ordering::Relaxed); + + // No-tree keep variant: writes the LDE into `columns` (presized in place) and + // retains the de-interleaved device buffer as a `GpuLdeExt3` handle for R4 + // DEEP. The composition Merkle tree is built separately by + // `try_build_comp_poly_tree_gpu` (the keep pipeline's on-device tree is in the + // wrong order for the composition's commitment). 
+ let handle = { + let mut raw_outputs = unsafe { presize_and_view_ext3::(columns, lde_size) }; + math_cuda::lde::coset_lde_batch_ext3_into_keep( + &slices, + n, + 2, + &weights_u64, + &mut raw_outputs, + ) + }; + match handle { + Ok(Some(h)) => { + GPU_EXTEND_HALVES_CALLS.fetch_add(1, Ordering::Relaxed); + Some(h) + } + Ok(None) | Err(_) => { + restore_columns_on_err(columns, n); + None + } + } } pub(crate) static GPU_LEAF_HASH_CALLS: AtomicU64 = AtomicU64::new(0); diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index eed0e512a..76a8aaa9d 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -421,15 +421,29 @@ where pub(crate) composition_poly_merkle_tree: BatchedMerkleTree, /// The commitment to the composition polynomial parts. pub(crate) composition_poly_root: Commitment, - /// Device-resident de-interleaved LDE handle from the R2 fused GPU path - /// (`try_evaluate_parts_on_lde_gpu_keep`). When present, R4 DEEP skips - /// the `num_parts * 3 * lde_size * 8` byte H2D and reads parts on - /// device. `None` when the GPU R2 path didn't run (number_of_parts <= 2, - /// below threshold, or any CPU fallback). + /// Device-resident de-interleaved LDE handle from an R2 fused GPU `_keep` + /// path: the 2-part `try_extend_two_halves_gpu_keep` (the common case, after + /// the degree-2 quotient decomposition) or the >2-part + /// `try_evaluate_parts_on_lde_gpu_keep`. When present, R4 DEEP skips the + /// `num_parts * 3 * lde_size * 8` byte H2D and reads parts on device. `None` + /// when the GPU R2 path didn't run (single-part AIR, below threshold, or any + /// CPU fallback). #[cfg(feature = "cuda")] pub(crate) gpu_composition_parts: Option, } +/// Output of [`Prover::decompose_and_extend_d2`]: the two composition-poly part +/// LDE evaluation vectors, plus (under `cuda`) the retained `GpuLdeExt3` device +/// handle when the GPU no-tree keep path ran (consumed by R4 DEEP to avoid the +/// composition-LDE re-H2D). 
`gpu_keep` is `None` on the CPU fallback. The +/// composition Merkle tree is built separately (the keep pipeline's on-device +/// tree is in the wrong order for the composition commitment). +pub(crate) struct D2Result { + pub(crate) evals: Vec>>, + #[cfg(feature = "cuda")] + pub(crate) gpu_keep: Option, +} + /// A container for the results of the third round of the STARK Prove protocol. pub(crate) struct Round3 { /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge. @@ -1178,7 +1192,7 @@ pub trait IsStarkProver< constraint_evaluations: &[FieldElement], domain: &Domain, twiddles: &LdeTwiddles, - ) -> Vec>> + ) -> D2Result where FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, @@ -1211,21 +1225,48 @@ pub trait IsStarkProver< // Step 3: Extend each part from n evals on the g²-coset to 2n evals on the // g-coset (the full LDE domain). - // GPU fast path: batch both halves into one ext3 LDE call. Requires - // `cuda` feature and a qualifying size. Falls through to CPU when not. + // GPU fast path (no-tree `_keep`): one on-device call does the LDE of BOTH + // halves and retains the de-interleaved device buffer as a `GpuLdeExt3` + // handle, which feeds R4 DEEP and eliminates the composition-LDE re-H2D. + // (The composition Merkle tree is built separately below — the keep + // pipeline's on-device tree is in the wrong order for the composition.) + // Falls through to the CPU `rayon::join` when the GPU path doesn't apply + // (`try_extend_two_halves_gpu_keep` restores `cols` to [H0, H1] on `None`). 
#[cfg(feature = "cuda")] - if let Some((lde_h0, lde_h1)) = - crate::gpu_lde::try_extend_two_halves_gpu(&h0_evals, &h1_evals, domain) { - return vec![lde_h0, lde_h1]; + let mut cols = vec![h0_evals, h1_evals]; + if let Some(handle) = crate::gpu_lde::try_extend_two_halves_gpu_keep::< + Field, + FieldExtension, + >(&mut cols, domain) + { + return D2Result { + evals: cols, + gpu_keep: Some(handle), + }; + } + let composition_twiddles = twiddles.composition(domain); + let (lde_h0, lde_h1) = crate::par::join( + || Self::extend_half_to_lde(&cols[0], composition_twiddles), + || Self::extend_half_to_lde(&cols[1], composition_twiddles), + ); + return D2Result { + evals: vec![lde_h0, lde_h1], + gpu_keep: None, + }; } - let composition_twiddles = twiddles.composition(domain); - let (lde_h0, lde_h1) = crate::par::join( - || Self::extend_half_to_lde(&h0_evals, composition_twiddles), - || Self::extend_half_to_lde(&h1_evals, composition_twiddles), - ); - vec![lde_h0, lde_h1] + #[cfg(not(feature = "cuda"))] + { + let composition_twiddles = twiddles.composition(domain); + let (lde_h0, lde_h1) = crate::par::join( + || Self::extend_half_to_lde(&h0_evals, composition_twiddles), + || Self::extend_half_to_lde(&h1_evals, composition_twiddles), + ); + D2Result { + evals: vec![lde_h0, lde_h1], + } + } } /// Extend `half_evals` — `n = lde_size/2` evaluations of a degree-` 0, "R3 GPU barycentric did not fire"); - // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's - // `number_of_parts > 2`. The branch and shift tables have degree-3 - // transition constraints, so this triggers on any non-trivial prove. - assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire"); - - // R2 comp-poly Merkle tree build, paired with the parts LDE above. + // R2 fused composition LDE + no-tree keep. 
After #699/#700 every VM AIR's + // composition poly has `number_of_parts == 2`, so the degree-2 quotient + // decomposition routes through `try_extend_two_halves_gpu_keep`: one call + // does the LDE of both halves and retains the device handle for R4 DEEP (the + // composition Merkle tree is built separately). A silent fallback to the CPU + // `extend_half_to_lde` would drop this to zero. (Replaces the + // `gpu_parts_lde_calls` / `gpu_comp_poly_tree_calls` >2-part assertions, + // dead since no AIR has number_of_parts > 2.) assert!( - gpu_comp_poly_tree_calls() > 0, - "R2 GPU comp-poly tree did not fire" + gpu_extend_halves_calls() > 0, + "R2 fused composition LDE keep path did not fire" ); // DEEP fires once per table that took the R1 GPU path.