Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions crypto/math-cuda/src/lde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1296,15 +1296,40 @@ pub fn coset_lde_batch_ext3_into(
weights: &[u64],
outputs: &mut [&mut [u64]],
) -> Result<()> {
coset_lde_batch_ext3_into_inner(columns, n, blowup_factor, weights, outputs, false).map(|_| ())
}

/// Variant of [`coset_lde_batch_ext3_into`] that keeps the de-interleaved LDE
/// buffer on the device instead of dropping it once the host outputs are filled.
///
/// The retained buffer is handed back as a [`GpuLdeExt3`] handle so downstream
/// stages (e.g. R4 DEEP) can reuse it on-device. No Merkle tree is built.
///
/// # Returns
/// `Ok(None)` when there is nothing to extend (`columns.is_empty()` or
/// `n == 0`); otherwise `Ok(Some(handle))` on success.
///
/// # Panics
/// Panics if `outputs.len() != columns.len()`, if `n` is non-zero and not a
/// power of two, or if `weights.len() != n`.
pub fn coset_lde_batch_ext3_into_keep(
    columns: &[&[u64]],
    n: usize,
    blowup_factor: usize,
    weights: &[u64],
    outputs: &mut [&mut [u64]],
) -> Result<Option<GpuLdeExt3>> {
    // `true` asks the shared implementation to hand the device buffer back
    // rather than dropping it.
    let keep_device_buf = true;
    coset_lde_batch_ext3_into_inner(
        columns,
        n,
        blowup_factor,
        weights,
        outputs,
        keep_device_buf,
    )
}

fn coset_lde_batch_ext3_into_inner(
columns: &[&[u64]],
n: usize,
blowup_factor: usize,
weights: &[u64],
outputs: &mut [&mut [u64]],
keep_device_buf: bool,
) -> Result<Option<GpuLdeExt3>> {
if columns.is_empty() {
return Ok(());
return Ok(None);
}
let m = columns.len();
assert_eq!(outputs.len(), m, "outputs must match columns count");
// Empty domain must short-circuit before the power-of-two assert
// (is_power_of_two returns false for 0).
if n == 0 {
return Ok(());
return Ok(None);
}
assert!(n.is_power_of_two(), "n must be a power of two");
assert_eq!(weights.len(), n, "weights length must match n");
Expand Down Expand Up @@ -1408,7 +1433,16 @@ pub fn coset_lde_batch_ext3_into(
// ext3-per-element layout.
unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size);
drop(staging);
Ok(())
if keep_device_buf {
Ok(Some(GpuLdeExt3 {
buf: std::sync::Arc::new(buf),
m,
lde_size,
}))
} else {
drop(buf);
Ok(None)
}
}

/// Run the DIT butterfly body of a bit-reversed-input NTT over `m` batched
Expand Down
150 changes: 75 additions & 75 deletions crypto/stark/src/gpu_lde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -349,101 +349,101 @@ where
Some(())
}

/// GPU path for `Prover::extend_half_to_lde`.
/// GPU fused `_keep` path for `Prover::decompose_and_extend_d2` (R2 quotient
/// decomposition).
///
/// Inside `decompose_and_extend_d2` (R2 quotient decomposition) the prover
/// does `rayon::join` of two calls: `iFFT(N on g²-coset) → FFT(2N on g-coset)`
/// over ext3 halves H0 and H1. They share the same domain/offset and sizes,
/// so we batch them into a single GPU call with M=2 ext3 columns.
/// Extends both ext3 halves H0/H1 from `N` evals on the g²-coset to `2N` on the
/// g-coset and retains the de-interleaved device buffer as a `GpuLdeExt3` handle,
/// all in one GPU call. The kept handle lets R4 DEEP read the composition LDE
/// straight from device memory (no `2 * 3 * lde_size * 8` byte re-H2D). No
/// Merkle tree is built here: the composition commitment tree is still built
/// separately by `try_build_comp_poly_tree_gpu`, because the keep pipeline's
/// on-device tree is in the wrong order for the composition commitment.
///
/// Weights = `[1/N, g^(-1)/N, g^(-2)/N, …, g^(-(N-1))/N]`. This bakes the
/// `(g²)^(-k)` input-coset-undo from `interpolate_offset_fft` together with
/// the `g^k` forward-coset-shift from `evaluate_polynomial_on_lde_domain` —
/// net is `g^(-k)` — plus the `1/N` iFFT normalisation.
/// `columns` must be `[H0, H1]`, each `N` ext3 evals on the g²-coset; on success
/// they are expanded in place to `lde_size`. Weights are built here as
/// `[1/N, g^(-1)/N, …, g^(-(N-1))/N]` (the same construction the prior D2H
/// `try_extend_two_halves_gpu` used, hence known-correct on CUDA): this bakes the
/// `(g²)^(-k)` input-coset-undo from `interpolate_offset_fft`, the `g^k` forward
/// coset-shift from `evaluate_polynomial_on_lde_domain` (net `g^(-k)`), and the
/// `1/N` iFFT normalisation. They are NOT reused from `CompositionLdeTwiddles`
/// because the CPU `coset_lde_full` and the GPU `coset_lde_batch` need not share
/// a weight-application convention.
///
/// Returns `None` when the GPU path doesn't apply (too small, or CPU path
/// should be used); in that case the caller runs its existing rayon::join.
#[allow(clippy::type_complexity)]
pub(crate) fn try_extend_two_halves_gpu<F, E>(
h0: &[FieldElement<E>],
h1: &[FieldElement<E>],
/// Returns `None` when the GPU path doesn't apply (too small / non-ext3 /
/// non-Goldilocks, all checked by `check_ext3_layout`), or when the GPU call
/// fails or yields no handle; in that latter case `restore_columns_on_err` is
/// invoked to restore `columns` to their original `N`-length contents.
/// The caller then runs the CPU `join` fallback.
pub(crate) fn try_extend_two_halves_gpu_keep<F, E>(
columns: &mut [Vec<FieldElement<E>>],
domain: &Domain<F>,
) -> Option<(Vec<FieldElement<E>>, Vec<FieldElement<E>>)>
) -> Option<math_cuda::lde::GpuLdeExt3>
where
F: IsFFTField + IsField + 'static,
E: IsField + 'static,
F: IsSubFieldOf<E>,
{
if h0.len() != h1.len() {
return None;
}
let n = h0.len();
let blowup = 2; // extend_half_to_lde extends N → 2N always
let lde_size = n * blowup;
if lde_size < gpu_lde_threshold() {
return None;
}
if TypeId::of::<E>() != TypeId::of::<Degree3GoldilocksExtensionField>() {
return None;
}
if TypeId::of::<F>() != TypeId::of::<GoldilocksField>() {
return None;
}
GPU_EXTEND_HALVES_CALLS.fetch_add(1, Ordering::Relaxed);
// Weights are built from `g = domain.coset_offset` directly: the
// CPU caller previously passed `g²` redundantly. See the
// `g^(-k) / N` weight loop below.

// Flatten ext3 slices to raw 3*n u64 buffers.
let to_u64 = |col: &[FieldElement<E>]| -> Vec<u64> {
let len = col.len() * 3;
let ptr = col.as_ptr() as *const u64;
unsafe { from_raw_parts(ptr, len) }.to_vec()
// Self-guards size / threshold / Goldilocks-ext3 / equal-length. blowup = 2
// (extend_half_to_lde always extends N -> 2N).
let (n, lde_size) = match check_ext3_layout::<F, E>(columns, 2) {
LayoutDispatch::Empty | LayoutDispatch::Skip => return None,
LayoutDispatch::Run { n, lde_size } => (n, lde_size),
};
let h0_raw = to_u64(h0);
let h1_raw = to_u64(h1);

// weights[k] = g^(-k) / N as a u64.
// weights[k] = g^(-k) / N (g = domain.coset_offset): bakes the (g^2)^(-k)
// input-coset-undo, the g^k output-coset-shift (net g^(-k)) and the 1/N iFFT
// normalisation. Same construction as the prior D2H path (known-correct on
// CUDA); built locally rather than reused from CompositionLdeTwiddles.
let inv_n = FieldElement::<F>::from(n as u64).inv().expect("N nonzero");
let g = &domain.coset_offset;
let g_inv = g.inv().expect("g nonzero");
let mut weights_u64 = Vec::with_capacity(n);
let mut w = inv_n.clone();
let g_inv = domain.coset_offset.inv().expect("g nonzero");
let mut weights = Vec::with_capacity(n);
let mut w = inv_n;
for _ in 0..n {
// F == GoldilocksField by TypeId check above, so value is u64.
let v: u64 = unsafe { *(w.value() as *const _ as *const u64) };
weights_u64.push(v);
weights.push(w.clone());
w *= &g_inv;
}

// Pre-allocate outputs.
let mut lde_h0 = vec![FieldElement::<E>::zero(); lde_size];
let mut lde_h1 = vec![FieldElement::<E>::zero(); lde_size];
// SAFETY: layout checked above. `columns_to_u64_ext3` copies the N input
// values out before we presize, so input and output don't alias.
let raw_columns = unsafe { columns_to_u64_ext3::<E>(columns) };
let weights_u64 = unsafe { weights_to_u64::<F>(&weights) };
let slices: Vec<&[u64]> = raw_columns.iter().map(|c| c.as_slice()).collect();

// Two ext3 columns (h0 + h1), each composed of 3 base-field components.
const NUM_COLS: usize = 2;
GPU_LDE_CALLS.fetch_add((NUM_COLS * 3) as u64, Ordering::Relaxed);
{
let inputs: [&[u64]; 2] = [&h0_raw, &h1_raw];
// View each output Vec<FieldElement<E>> as &mut [u64] of length 3*lde_size.
let out0_ptr = lde_h0.as_mut_ptr() as *mut u64;
let out1_ptr = lde_h1.as_mut_ptr() as *mut u64;
// SAFETY: ext3 FieldElement is [u64; 3] in memory, and the Vec has len
// = lde_size so the backing is 3*lde_size u64s.
let ext3_len = lde_size
.checked_mul(3)
.expect("ext3 output length overflow");
let out0_slice = unsafe { from_raw_parts_mut(out0_ptr, ext3_len) };
let out1_slice = unsafe { from_raw_parts_mut(out1_ptr, ext3_len) };
let mut outputs: [&mut [u64]; 2] = [out0_slice, out1_slice];
if math_cuda::lde::coset_lde_batch_ext3_into(&inputs, n, blowup, &weights_u64, &mut outputs)
.is_err()
{
return None;
// `presize_and_view_ext3` does `set_len(lde_size)` in place, so each column
// needs `capacity >= lde_size`. H0/H1 come from `map_unzip` with capacity N.
for col in columns.iter_mut() {
if col.capacity() < lde_size {
col.reserve_exact(lde_size - col.len());
}
}

Some((lde_h0, lde_h1))
GPU_LDE_CALLS.fetch_add((columns.len() * 3) as u64, Ordering::Relaxed);

// No-tree keep variant: writes the LDE into `columns` (presized in place) and
// retains the de-interleaved device buffer as a `GpuLdeExt3` handle for R4
// DEEP. The composition Merkle tree is built separately by
// `try_build_comp_poly_tree_gpu` (the keep pipeline's on-device tree is in the
// wrong order for the composition's commitment).
let handle = {
let mut raw_outputs = unsafe { presize_and_view_ext3::<E>(columns, lde_size) };
math_cuda::lde::coset_lde_batch_ext3_into_keep(
&slices,
n,
2,
&weights_u64,
&mut raw_outputs,
)
};
match handle {
Ok(Some(h)) => {
GPU_EXTEND_HALVES_CALLS.fetch_add(1, Ordering::Relaxed);
Some(h)
}
Ok(None) | Err(_) => {
restore_columns_on_err(columns, n);
None
}
}
}

pub(crate) static GPU_LEAF_HASH_CALLS: AtomicU64 = AtomicU64::new(0);
Expand Down
84 changes: 66 additions & 18 deletions crypto/stark/src/prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,15 +421,29 @@ where
pub(crate) composition_poly_merkle_tree: BatchedMerkleTree<F>,
/// The commitment to the composition polynomial parts.
pub(crate) composition_poly_root: Commitment,
/// Device-resident de-interleaved LDE handle from the R2 fused GPU path
/// (`try_evaluate_parts_on_lde_gpu_keep`). When present, R4 DEEP skips
/// the `num_parts * 3 * lde_size * 8` byte H2D and reads parts on
/// device. `None` when the GPU R2 path didn't run (number_of_parts <= 2,
/// below threshold, or any CPU fallback).
/// Device-resident de-interleaved LDE handle from an R2 fused GPU `_keep`
/// path: the 2-part `try_extend_two_halves_gpu_keep` (the common case, after
/// the degree-2 quotient decomposition) or the >2-part
/// `try_evaluate_parts_on_lde_gpu_keep`. When present, R4 DEEP skips the
/// `num_parts * 3 * lde_size * 8` byte H2D and reads parts on device. `None`
/// when the GPU R2 path didn't run (single-part AIR, below threshold, or any
/// CPU fallback).
#[cfg(feature = "cuda")]
pub(crate) gpu_composition_parts: Option<math_cuda::lde::GpuLdeExt3>,
}

/// Output of [`Prover::decompose_and_extend_d2`]: the two composition-poly part
/// LDE evaluation vectors, plus (under `cuda`) the retained `GpuLdeExt3` device
/// handle when the GPU no-tree keep path ran (consumed by R4 DEEP to avoid the
/// composition-LDE re-H2D). `gpu_keep` is `None` on the CPU fallback. The
/// composition Merkle tree is built separately (the keep pipeline's on-device
/// tree is in the wrong order for the composition commitment).
pub(crate) struct D2Result<E: IsField> {
/// LDE evaluation vectors of the two composition-poly parts, in `[H0, H1]` order.
pub(crate) evals: Vec<Vec<FieldElement<E>>>,
/// Device handle retained by the GPU keep path; `None` when the CPU fallback
/// produced `evals`. Only present when the `cuda` feature is enabled.
#[cfg(feature = "cuda")]
pub(crate) gpu_keep: Option<math_cuda::lde::GpuLdeExt3>,
}

/// A container for the results of the third round of the STARK Prove protocol.
pub(crate) struct Round3<F: IsField> {
/// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge.
Expand Down Expand Up @@ -1178,7 +1192,7 @@ pub trait IsStarkProver<
constraint_evaluations: &[FieldElement<FieldExtension>],
domain: &Domain<Field>,
twiddles: &LdeTwiddles<Field>,
) -> Vec<Vec<FieldElement<FieldExtension>>>
) -> D2Result<FieldExtension>
where
FieldElement<Field>: AsBytes + Sync + Send,
FieldElement<FieldExtension>: AsBytes + Sync + Send,
Expand Down Expand Up @@ -1211,21 +1225,48 @@ pub trait IsStarkProver<
// Step 3: Extend each part from n evals on the g²-coset to 2n evals on the
// g-coset (the full LDE domain).

// GPU fast path: batch both halves into one ext3 LDE call. Requires
// `cuda` feature and a qualifying size. Falls through to CPU when not.
// GPU fast path (no-tree `_keep`): one on-device call does the LDE of BOTH
// halves and retains the de-interleaved device buffer as a `GpuLdeExt3`
// handle, which feeds R4 DEEP and eliminates the composition-LDE re-H2D.
// (The composition Merkle tree is built separately below — the keep
// pipeline's on-device tree is in the wrong order for the composition.)
// Falls through to the CPU `rayon::join` when the GPU path doesn't apply
// (`try_extend_two_halves_gpu_keep` restores `cols` to [H0, H1] on `None`).
#[cfg(feature = "cuda")]
if let Some((lde_h0, lde_h1)) =
crate::gpu_lde::try_extend_two_halves_gpu(&h0_evals, &h1_evals, domain)
{
return vec![lde_h0, lde_h1];
let mut cols = vec![h0_evals, h1_evals];
if let Some(handle) = crate::gpu_lde::try_extend_two_halves_gpu_keep::<
Field,
FieldExtension,
>(&mut cols, domain)
{
return D2Result {
evals: cols,
gpu_keep: Some(handle),
};
}
let composition_twiddles = twiddles.composition(domain);
let (lde_h0, lde_h1) = crate::par::join(
|| Self::extend_half_to_lde(&cols[0], composition_twiddles),
|| Self::extend_half_to_lde(&cols[1], composition_twiddles),
);
return D2Result {
evals: vec![lde_h0, lde_h1],
gpu_keep: None,
};
}

let composition_twiddles = twiddles.composition(domain);
let (lde_h0, lde_h1) = crate::par::join(
|| Self::extend_half_to_lde(&h0_evals, composition_twiddles),
|| Self::extend_half_to_lde(&h1_evals, composition_twiddles),
);
vec![lde_h0, lde_h1]
#[cfg(not(feature = "cuda"))]
{
let composition_twiddles = twiddles.composition(domain);
let (lde_h0, lde_h1) = crate::par::join(
|| Self::extend_half_to_lde(&h0_evals, composition_twiddles),
|| Self::extend_half_to_lde(&h1_evals, composition_twiddles),
);
D2Result {
evals: vec![lde_h0, lde_h1],
}
}
}

/// Extend `half_evals` — `n = lde_size/2` evaluations of a degree-`<n` polynomial
Expand Down Expand Up @@ -1302,7 +1343,14 @@ pub trait IsStarkProver<
// H₀(x²) = (H(x) + H(-x)) / 2
// H₁(x²) = (H(x) - H(-x)) / (2x)
// On the LDE coset {g·ω^i}, we have -g·ω^i = g·ω^{i+N} since ω^N = -1.
Self::decompose_and_extend_d2(&constraint_evaluations, domain, twiddles)
let d2 = Self::decompose_and_extend_d2(&constraint_evaluations, domain, twiddles);
#[cfg(feature = "cuda")]
if let Some(handle) = d2.gpu_keep {
// Kept composition-LDE device buffer: R4 DEEP reads it on-device
// instead of re-H2D'ing the composition parts.
gpu_composition_parts = Some(handle);
}
d2.evals
} else if number_of_parts == 1 {
// Degree bound equals trace length: constraint evals are the LDE directly.
vec![constraint_evaluations]
Expand Down
3 changes: 2 additions & 1 deletion crypto/stark/src/tests/prover_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,8 @@ fn test_decompose_and_extend_d2_matches_original() {
&constraint_evaluations,
&domain,
&twiddles,
);
)
.evals;
#[cfg(not(feature = "cuda"))]
assert!(twiddles.has_composition_cache());

Expand Down
Loading
Loading