diff --git a/crates/opentake-media/src/encode/mix.rs b/crates/opentake-media/src/encode/mix.rs
new file mode 100644
index 0000000..d06bc13
--- /dev/null
+++ b/crates/opentake-media/src/encode/mix.rs
@@ -0,0 +1,247 @@
+//! Pure linear audio mixdown for the export pipeline.
+//!
+//! The export orchestrator decodes each audio clip's source window to mono f32
+//! PCM (via [`crate::decode::extract_pcm`]), then this module lays every clip
+//! into one shared timeline buffer at its frame-derived sample offset, applies a
+//! per-sample gain (the clip's `volume_at` envelope, projected to the mix rate),
+//! sums overlapping clips, and hard-limits the result to `[-1.0, 1.0]`.
+//!
+//! Everything here is a pure function over plain `f32` slices — no ffmpeg, no
+//! domain types — so the linear-mix math is unit-tested offline. The encoder
+//! ([`crate::encode::VideoEncoder`]) muxes the produced buffer as a second
+//! ffmpeg input; the orchestrator (`src-tauri/src/export.rs`) supplies the clip
+//! placements.
+//!
+//! Scope of this first cut: a **linear** mixdown skeleton (sum + clamp). No
+//! resampling curve, no pan/stereo field, no dynamics — those are follow-ups.
+//! All clips are decoded at the mix sample rate up front, so mixing is a plain
+//! sample-aligned add.
+
+/// The canonical mixdown sample rate. 48 kHz is the export-audio standard and
+/// what the encoder requests from ffmpeg for the muxed AAC/LPCM track.
+pub const MIX_SAMPLE_RATE: u32 = 48_000;
+
+/// One audio clip's contribution to the mix: a mono f32 source window plus the
+/// per-sample gain to apply, laid down starting at `start_sample` on the shared
+/// timeline buffer.
+///
+/// `gains` is either empty (→ unity gain for every sample) or exactly as long as
+/// `samples` (→ element-wise gain, e.g. a `volume_at` fade envelope sampled at
+/// the mix rate). A mismatched non-empty length is treated as a hard error by
+/// [`mix_clips`] so callers can't silently drift the envelope.
+#[derive(Clone, Debug, PartialEq)]
+pub struct ClipAudio {
+    /// Sample offset of this clip's first sample on the timeline (>= 0).
+    pub start_sample: usize,
+    /// Mono f32 PCM for the clip's visible source window, at [`MIX_SAMPLE_RATE`].
+    pub samples: Vec<f32>,
+    /// Per-sample linear gain. Empty = unity; else must match `samples.len()`.
+    pub gains: Vec<f32>,
+}
+
+impl ClipAudio {
+    /// A clip with a single static `gain` applied to every sample.
+    pub fn with_static_gain(start_sample: usize, samples: Vec<f32>, gain: f32) -> Self {
+        let gains = if (gain - 1.0).abs() < f32::EPSILON {
+            Vec::new()
+        } else {
+            vec![gain; samples.len()]
+        };
+        ClipAudio {
+            start_sample,
+            samples,
+            gains,
+        }
+    }
+
+    /// Last timeline sample index this clip touches (exclusive end).
+    fn end_sample(&self) -> usize {
+        self.start_sample + self.samples.len()
+    }
+}
+
+/// Mix every clip into one mono f32 buffer.
+///
+/// The output length is the furthest `end_sample` across all clips (so trailing
+/// silence past the last clip is not emitted). Overlapping clips sum; the final
+/// buffer is hard-limited to `[-1.0, 1.0]`. An empty input yields an empty
+/// buffer (the caller then muxes no audio).
+///
+/// Returns `Err` if any clip's non-empty `gains` length doesn't match its
+/// `samples` length — a programming error in the caller's per-sample envelope.
+pub fn mix_clips(clips: &[ClipAudio]) -> Result<Vec<f32>, String> {
+    for (i, c) in clips.iter().enumerate() {
+        if !c.gains.is_empty() && c.gains.len() != c.samples.len() {
+            return Err(format!(
+                "clip {i}: gains len {} != samples len {}",
+                c.gains.len(),
+                c.samples.len()
+            ));
+        }
+    }
+
+    let total = clips.iter().map(ClipAudio::end_sample).max().unwrap_or(0);
+    let mut out = vec![0.0f32; total];
+
+    for c in clips {
+        for (k, &s) in c.samples.iter().enumerate() {
+            let g = if c.gains.is_empty() { 1.0 } else { c.gains[k] };
+            out[c.start_sample + k] += s * g;
+        }
+    }
+
+    for v in &mut out {
+        *v = v.clamp(-1.0, 1.0);
+    }
+    Ok(out)
+}
+
+/// Convert a mono f32 buffer to raw 16-bit little-endian PCM bytes (the wire
+/// format the encoder writes to a headerless temp file for muxing). Each sample
+/// is clamped to `[-1.0, 1.0]` and scaled by 32767, matching ffmpeg's `s16le`.
+pub fn mono_f32_to_s16le(samples: &[f32]) -> Vec<u8> {
+    let mut out = Vec::with_capacity(samples.len() * 2);
+    for &s in samples {
+        let scaled = (s.clamp(-1.0, 1.0) * 32767.0).round() as i16;
+        out.extend_from_slice(&scaled.to_le_bytes());
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_input_yields_empty_buffer() {
+        assert_eq!(mix_clips(&[]).unwrap(), Vec::<f32>::new());
+    }
+
+    #[test]
+    fn single_clip_unity_gain_passes_through() {
+        let c = ClipAudio {
+            start_sample: 0,
+            samples: vec![0.1, -0.2, 0.3],
+            gains: Vec::new(),
+        };
+        assert_eq!(mix_clips(&[c]).unwrap(), vec![0.1, -0.2, 0.3]);
+    }
+
+    #[test]
+    fn clip_offset_lays_after_leading_silence() {
+        let c = ClipAudio {
+            start_sample: 2,
+            samples: vec![0.5, 0.5],
+            gains: Vec::new(),
+        };
+        // two leading zeros, then the clip
+        assert_eq!(mix_clips(&[c]).unwrap(), vec![0.0, 0.0, 0.5, 0.5]);
+    }
+
+    #[test]
+    fn overlapping_clips_sum() {
+        let a = ClipAudio {
+            start_sample: 0,
+            samples: vec![0.2, 0.2, 0.2],
+            gains: Vec::new(),
+        };
+        let b = ClipAudio {
+            start_sample: 1,
+            samples: vec![0.3, 0.3],
+            gains: Vec::new(),
+        };
+        // index1: 0.2+0.3=0.5 ; index2: 0.2+0.3=0.5
+        assert_eq!(mix_clips(&[a, b]).unwrap(), vec![0.2, 0.5, 0.5]);
+    }
+
+    #[test]
+    fn summed_overshoot_is_hard_limited() {
+        let a = ClipAudio {
+            start_sample: 0,
+            samples: vec![0.8],
+            gains: Vec::new(),
+        };
+        let b = ClipAudio {
+            start_sample: 0,
+            samples: vec![0.8],
+            gains: Vec::new(),
+        };
+        // 1.6 -> clamped to 1.0
+        assert_eq!(mix_clips(&[a, b]).unwrap(), vec![1.0]);
+        // and the negative rail
+        let c = ClipAudio {
+            start_sample: 0,
+            samples: vec![-0.9, -0.9],
+            gains: Vec::new(),
+        };
+        let d = ClipAudio {
+            start_sample: 0,
+            samples: vec![-0.9, -0.9],
+            gains: Vec::new(),
+        };
+        assert_eq!(mix_clips(&[c, d]).unwrap(), vec![-1.0, -1.0]);
+    }
+
+    #[test]
+    fn per_sample_gain_is_applied() {
+        let c = ClipAudio {
+            start_sample: 0,
+            samples: vec![1.0, 1.0, 1.0],
+            gains: vec![0.0, 0.5, 1.0],
+        };
+        assert_eq!(mix_clips(&[c]).unwrap(), vec![0.0, 0.5, 1.0]);
+    }
+
+    #[test]
+    fn static_gain_helper_skips_envelope_at_unity() {
+        let c = ClipAudio::with_static_gain(0, vec![0.4, 0.4], 1.0);
+        assert!(c.gains.is_empty(), "unity gain stores no envelope");
+        let c2 = ClipAudio::with_static_gain(0, vec![0.4, 0.4], 0.5);
+        assert_eq!(c2.gains, vec![0.5, 0.5]);
+        assert_eq!(mix_clips(&[c2]).unwrap(), vec![0.2, 0.2]);
+    }
+
+    #[test]
+    fn mismatched_gain_length_errors() {
+        let c = ClipAudio {
+            start_sample: 0,
+            samples: vec![0.1, 0.2],
+            gains: vec![1.0], // wrong length
+        };
+        let err = mix_clips(&[c]).unwrap_err();
+        assert!(err.contains("gains len"), "got: {err}");
+    }
+
+    #[test]
+    fn output_length_is_furthest_clip_end() {
+        let a = ClipAudio {
+            start_sample: 0,
+            samples: vec![0.1],
+            gains: Vec::new(),
+        };
+        let b = ClipAudio {
+            start_sample: 10,
+            samples: vec![0.1, 0.1],
+            gains: Vec::new(),
+        };
+        // furthest end = 10 + 2 = 12
+        assert_eq!(mix_clips(&[a, b]).unwrap().len(), 12);
+    }
+
+    #[test]
+    fn s16le_encodes_unit_floats() {
+        // 0.0 -> 0 ; 1.0 -> 32767 ; -1.0 -> -32767
+        let bytes = mono_f32_to_s16le(&[0.0, 1.0, -1.0]);
+        assert_eq!(bytes.len(), 6);
+        assert_eq!(i16::from_le_bytes([bytes[0], bytes[1]]), 0);
+        assert_eq!(i16::from_le_bytes([bytes[2], bytes[3]]), 32767);
+        assert_eq!(i16::from_le_bytes([bytes[4], bytes[5]]), -32767);
+    }
+
+    #[test]
+    fn s16le_clamps_out_of_range() {
+        let bytes = mono_f32_to_s16le(&[2.0, -2.0]);
+        assert_eq!(i16::from_le_bytes([bytes[0], bytes[1]]), 32767);
+        assert_eq!(i16::from_le_bytes([bytes[2], bytes[3]]), -32767);
+    }
+}
diff --git a/crates/opentake-media/src/encode/mod.rs b/crates/opentake-media/src/encode/mod.rs
index 062bf02..c4f5547 100644
--- a/crates/opentake-media/src/encode/mod.rs
+++ b/crates/opentake-media/src/encode/mod.rs
@@ -7,12 +7,14 @@
 //! frames (SPEC §2.4 / §8.2). The arg builder ([`encode_args`]) is pure and
 //! unit-tested; the encode itself requires ffmpeg.
 
+pub mod mix;
 pub mod preset;
 
+pub use mix::{mix_clips, mono_f32_to_s16le, ClipAudio, MIX_SAMPLE_RATE};
 pub use preset::{even_dimension, ExportPreset, ExportResolution, VideoCodec};
 
 use std::io::Write;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
 use crate::decode::pcm::PcmBuffer;
 use crate::error::{MediaError, Result};
@@ -49,16 +51,60 @@ fn encode_args(out: &Path, w: u32, h: u32, fps: i32, preset: &ExportPreset) -> V
     args
 }
 
+/// Pure builder for the mux-pass ffmpeg args: copy the audio-less video at
+/// `video_in`, encode the raw mono `s16le` PCM at `pcm_in` as `acodec`, write `out`.
+///
+/// `-af apad` + `-shortest` end the output on the last video frame: a short mix
+/// is padded with silence instead of truncating the video; a long tail is cut.
+fn mux_args(
+    video_in: &Path,
+    pcm_in: &Path,
+    out: &Path,
+    sample_rate: u32,
+    acodec: &str,
+) -> Vec<String> {
+    vec![
+        "-y".into(),
+        // Input 0: the encoded video (audio-less).
+        "-i".into(),
+        video_in.to_string_lossy().into_owned(),
+        // Input 1: raw mono s16le PCM (the mixed audio).
+        "-f".into(),
+        "s16le".into(),
+        "-ar".into(),
+        sample_rate.to_string(),
+        "-ac".into(),
+        "1".into(),
+        "-i".into(),
+        pcm_in.to_string_lossy().into_owned(),
+        // Copy the video verbatim; encode the audio, padded to the video's length.
+        "-c:v".into(),
+        "copy".into(),
+        "-c:a".into(),
+        acodec.into(),
+        "-af".into(),
+        "apad".into(),
+        "-shortest".into(),
+        out.to_string_lossy().into_owned(),
+    ]
+}
+
 /// A streaming RGBA → video encoder. Push frames in order, then `finish`.
 ///
-/// Audio muxing for a pre-rendered mix is intentionally limited here: the export
-/// pipeline composites/mixes audio in `opentake-render`; a follow-up wires the
-/// mixed PCM as a second ffmpeg input. For now [`push_audio`] records the PCM so
-/// the render layer can supply it, and the video-only path is fully functional.
+/// When [`Self::push_audio`] has supplied a mixed PCM buffer, `finish` runs a second
+/// ffmpeg pass that muxes the audio into the encoded container (`-c:v copy` +
+/// `-c:a aac`/`pcm_s16le`). Without audio the video-only first pass *is* the
+/// final file. The mux-args builder ([`mux_args`]) is pure and unit-tested; the
+/// mux itself requires ffmpeg.
 pub struct VideoEncoder {
     child: ffmpeg_sidecar::child::FfmpegChild,
     stdin: Option<std::process::ChildStdin>,
     expected_frame_bytes: usize,
+    /// Final output path (the video first pass writes here; the mux pass, when
+    /// audio is present, rewrites it from a temp video + the PCM).
+    out_path: PathBuf,
+    /// ffmpeg `-c:a` token for the mux pass (from the preset).
+    acodec: &'static str,
     pending_audio: Option<PcmBuffer>,
 }
 
@@ -74,6 +120,8 @@ impl VideoEncoder {
             child,
             stdin,
             expected_frame_bytes: w as usize * h as usize * 4,
+            out_path: out.to_path_buf(),
+            acodec: preset.acodec_arg(),
             pending_audio: None,
         })
     }
@@ -98,24 +146,93 @@ impl VideoEncoder {
         Ok(())
     }
 
-    /// Record the mixed audio PCM to mux. (Muxing is completed by the render
-    /// export pipeline; see the type docs.)
+    /// Record the mixed-down mono audio buffer to mux on `finish`. The buffer's
+    /// `spec.sample_rate` is the rate ffmpeg is told to read the muxed PCM at
+    /// (the orchestrator decodes/mixes at [`MIX_SAMPLE_RATE`]). An empty buffer
+    /// clears any pending audio, so `finish` then keeps the video-only output.
     pub fn push_audio(&mut self, pcm: PcmBuffer) {
-        self.pending_audio = Some(pcm);
+        if pcm.samples_f32.is_empty() {
+            self.pending_audio = None;
+        } else {
+            self.pending_audio = Some(pcm);
+        }
     }
 
-    /// Finish encoding: close stdin and wait for ffmpeg to flush the container.
+    /// Finish encoding: close stdin, wait for the video pass, then — when a
+    /// mixed audio buffer was supplied — run a second ffmpeg pass to mux it in.
+    ///
+    /// The video first pass writes `out_path` directly. To mux, the encoded
+    /// video is moved aside to a sibling temp file, the mixed PCM is written to
+    /// another temp file, and ffmpeg copies the video stream while encoding the
+    /// audio back into `out_path`. Both temp files are removed afterward (best
+    /// effort). Without audio this is exactly the old video-only `finish`.
     pub fn finish(mut self) -> Result<()> {
-        // Drop stdin to signal EOF to ffmpeg.
+        // Drop stdin to signal EOF to ffmpeg, then wait for the video pass.
         self.stdin.take();
         let status = self.child.wait().map_err(MediaError::Io)?;
         if !status.success() {
             return Err(MediaError::Encode(format!("ffmpeg exited {status}")));
         }
-        Ok(())
+
+        let Some(pcm) = self.pending_audio.take() else {
+            return Ok(()); // video-only: the first pass is the final file.
+        };
+
+        self.mux_audio(&pcm)
+    }
+
+    /// Second ffmpeg pass: mux `pcm` (mono f32, written as s16le) into the
+    /// already-encoded video at `self.out_path`, in place.
+    fn mux_audio(&self, pcm: &PcmBuffer) -> Result<()> {
+        let out = &self.out_path;
+        // Sibling temp paths next to the output (same dir → cheap rename, same
+        // filesystem). Suffixes keep them distinct from the final artifact.
+        let video_tmp = sibling_temp(out, "video");
+        let pcm_tmp = sibling_temp(out, "pcm");
+
+        // Move the encoded video aside so ffmpeg can rewrite `out` from it.
+        std::fs::rename(out, &video_tmp).map_err(MediaError::Io)?;
+
+        // Run the mux, cleaning up temps regardless of outcome.
+        let result = (|| {
+            let bytes = mix::mono_f32_to_s16le(&pcm.samples_f32);
+            std::fs::write(&pcm_tmp, &bytes).map_err(MediaError::Io)?;
+
+            let args = mux_args(&video_tmp, &pcm_tmp, out, pcm.spec.sample_rate, self.acodec);
+            let mut child = crate::ff::ffmpeg()
+                .args(args)
+                .spawn()
+                .map_err(|e| MediaError::Encode(format!("mux spawn: {e}")))?;
+            let status = child.wait().map_err(MediaError::Io)?;
+            if !status.success() {
+                return Err(MediaError::Encode(format!("ffmpeg mux exited {status}")));
+            }
+            Ok(())
+        })();
+
+        // Best-effort cleanup. If the mux failed, restore the video-only file so
+        // the caller still has a valid (audio-less) export rather than nothing.
+        let _ = std::fs::remove_file(&pcm_tmp);
+        if result.is_err() {
+            let _ = std::fs::rename(&video_tmp, out);
+        } else {
+            let _ = std::fs::remove_file(&video_tmp);
+        }
+        result
     }
 }
 
+/// Build a sibling temp path next to `out`: `<name>.<tag>.tmp`. Stays on the same
+/// filesystem so the rename in `mux_audio` is atomic and cheap.
+fn sibling_temp(out: &Path, tag: &str) -> PathBuf {
+    let mut name = out
+        .file_name()
+        .map(|n| n.to_os_string())
+        .unwrap_or_default();
+    name.push(format!(".{tag}.tmp"));
+    out.with_file_name(name)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -151,4 +268,44 @@ mod tests {
         // ProRes path does not add BT.709 color tags here.
         assert!(!args.windows(2).any(|w| w == ["-colorspace", "bt709"]));
     }
+
+    #[test]
+    fn mux_args_copy_video_and_encode_audio() {
+        let args = mux_args(
+            Path::new("/v.mp4"),
+            Path::new("/a.pcm"),
+            Path::new("/out.mp4"),
+            48_000,
+            "aac",
+        );
+        // video input first, then the raw s16le PCM input declared with rate/ch.
+        assert!(args.windows(2).any(|w| w == ["-i", "/v.mp4"]));
+        assert!(args.windows(2).any(|w| w == ["-f", "s16le"]));
+        assert!(args.windows(2).any(|w| w == ["-ar", "48000"]));
+        assert!(args.windows(2).any(|w| w == ["-ac", "1"]));
+        assert!(args.windows(2).any(|w| w == ["-i", "/a.pcm"]));
+        // copy the video stream, encode audio with the preset codec.
+        assert!(args.windows(2).any(|w| w == ["-c:v", "copy"]));
+        assert!(args.windows(2).any(|w| w == ["-c:a", "aac"]));
+        assert!(args.iter().any(|a| a == "-shortest"));
+        assert_eq!(args.last().unwrap(), "/out.mp4");
+    }
+
+    #[test]
+    fn mux_args_threads_prores_lpcm_codec() {
+        let args = mux_args(
+            Path::new("/v.mov"),
+            Path::new("/a.pcm"),
+            Path::new("/out.mov"),
+            48_000,
+            "pcm_s16le",
+        );
+        assert!(args.windows(2).any(|w| w == ["-c:a", "pcm_s16le"]));
+    }
+
+    #[test]
+    fn sibling_temp_keeps_directory_and_tags_name() {
+        let t = sibling_temp(Path::new("/tmp/clip/out.mp4"), "video");
+        assert_eq!(t, PathBuf::from("/tmp/clip/out.mp4.video.tmp"));
+    }
 }
diff --git a/src-tauri/src/export.rs b/src-tauri/src/export.rs
index 1031ffb..bd8f8b9 100644
--- a/src-tauri/src/export.rs
+++ b/src-tauri/src/export.rs
@@ -7,9 +7,13 @@
 //! (`opentake_media::VideoEncoder`) to produce a real `.mp4` on disk.
 //!
 //! Scope of this first cut (SPEC §2.4 / §8.2):
-//! - **Pure video** (no audio mix), **H.264 / .mp4** only. The encoder already
-//!   supports H.265 / ProRes presets and an audio side-channel; those land in a
-//!   follow-up so this slice stays a clean, verifiable spine.
+//! - **H.264 / .mp4** only. The encoder already supports H.265 / ProRes presets;
+//!   those land in a follow-up so this slice stays a clean, verifiable spine.
+//! - **Linear audio mixdown**: every audio-bearing clip's source window is
+//!   decoded to mono f32 at the mix rate, placed at its frame-derived sample
+//!   offset, scaled by its `volume_at` envelope, summed, hard-limited, and muxed
+//!   in by the encoder (`-c:v copy` + AAC). A timeline with no audio still
+//!   produces the same video-only file as before.
 //! - Export renders at the **full** export resolution
 //!   ([`opentake_render::export_render_size`]), not the preview cap.
 //! - No progress callback / cancellation yet (the orchestrator runs to
@@ -29,10 +33,11 @@ use serde::{Deserialize, Serialize};
 use tauri::State;
 
 use opentake_core::AppCore;
-use opentake_domain::{ClipType, MediaSource, TextStyle};
+use opentake_domain::{Clip, ClipType, MediaSource, TextStyle};
+use opentake_media::encode::{mix, ClipAudio, MIX_SAMPLE_RATE};
 use opentake_media::{
-    decode_frame_at, ExportPreset, ExportResolution as EncodeResolution, FrameRequest, RgbaFrame,
-    VideoCodec, VideoEncoder,
+    decode_frame_at, extract_pcm, ExportPreset, ExportResolution as EncodeResolution, FrameRequest,
+    PcmBuffer, PcmFormat, PcmSpec, RgbaFrame, VideoCodec, VideoEncoder,
 };
 use opentake_render::gpu::texture::upload_rgba;
 use opentake_render::{
@@ -314,6 +319,124 @@ fn project_media(
     (sizes, media)
 }
 
+/// PCM spec the export decodes every audio source window into: mono f32 at the
+/// shared mix sample rate. Decoding at the mix rate up front makes the mixdown a
+/// plain sample-aligned add (no per-clip resampling in this cut).
+const AUDIO_DECODE_SPEC: PcmSpec = PcmSpec {
+    sample_rate: MIX_SAMPLE_RATE,
+    channels: 1,
+    format: PcmFormat::F32,
+};
+
+/// Project one audio clip into a [`ClipAudio`] for the mixdown: decode its
+/// visible source window, place it at its frame-derived sample offset, and build
+/// the per-sample `volume_at` gain envelope.
+///
+/// Returns `Ok(None)` when the clip contributes no audio (no media path, no
+/// audio track, zero-length window, or a fully-decoded-to-empty buffer). Decode
+/// failures other than "no audio track" propagate as `Err`.
+fn project_clip_audio(
+    clip: &Clip,
+    media: &HashMap<String, MediaInfo>,
+    timeline_fps: i32,
+) -> Result<Option<ClipAudio>, String> {
+    if clip.duration_frames <= 0 || timeline_fps <= 0 {
+        return Ok(None);
+    }
+    let Some(info) = media.get(&clip.media_ref) else {
+        return Ok(None);
+    };
+
+    // Source window in seconds: the clip's trim start through the frames it
+    // consumes, at the *source* fps. Falls back to the timeline fps when the
+    // source rate is unknown (audio-only assets often report no fps).
+    let src_fps = if info.fps > 0.0 {
+        info.fps
+    } else {
+        timeline_fps as f64
+    };
+    let lo = clip.trim_start_frame.max(0) as f64 / src_fps;
+    let consumed = clip.source_frames_consumed().max(0);
+    if consumed == 0 {
+        return Ok(None);
+    }
+    let hi = lo + consumed as f64 / src_fps;
+
+    let pcm = match extract_pcm(&info.path, &AUDIO_DECODE_SPEC, Some((lo, hi))) {
+        Ok(p) => p,
+        // A clip pointing at a video with no audio track simply contributes
+        // silence — not an export failure.
+        Err(opentake_media::MediaError::NoTrack(_, _)) => return Ok(None),
+        Err(e) => return Err(format!("audio decode failed for {}: {e}", clip.media_ref)),
+    };
+    if pcm.samples_f32.is_empty() {
+        return Ok(None);
+    }
+
+    // Placement: the clip's timeline start frame, in mix samples.
+    let start_sample = ((clip.start_frame.max(0) as f64) / timeline_fps as f64
+        * MIX_SAMPLE_RATE as f64)
+        .round() as usize;
+
+    // Per-sample gain from `volume_at`, sampled at the timeline frame each mix
+    // sample falls on. Unity throughout collapses to an empty envelope.
+    let samples_per_frame = MIX_SAMPLE_RATE as f64 / timeline_fps as f64;
+    let mut gains = Vec::with_capacity(pcm.samples_f32.len());
+    let mut all_unity = true;
+    for k in 0..pcm.samples_f32.len() {
+        let tl_frame = clip.start_frame + (k as f64 / samples_per_frame).floor() as i32;
+        let g = clip.volume_at(tl_frame) as f32;
+        if (g - 1.0).abs() > f32::EPSILON {
+            all_unity = false;
+        }
+        gains.push(g);
+    }
+
+    Ok(Some(ClipAudio {
+        start_sample,
+        samples: pcm.samples_f32,
+        gains: if all_unity { Vec::new() } else { gains },
+    }))
+}
+
+/// Decode + mix every audio-bearing clip on the timeline into one mono buffer.
+///
+/// Walks audio and video clips (video clips can carry an audio track), projects
+/// each through [`project_clip_audio`], and linearly mixes the lot. Returns
+/// `None` when nothing contributes audio (→ the caller keeps the video-only
+/// output). Errors surface decode/mix failures to the front-end.
+fn mix_timeline_audio(
+    timeline: &opentake_domain::Timeline,
+    media: &HashMap<String, MediaInfo>,
+) -> Result<Option<PcmBuffer>, String> {
+    let mut clips_audio: Vec<ClipAudio> = Vec::new();
+    for track in &timeline.tracks {
+        if track.muted {
+            continue;
+        }
+        for clip in &track.clips {
+            // Only audio and video clips carry sound; text/image/lottie don't.
+            if clip.media_type != ClipType::Audio && clip.media_type != ClipType::Video {
+                continue;
+            }
+            if let Some(ca) = project_clip_audio(clip, media, timeline.fps)? {
+                clips_audio.push(ca);
+            }
+        }
+    }
+    if clips_audio.is_empty() {
+        return Ok(None);
+    }
+    let mixed = mix::mix_clips(&clips_audio).map_err(|e| format!("audio mix failed: {e}"))?;
+    if mixed.is_empty() {
+        return Ok(None);
+    }
+    Ok(Some(PcmBuffer {
+        spec: AUDIO_DECODE_SPEC,
+        samples_f32: mixed,
+    }))
+}
+
 /// `export_video`: render the whole timeline to a video file on disk.
/// /// Composites every frame at the full export resolution and encodes them to @@ -402,6 +525,12 @@ pub fn run_export( .map_err(|e| format!("encode frame {f} failed: {e}"))?; } + // Decode + linearly mix every audio-bearing clip, then hand the mixed PCM to + // the encoder so `finish` mux's it into the container. No audio → video-only. + if let Some(pcm) = mix_timeline_audio(timeline, &media)? { + encoder.push_audio(pcm); + } + encoder .finish() .map_err(|e| format!("encoder finish failed: {e}"))?; @@ -507,4 +636,66 @@ mod tests { .expect("parse"); assert_eq!(req.quality, ExportQuality::P720); } + + use opentake_domain::{Timeline, Track}; + + #[test] + fn project_clip_audio_skips_clip_with_no_media_entry() { + // No matching manifest entry → no audio contribution, no decode attempt. + let clip = Clip::new("c1", "missing-asset", 0, 30); + let media: HashMap = HashMap::new(); + let got = project_clip_audio(&clip, &media, 30).expect("ok"); + assert!(got.is_none()); + } + + #[test] + fn project_clip_audio_skips_zero_duration() { + let clip = Clip::new("c1", "asset-1", 0, 0); + let mut media: HashMap = HashMap::new(); + media.insert( + "asset-1".into(), + MediaInfo { + path: PathBuf::from("/nonexistent.wav"), + fps: 0.0, + }, + ); + // duration 0 short-circuits before any decode is attempted. + assert!(project_clip_audio(&clip, &media, 30).expect("ok").is_none()); + } + + #[test] + fn mix_timeline_audio_none_when_only_text_clips() { + // A text clip carries no sound; with no audio/video clips there's nothing + // to decode, so the result is None without touching the media map. 
+ let mut tl = Timeline::new(); + let mut track = Track::new("t1", ClipType::Text); + let mut clip = Clip::new("c1", "asset-1", 0, 30); + clip.media_type = ClipType::Text; + track.clips.push(clip); + tl.tracks.push(track); + let media: HashMap = HashMap::new(); + assert!(mix_timeline_audio(&tl, &media).expect("ok").is_none()); + } + + #[test] + fn mix_timeline_audio_skips_muted_tracks() { + // A muted audio track is excluded; with no other audio the result is None + // and the (missing-path) asset is never decoded. + let mut tl = Timeline::new(); + let mut track = Track::new("t1", ClipType::Audio); + track.muted = true; + let mut clip = Clip::new("c1", "asset-1", 0, 30); + clip.media_type = ClipType::Audio; + track.clips.push(clip); + tl.tracks.push(track); + let mut media: HashMap = HashMap::new(); + media.insert( + "asset-1".into(), + MediaInfo { + path: PathBuf::from("/nonexistent.wav"), + fps: 0.0, + }, + ); + assert!(mix_timeline_audio(&tl, &media).expect("ok").is_none()); + } } diff --git a/src-tauri/tests/export_integration.rs b/src-tauri/tests/export_integration.rs index b071a74..217f80b 100644 --- a/src-tauri/tests/export_integration.rs +++ b/src-tauri/tests/export_integration.rs @@ -54,6 +54,58 @@ fn make_video(path: &Path, w: u32, h: u32, fps: u32, frames: u32) -> bool { .unwrap_or(false) } +/// Generate an N-frame test video *with* a sine audio track. Returns false on +/// failure (→ skip). 
+fn make_video_with_audio(path: &Path, w: u32, h: u32, fps: u32, frames: u32) -> bool {
+    let dur = frames as f64 / fps as f64;
+    Command::new("ffmpeg")
+        .args([
+            "-v",
+            "error",
+            "-f",
+            "lavfi",
+            "-i",
+            &format!("testsrc=duration={dur}:size={w}x{h}:rate={fps}"),
+            "-f",
+            "lavfi",
+            "-i",
+            &format!("sine=frequency=440:duration={dur}"),
+            "-c:v",
+            "libx264",
+            "-pix_fmt",
+            "yuv420p",
+            "-c:a",
+            "aac",
+            "-shortest",
+            "-y",
+        ])
+        .arg(path)
+        .status()
+        .map(|s| s.success())
+        .unwrap_or(false)
+}
+
+/// True when the file has at least one audio stream (per ffprobe).
+fn has_audio_stream(path: &Path) -> bool {
+    let out = Command::new("ffprobe")
+        .args([
+            "-v",
+            "error",
+            "-select_streams",
+            "a:0",
+            "-show_entries",
+            "stream=codec_type",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+        ])
+        .arg(path)
+        .output();
+    match out {
+        Ok(o) => String::from_utf8_lossy(&o.stdout).trim() == "audio",
+        Err(_) => false,
+    }
+}
+
 /// ffprobe a single stream field as a trimmed string.
 fn probe_field(path: &Path, entry: &str) -> Option<String> {
     let out = Command::new("ffprobe")
@@ -111,6 +163,18 @@ fn build_timeline(frames: i32, src_w: i32, src_h: i32, src_fps: f64) -> Timeline
 
 /// Build a manifest with one external video asset pointing at `media_path`.
 fn build_manifest(media_path: &Path, src_w: i32, src_h: i32, src_fps: f64) -> MediaManifest {
+    build_manifest_with_audio(media_path, src_w, src_h, src_fps, false)
+}
+
+/// Like [`build_manifest`] but lets the test declare whether the asset carries
+/// an audio track (so the export's audio mixdown path is exercised).
+fn build_manifest_with_audio(
+    media_path: &Path,
+    src_w: i32,
+    src_h: i32,
+    src_fps: f64,
+    has_audio: bool,
+) -> MediaManifest {
     let mut manifest = MediaManifest::new();
     manifest.entries.push(MediaManifestEntry {
         id: "asset-1".into(),
@@ -124,7 +188,7 @@ fn build_manifest(media_path: &Path, src_w: i32, src_h: i32, src_fps: f64) -> Me
         source_width: Some(src_w),
         source_height: Some(src_h),
         source_fps: Some(src_fps),
-        has_audio: Some(false),
+        has_audio: Some(has_audio),
         folder_id: None,
         cached_remote_url: None,
         cached_remote_url_expires_at: None,
@@ -204,4 +268,91 @@ fn export_full_timeline_produces_playable_mp4() {
         nframes, frames as u64,
         "encoded frame count matches timeline"
     );
+
+    // The video-only source has no audio track → export stays video-only.
+    assert!(
+        !has_audio_stream(&out),
+        "video-only timeline must not gain an audio stream"
+    );
+}
+
+#[test]
+fn export_with_audio_clip_mux_aac_stream() {
+    if !ffmpeg_ready() {
+        eprintln!("skip: ffmpeg/ffprobe not available");
+        return;
+    }
+
+    let dir = tempfile::tempdir().unwrap();
+    let src = dir.path().join("src_audio.mp4");
+    let out = dir.path().join("out_audio.mp4");
+
+    // Source: 320x240 @ 10fps, 10 frames (1.0s), WITH a 440 Hz sine track.
+    let (sw, sh, sfps, frames) = (320u32, 240u32, 10u32, 10u32);
+    if !make_video_with_audio(&src, sw, sh, sfps, frames) {
+        eprintln!("skip: could not generate audio fixture media");
+        return;
+    }
+    // Sanity: the fixture really has audio (else the assertion below is vacuous).
+    if !has_audio_stream(&src) {
+        eprintln!("skip: fixture lacks an audio stream");
+        return;
+    }
+
+    let timeline = build_timeline(frames as i32, sw as i32, sh as i32, sfps as f64);
+    let manifest = build_manifest_with_audio(&src, sw as i32, sh as i32, sfps as f64, true);
+
+    let req = ExportRequest {
+        out_path: out.to_string_lossy().into_owned(),
+        codec: Default::default(), // H.264 → AAC audio
+        quality: ExportQuality::P720,
+    };
+
+    let summary = match run_export(&timeline, &manifest, &None, &req) {
+        Ok(s) => s,
+        Err(e) => {
+            if e.contains("no GPU device") {
+                eprintln!("skip: no GPU adapter available ({e})");
+                return;
+            }
+            panic!("export failed: {e}");
+        }
+    };
+
+    assert!(out.exists(), "output file should exist");
+    assert_eq!(summary.frame_count, frames as i32);
+
+    // Video stream is still H.264 at the reported size.
+    let vcodec = probe_field(&out, "stream=codec_name").unwrap();
+    assert_eq!(vcodec, "h264", "video codec should be H.264");
+
+    // The mixdown muxed an audio stream into the container.
+    assert!(
+        has_audio_stream(&out),
+        "audio-bearing timeline must produce an audio stream"
+    );
+
+    // The muxed audio codec is AAC (H.264 preset's `-c:a aac`).
+    let acodec = Command::new("ffprobe")
+        .args([
+            "-v",
+            "error",
+            "-select_streams",
+            "a:0",
+            "-show_entries",
+            "stream=codec_name",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+        ])
+        .arg(&out)
+        .output()
+        .ok()
+        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string());
+    assert_eq!(acodec.as_deref(), Some("aac"), "muxed audio should be AAC");
+
+    // The temp mux artifacts are cleaned up (no `.tmp` siblings left behind).
+    let leftover_video = dir.path().join("out_audio.mp4.video.tmp");
+    let leftover_pcm = dir.path().join("out_audio.mp4.pcm.tmp");
+    assert!(!leftover_video.exists(), "video temp should be removed");
+    assert!(!leftover_pcm.exists(), "pcm temp should be removed");
 }