From 2371cd45b5156ab0279b00920596aec58ff5df7e Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 11 Jun 2026 09:58:53 -0400 Subject: [PATCH 1/2] Reduce initial playback stutter with extra startup silence and underflow keepalive Two changes give the decode pipeline more slack at stream start: - Insert extra silence after the first playback notification, before the first decoded chunk reaches the sink, so the decoder has time to stay ahead. The amount is configurable via PlayerRoleConfig::extra_startup_silence_ms (default 50 ms; 0 disables). - On encoded ring-buffer underflow, feed silence to keep the DAC fed instead of letting it run dry. Gated on `aligning` so it only runs during startup/post-seek alignment, not in steady state where stuffing silence at stream end would pile up in the sink and delay a rapid restart. Adds the frame_aligned_silence_bytes() helper and fill_underflow_silence() to keep the sync task readable, and documents the behavior in the integration guide and internals docs. --- docs/integration-guide.md | 2 ++ docs/internals.md | 4 +-- include/sendspin/config.h | 10 ++++++ src/sync_task.cpp | 65 ++++++++++++++++++++++++++++++++++----- src/sync_task.h | 6 ++++ 5 files changed, 77 insertions(+), 10 deletions(-) diff --git a/docs/integration-guide.md b/docs/integration-guide.md index 9f13917..5620341 100644 --- a/docs/integration-guide.md +++ b/docs/integration-guide.md @@ -69,6 +69,7 @@ player_config.audio_formats = { player_config.audio_buffer_capacity = 1000000; // Ring buffer size in bytes (default: 1000000) player_config.fixed_delay_us = 0; // Fixed delay offset in microseconds player_config.initial_static_delay_ms = 0; // Initial user-adjustable delay +player_config.extra_startup_silence_ms = 50; // Extra startup silence for decode headroom (default: 50) auto& player = client.add_player(std::move(player_config)); ``` @@ -720,6 +721,7 @@ Configuration passed to `client.add_player()`. 
| `audio_buffer_capacity` | `size_t` | `1000000` | Internal ring buffer size in bytes. Larger buffers absorb more jitter at the cost of memory. | | `fixed_delay_us` | `int32_t` | `0` | Fixed platform-level delay offset in microseconds (e.g., a known I2S pipeline delay). Applied on top of the user-adjustable static delay. | | `initial_static_delay_ms` | `uint16_t` | `0` | Initial value for the user-adjustable static delay in milliseconds. Overridden by the persisted value if a `SendspinPersistenceProvider` is set. | +| `extra_startup_silence_ms` | `uint16_t` | `50` | Extra silence inserted at stream start, after the first playback notification and before the first decoded chunk reaches the sink. Added on top of the initial-sync priming silence to give the decode pipeline more slack to stay ahead of the sink, preventing the initial-playback stutter caused by the decoder briefly falling behind. Larger values trade a longer startup delay for more underflow protection; set to `0` to disable. | | `psram_stack` | `bool` | `false` | Allocate sync/decode task stack in PSRAM (ESP-IDF only) | | `priority` | `unsigned` | `6` | FreeRTOS priority for the sync/decode task (ESP-IDF only). The default value, `6`, is one above the default `httpd_priority` (`5`). If you customize priorities, keep this above `httpd_priority` so the HTTP server task cannot starve the decoder during the initial burst of encoded audio that fills the buffer at stream start. | | `decode_buffer_location` | `MemoryLocation` | `PREFER_EXTERNAL` | Memory placement preference for the decode transfer buffer. `PREFER_EXTERNAL` tries SPIRAM first and falls back to internal RAM; `PREFER_INTERNAL` does the reverse. ESP-IDF only; ignored on host. 
| diff --git a/docs/internals.md b/docs/internals.md index fb3d3f4..bfdfc2d 100644 --- a/docs/internals.md +++ b/docs/internals.md @@ -319,9 +319,9 @@ INITIAL_SYNC ──→ LOAD_CHUNK ──→ SYNCHRONIZE_AUDIO ──→ TRANSFER └──→ LOAD_CHUNK (once first playback progress callback confirms frames were consumed) ``` -**INITIAL_SYNC**: Fills the audio pipeline with silence to prime DMA buffers. Sleeps briefly after sending to let the audio stack start consuming. +**INITIAL_SYNC**: Fills the audio pipeline with silence to prime DMA buffers. Sleeps briefly after sending to let the audio stack start consuming. Once the first playback-progress callback confirms frames were consumed, it queues `extra_startup_silence_ms` of additional silence (see `PlayerRoleConfig`) and drains it before advancing to LOAD_CHUNK. This extra lead gives the decode pipeline slack to stay ahead of the sink at stream start, preventing the initial-playback stutter caused by the decoder briefly falling behind. -**LOAD_CHUNK**: Reads the next encoded chunk from the ring buffer. Waits for time sync if not yet available. Decodes audio via FLAC/Opus/PCM decoder. +**LOAD_CHUNK**: Reads the next encoded chunk from the ring buffer. Waits for time sync if not yet available. Decodes audio via FLAC/Opus/PCM decoder. On a ring-buffer underflow (no chunk ready) **while still aligning** (startup or post-seek), it feeds silence toward the sink to keep the DAC fed while the decode pipeline catches up, instead of letting it run dry; SYNCHRONIZE_AUDIO then re-aligns the next chunk against wherever the silence carried us. In steady state it does **not** fill — an empty buffer there means the stream is winding down, and stuffing silence would pile up in the sink and delay a rapid restart (a genuine underrun instead surfaces as an error in SYNCHRONIZE_AUDIO). 
**SYNCHRONIZE_AUDIO**: Computes the sync error: diff --git a/include/sendspin/config.h b/include/sendspin/config.h index 37b1fa7..bd90f15 100644 --- a/include/sendspin/config.h +++ b/include/sendspin/config.h @@ -120,6 +120,16 @@ struct PlayerRoleConfig { size_t audio_buffer_capacity{DEFAULT_AUDIO_BUFFER_CAPACITY}; int32_t fixed_delay_us{0}; uint16_t initial_static_delay_ms{0}; + + /// @brief Default extra silence (ms) inserted at stream start for decode-pipeline headroom + static constexpr uint16_t DEFAULT_EXTRA_STARTUP_SILENCE_MS = 50U; + + /// @brief Extra silence (ms) inserted at stream start, after the first playback notification + /// and before the first decoded chunk, on top of the initial-sync priming silence. Gives the + /// decode pipeline slack to stay ahead of the sink, preventing the initial-playback stutter. + /// Larger values trade longer startup latency for more underflow protection; 0 disables. + uint16_t extra_startup_silence_ms{DEFAULT_EXTRA_STARTUP_SILENCE_MS}; + bool psram_stack{false}; ///< Allocate sync task stack in PSRAM (ESP-IDF only) /// @brief Default FreeRTOS priority for the sync/decode task (ESP-IDF only). diff --git a/src/sync_task.cpp b/src/sync_task.cpp index b9ceb51..9cf5111 100644 --- a/src/sync_task.cpp +++ b/src/sync_task.cpp @@ -48,6 +48,10 @@ static constexpr uint32_t WAIT_FOR_TIME_SYNC_MS = 15U; /// @brief Timeout (ms) for receiving the next encoded audio chunk from the ring buffer static constexpr uint32_t ENCODED_CHUNK_RECEIVE_TIMEOUT_MS = 15U; +/// @brief Silence (ms) fed to the sink per retry on a ring-buffer underflow, to keep the DAC fed +/// instead of running dry. Larger than the load-wait timeout so each retry outpaces the drain. +static constexpr uint32_t UNDERFLOW_SILENCE_KEEPALIVE_MS = 20U; + /// @brief Timeout (ms) for on_audio_write pushes; bounds how long the sync task blocks on the /// sink before returning to its inner loop to re-check flags and drift. 
static constexpr uint32_t AUDIO_WRITE_TIMEOUT_MS = 20U; @@ -68,6 +72,15 @@ static constexpr size_t SILENCE_SCRATCH_BYTES = 1024; /// flood. .bss costs no heap and no flash; it is reserved and zeroed once at startup. static uint8_t silence_scratch[SILENCE_SCRATCH_BYTES] = {}; +/// @brief Byte count for `duration_ms` of silence, rounded down to whole frames so per-write chunks +/// and track_sent_audio() accounting stay on frame boundaries (the ms->bytes result need not +/// align). +static size_t frame_aligned_silence_bytes(const AudioStreamInfo& stream_info, + uint32_t duration_ms) { + return stream_info.frames_to_bytes( + stream_info.bytes_to_frames(stream_info.ms_to_bytes(duration_ms))); +} + static const char* const TAG = "sendspin.sync_task"; // ============================================================================ @@ -167,18 +180,19 @@ void SyncTask::notify_audio_played(uint32_t frames, int64_t timestamp) { SyncTaskState SyncTask::handle_initial_sync(SyncContext& sync_context) { if (!sync_context.initial_decode) { - // Priming done (the audio stack has started consuming) - drop any leftover priming silence - // so it is not injected before the first real chunk. - sync_context.silence_remaining = 0; + // Priming done. process_playback_progress() queued the extra startup silence on the first + // playback notification; drain it before the first real chunk so the decode pipeline has + // slack to stay ahead of the sink. + this->send_pending_silence(sync_context); + if (sync_context.silence_remaining > 0) { + return SyncTaskState::INITIAL_SYNC; + } return SyncTaskState::LOAD_CHUNK; } if (sync_context.silence_remaining == 0) { - // Keep the silence run frame-aligned so per-write chunks (and the playtime accounting in - // track_sent_audio) land on whole frames. 
- sync_context.silence_remaining = sync_context.current_stream_info.frames_to_bytes( - sync_context.current_stream_info.bytes_to_frames( - sync_context.current_stream_info.ms_to_bytes(INITIAL_SYNC_ZEROS_DURATION_MS))); + sync_context.silence_remaining = frame_aligned_silence_bytes( + sync_context.current_stream_info, INITIAL_SYNC_ZEROS_DURATION_MS); } this->send_pending_silence(sync_context); @@ -194,6 +208,13 @@ SyncTaskState SyncTask::handle_load_chunk(SyncContext& sync_context) { return SyncTaskState::LOAD_CHUNK; } if (!this->load_next_chunk(sync_context)) { + // Bridge underflows with silence only while aligning (startup/post-seek). In steady state + // an empty buffer means the stream is winding down; stuffing silence would pile up in the + // sink and delay a rapid restart (and a real underrun is better surfaced as an error than + // masked). + if (sync_context.aligning) { + this->fill_underflow_silence(sync_context); + } return SyncTaskState::LOAD_CHUNK; } DecodeResult decode_result = this->decode_chunk(sync_context); @@ -360,6 +381,28 @@ void SyncTask::send_pending_silence(SyncContext& sync_context) { } } +void SyncTask::fill_underflow_silence(SyncContext& sync_context) { + // Bridge a startup/post-seek underflow: keep the sink fed with silence until the next chunk + // arrives instead of spinning and draining the DAC. Feeding silence advances + // new_audio_client_playtime, so handle_synchronize_audio() re-aligns the next chunk against it + // once one arrives. + if (this->player_impl_->listener == nullptr) { + return; + } + if (sync_context.silence_remaining == 0) { + sync_context.silence_remaining = frame_aligned_silence_bytes( + sync_context.current_stream_info, UNDERFLOW_SILENCE_KEEPALIVE_MS); + } + // Drain the window block by block; send_pending_silence() blocks on sink backpressure, and we + // bail the instant a chunk lands or a lifecycle command fires. 
+ while ( + (sync_context.silence_remaining > 0) && + (this->encoded_ring_buffer_->chunks_waiting() == 0) && + !(this->event_flags_.get() & (COMMAND_STOP | COMMAND_STREAM_END | COMMAND_STREAM_CLEAR))) { + this->send_pending_silence(sync_context); + } +} + bool SyncTask::transfer_audio(SyncContext& sync_context) { // Pending silence (priming or hard-sync gap fill) goes out before the decoded chunk this->send_pending_silence(sync_context); @@ -703,7 +746,13 @@ void SyncTask::process_playback_progress(SyncContext& sync_context) { uint32_t frames_played = playback_progress.frames_played; if (sync_context.initial_decode && frames_played) { + // First audio reached the sink. Queue the extra startup silence (replacing any unsent + // priming silence) for decode-pipeline slack before the sink drains; + // handle_initial_sync() drains it before the first real chunk. sync_context.initial_decode = false; + sync_context.silence_remaining = + frame_aligned_silence_bytes(sync_context.current_stream_info, + this->player_impl_->config.extra_startup_silence_ms); } if (frames_played > sync_context.buffered_frames) { diff --git a/src/sync_task.h b/src/sync_task.h index 6bf0cb6..1d76413 100644 --- a/src/sync_task.h +++ b/src/sync_task.h @@ -205,6 +205,12 @@ class SyncTask { /// the sink and updates the playtime estimate. No-op when no silence is pending. void send_pending_silence(SyncContext& sync_context); + /// @brief Bridges an encoded-chunk underflow while aligning (startup/post-seek) by feeding the + /// sink silence (up to UNDERFLOW_SILENCE_KEEPALIVE_MS) instead of letting the DAC run dry, + /// bailing the instant a chunk lands or a lifecycle command fires. Only called while aligning; + /// see handle_load_chunk(). + void fill_underflow_silence(SyncContext& sync_context); + /// @brief Transfers pending silence (if any) then the decoded chunk to the sink /// Returns true when all data has been sent, false if more transfers are needed. 
bool transfer_audio(SyncContext& sync_context); From 2522a8f1f0417ba56f63eaaaa43c9207cdc34543 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 11 Jun 2026 10:51:35 -0400 Subject: [PATCH 2/2] Rephrase two code comments for clarity --- src/sync_task.cpp | 10 ++++++---- src/sync_task.h | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/sync_task.cpp b/src/sync_task.cpp index 9cf5111..967dcb1 100644 --- a/src/sync_task.cpp +++ b/src/sync_task.cpp @@ -48,8 +48,9 @@ static constexpr uint32_t WAIT_FOR_TIME_SYNC_MS = 15U; /// @brief Timeout (ms) for receiving the next encoded audio chunk from the ring buffer static constexpr uint32_t ENCODED_CHUNK_RECEIVE_TIMEOUT_MS = 15U; -/// @brief Silence (ms) fed to the sink per retry on a ring-buffer underflow, to keep the DAC fed -/// instead of running dry. Larger than the load-wait timeout so each retry outpaces the drain. +/// @brief Silence (ms) queued per encoded-chunk underflow to keep the DAC fed between chunks. A bit +/// above ENCODED_CHUNK_RECEIVE_TIMEOUT_MS so it spans one more load wait, though not a strict +/// bound: the fill bails as soon as a chunk lands and is paced by sink backpressure. static constexpr uint32_t UNDERFLOW_SILENCE_KEEPALIVE_MS = 20U; /// @brief Timeout (ms) for on_audio_write pushes; bounds how long the sync task blocks on the @@ -393,8 +394,9 @@ void SyncTask::fill_underflow_silence(SyncContext& sync_context) { sync_context.silence_remaining = frame_aligned_silence_bytes( sync_context.current_stream_info, UNDERFLOW_SILENCE_KEEPALIVE_MS); } - // Drain the window block by block; send_pending_silence() blocks on sink backpressure, and we - // bail the instant a chunk lands or a lifecycle command fires. + // Drain the window block by block; send_pending_silence() blocks on sink backpressure. The loop + // re-checks between blocks, so it stops after the current write once a chunk lands or a + // lifecycle command fires. 
while ( (sync_context.silence_remaining > 0) && (this->encoded_ring_buffer_->chunks_waiting() == 0) && diff --git a/src/sync_task.h b/src/sync_task.h index 1d76413..e07d4ed 100644 --- a/src/sync_task.h +++ b/src/sync_task.h @@ -207,8 +207,8 @@ class SyncTask { /// @brief Bridges an encoded-chunk underflow while aligning (startup/post-seek) by feeding the /// sink silence (up to UNDERFLOW_SILENCE_KEEPALIVE_MS) instead of letting the DAC run dry, - /// bailing the instant a chunk lands or a lifecycle command fires. Only called while aligning; - /// see handle_load_chunk(). + /// stopping after the current silence write once a chunk lands or a lifecycle command fires. + /// Only called while aligning; see handle_load_chunk(). void fill_underflow_silence(SyncContext& sync_context); /// @brief Transfers pending silence (if any) then the decoded chunk to the sink