From 2371cd45b5156ab0279b00920596aec58ff5df7e Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kahrendt@gmail.com>
Date: Thu, 11 Jun 2026 09:58:53 -0400
Subject: [PATCH 1/2] Reduce initial playback stutter with extra startup
 silence and underflow keepalive

Two changes give the decode pipeline more slack at stream start:

- Insert extra silence after the first playback notification, before the
  first decoded chunk reaches the sink, so the decoder has time to stay
  ahead. The amount is configurable via
  PlayerRoleConfig::extra_startup_silence_ms (default 50 ms; 0 disables).
- On encoded ring-buffer underflow, feed silence to keep the DAC fed
  instead of letting it run dry. Gated on `aligning` so it only runs
  during startup/post-seek alignment, not in steady state where stuffing
  silence at stream end would pile up in the sink and delay a rapid
  restart.

Adds the frame_aligned_silence_bytes() helper and fill_underflow_silence()
to keep the sync task readable, and documents the behavior in the
integration guide and internals docs.
---
 docs/integration-guide.md |  2 ++
 docs/internals.md         |  4 +--
 include/sendspin/config.h | 10 ++++++
 src/sync_task.cpp         | 65 ++++++++++++++++++++++++++++++++++-----
 src/sync_task.h           |  6 ++++
 5 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/docs/integration-guide.md b/docs/integration-guide.md
index 9f13917..5620341 100644
--- a/docs/integration-guide.md
+++ b/docs/integration-guide.md
@@ -69,6 +69,7 @@ player_config.audio_formats = {
 player_config.audio_buffer_capacity = 1000000;   // Ring buffer size in bytes (default: 1000000)
 player_config.fixed_delay_us = 0;                // Fixed delay offset in microseconds
 player_config.initial_static_delay_ms = 0;       // Initial user-adjustable delay
+player_config.extra_startup_silence_ms = 50;     // Extra startup silence for decode headroom (default: 50)
 
 auto& player = client.add_player(std::move(player_config));
 ```
@@ -720,6 +721,7 @@ Configuration passed to `client.add_player()`.
 | `audio_buffer_capacity` | `size_t` | `1000000` | Internal ring buffer size in bytes. Larger buffers absorb more jitter at the cost of memory. |
 | `fixed_delay_us` | `int32_t` | `0` | Fixed platform-level delay offset in microseconds (e.g., a known I2S pipeline delay). Applied on top of the user-adjustable static delay. |
 | `initial_static_delay_ms` | `uint16_t` | `0` | Initial value for the user-adjustable static delay in milliseconds. Overridden by the persisted value if a `SendspinPersistenceProvider` is set. |
+| `extra_startup_silence_ms` | `uint16_t` | `50` | Extra silence inserted at stream start, after the first playback notification and before the first decoded chunk reaches the sink. Added on top of the initial-sync priming silence to give the decode pipeline more slack to stay ahead of the sink, preventing the initial-playback stutter caused by the decoder briefly falling behind. Larger values trade a longer startup delay for more underflow protection; set to `0` to disable. |
 | `psram_stack` | `bool` | `false` | Allocate sync/decode task stack in PSRAM (ESP-IDF only) |
 | `priority` | `unsigned` | `6` | FreeRTOS priority for the sync/decode task (ESP-IDF only). The default value, `6`, is one above the default `httpd_priority` (`5`). If you customize priorities, keep this above `httpd_priority` so the HTTP server task cannot starve the decoder during the initial burst of encoded audio that fills the buffer at stream start. |
 | `decode_buffer_location` | `MemoryLocation` | `PREFER_EXTERNAL` | Memory placement preference for the decode transfer buffer. `PREFER_EXTERNAL` tries SPIRAM first and falls back to internal RAM; `PREFER_INTERNAL` does the reverse. ESP-IDF only; ignored on host. |
diff --git a/docs/internals.md b/docs/internals.md
index fb3d3f4..bfdfc2d 100644
--- a/docs/internals.md
+++ b/docs/internals.md
@@ -319,9 +319,9 @@ INITIAL_SYNC ──→ LOAD_CHUNK ──→ SYNCHRONIZE_AUDIO ──→ TRANSFER
      └──→ LOAD_CHUNK (once first playback progress callback confirms frames were consumed)
 ```
 
-**INITIAL_SYNC**: Fills the audio pipeline with silence to prime DMA buffers. Sleeps briefly after sending to let the audio stack start consuming.
+**INITIAL_SYNC**: Fills the audio pipeline with silence to prime DMA buffers. Sleeps briefly after sending to let the audio stack start consuming. Once the first playback-progress callback confirms frames were consumed, it queues `extra_startup_silence_ms` of additional silence (see `PlayerRoleConfig`) and drains it before advancing to LOAD_CHUNK. This extra lead gives the decode pipeline slack to stay ahead of the sink at stream start, preventing the initial-playback stutter caused by the decoder briefly falling behind.
 
-**LOAD_CHUNK**: Reads the next encoded chunk from the ring buffer. Waits for time sync if not yet available. Decodes audio via FLAC/Opus/PCM decoder.
+**LOAD_CHUNK**: Reads the next encoded chunk from the ring buffer. Waits for time sync if not yet available. Decodes audio via FLAC/Opus/PCM decoder. On a ring-buffer underflow (no chunk ready) **while still aligning** (startup or post-seek), it feeds silence toward the sink to keep the DAC fed while the decode pipeline catches up, instead of letting it run dry; SYNCHRONIZE_AUDIO then re-aligns the next chunk against wherever the silence carried us. In steady state it does **not** fill — an empty buffer there means the stream is winding down, and stuffing silence would pile up in the sink and delay a rapid restart (a genuine underrun instead surfaces as an error in SYNCHRONIZE_AUDIO).
 
 **SYNCHRONIZE_AUDIO**: Computes the sync error:
 
diff --git a/include/sendspin/config.h b/include/sendspin/config.h
index 37b1fa7..bd90f15 100644
--- a/include/sendspin/config.h
+++ b/include/sendspin/config.h
@@ -120,6 +120,16 @@ struct PlayerRoleConfig {
     size_t audio_buffer_capacity{DEFAULT_AUDIO_BUFFER_CAPACITY};
     int32_t fixed_delay_us{0};
     uint16_t initial_static_delay_ms{0};
+
+    /// @brief Default extra silence (ms) inserted at stream start for decode-pipeline headroom
+    static constexpr uint16_t DEFAULT_EXTRA_STARTUP_SILENCE_MS = 50U;
+
+    /// @brief Extra silence (ms) inserted at stream start, after the first playback notification
+    /// and before the first decoded chunk, on top of the initial-sync priming silence. Gives the
+    /// decode pipeline slack to stay ahead of the sink, preventing the initial-playback stutter.
+    /// Larger values trade longer startup latency for more underflow protection; 0 disables.
+    uint16_t extra_startup_silence_ms{DEFAULT_EXTRA_STARTUP_SILENCE_MS};
+
     bool psram_stack{false};  ///< Allocate sync task stack in PSRAM (ESP-IDF only)
 
     /// @brief Default FreeRTOS priority for the sync/decode task (ESP-IDF only).
diff --git a/src/sync_task.cpp b/src/sync_task.cpp
index b9ceb51..9cf5111 100644
--- a/src/sync_task.cpp
+++ b/src/sync_task.cpp
@@ -48,6 +48,10 @@ static constexpr uint32_t WAIT_FOR_TIME_SYNC_MS = 15U;
 /// @brief Timeout (ms) for receiving the next encoded audio chunk from the ring buffer
 static constexpr uint32_t ENCODED_CHUNK_RECEIVE_TIMEOUT_MS = 15U;
 
+/// @brief Silence (ms) fed to the sink per retry on a ring-buffer underflow, to keep the DAC fed
+/// instead of running dry. Larger than the load-wait timeout so each retry outpaces the drain.
+static constexpr uint32_t UNDERFLOW_SILENCE_KEEPALIVE_MS = 20U;
+
 /// @brief Timeout (ms) for on_audio_write pushes; bounds how long the sync task blocks on the
 /// sink before returning to its inner loop to re-check flags and drift.
 static constexpr uint32_t AUDIO_WRITE_TIMEOUT_MS = 20U;
@@ -68,6 +72,15 @@ static constexpr size_t SILENCE_SCRATCH_BYTES = 1024;
 /// flood. .bss costs no heap and no flash; it is reserved and zeroed once at startup.
 static uint8_t silence_scratch[SILENCE_SCRATCH_BYTES] = {};
 
+/// @brief Byte count for `duration_ms` of silence, rounded down to whole frames so per-write chunks
+/// and track_sent_audio() accounting stay on frame boundaries (the ms->bytes result need not
+/// align).
+static size_t frame_aligned_silence_bytes(const AudioStreamInfo& stream_info,
+                                          uint32_t duration_ms) {
+    return stream_info.frames_to_bytes(
+        stream_info.bytes_to_frames(stream_info.ms_to_bytes(duration_ms)));
+}
+
 static const char* const TAG = "sendspin.sync_task";
 
 // ============================================================================
@@ -167,18 +180,19 @@ void SyncTask::notify_audio_played(uint32_t frames, int64_t timestamp) {
 
 SyncTaskState SyncTask::handle_initial_sync(SyncContext& sync_context) {
     if (!sync_context.initial_decode) {
-        // Priming done (the audio stack has started consuming) - drop any leftover priming silence
-        // so it is not injected before the first real chunk.
-        sync_context.silence_remaining = 0;
+        // Priming done. process_playback_progress() queued the extra startup silence on the first
+        // playback notification; drain it before the first real chunk so the decode pipeline has
+        // slack to stay ahead of the sink.
+        this->send_pending_silence(sync_context);
+        if (sync_context.silence_remaining > 0) {
+            return SyncTaskState::INITIAL_SYNC;
+        }
         return SyncTaskState::LOAD_CHUNK;
     }
 
     if (sync_context.silence_remaining == 0) {
-        // Keep the silence run frame-aligned so per-write chunks (and the playtime accounting in
-        // track_sent_audio) land on whole frames.
-        sync_context.silence_remaining = sync_context.current_stream_info.frames_to_bytes(
-            sync_context.current_stream_info.bytes_to_frames(
-                sync_context.current_stream_info.ms_to_bytes(INITIAL_SYNC_ZEROS_DURATION_MS)));
+        sync_context.silence_remaining = frame_aligned_silence_bytes(
+            sync_context.current_stream_info, INITIAL_SYNC_ZEROS_DURATION_MS);
     }
     this->send_pending_silence(sync_context);
 
@@ -194,6 +208,13 @@ SyncTaskState SyncTask::handle_load_chunk(SyncContext& sync_context) {
         return SyncTaskState::LOAD_CHUNK;
     }
     if (!this->load_next_chunk(sync_context)) {
+        // Bridge underflows with silence only while aligning (startup/post-seek). In steady state
+        // an empty buffer means the stream is winding down; stuffing silence would pile up in the
+        // sink and delay a rapid restart (and a real underrun is better surfaced as an error than
+        // masked).
+        if (sync_context.aligning) {
+            this->fill_underflow_silence(sync_context);
+        }
         return SyncTaskState::LOAD_CHUNK;
     }
     DecodeResult decode_result = this->decode_chunk(sync_context);
@@ -360,6 +381,28 @@ void SyncTask::send_pending_silence(SyncContext& sync_context) {
     }
 }
 
+void SyncTask::fill_underflow_silence(SyncContext& sync_context) {
+    // Bridge a startup/post-seek underflow: keep the sink fed with silence until the next chunk
+    // arrives instead of spinning and draining the DAC. Feeding silence advances
+    // new_audio_client_playtime, so handle_synchronize_audio() re-aligns the next chunk against it
+    // once one arrives.
+    if (this->player_impl_->listener == nullptr) {
+        return;
+    }
+    if (sync_context.silence_remaining == 0) {
+        sync_context.silence_remaining = frame_aligned_silence_bytes(
+            sync_context.current_stream_info, UNDERFLOW_SILENCE_KEEPALIVE_MS);
+    }
+    // Drain the window block by block; send_pending_silence() blocks on sink backpressure, and we
+    // bail the instant a chunk lands or a lifecycle command fires.
+    while (
+        (sync_context.silence_remaining > 0) &&
+        (this->encoded_ring_buffer_->chunks_waiting() == 0) &&
+        !(this->event_flags_.get() & (COMMAND_STOP | COMMAND_STREAM_END | COMMAND_STREAM_CLEAR))) {
+        this->send_pending_silence(sync_context);
+    }
+}
+
 bool SyncTask::transfer_audio(SyncContext& sync_context) {
     // Pending silence (priming or hard-sync gap fill) goes out before the decoded chunk
     this->send_pending_silence(sync_context);
@@ -703,7 +746,13 @@ void SyncTask::process_playback_progress(SyncContext& sync_context) {
         uint32_t frames_played = playback_progress.frames_played;
 
         if (sync_context.initial_decode && frames_played) {
+            // First audio reached the sink. Queue the extra startup silence (replacing any unsent
+            // priming silence) for decode-pipeline slack before the sink drains;
+            // handle_initial_sync() drains it before the first real chunk.
             sync_context.initial_decode = false;
+            sync_context.silence_remaining =
+                frame_aligned_silence_bytes(sync_context.current_stream_info,
+                                            this->player_impl_->config.extra_startup_silence_ms);
         }
 
         if (frames_played > sync_context.buffered_frames) {
diff --git a/src/sync_task.h b/src/sync_task.h
index 6bf0cb6..1d76413 100644
--- a/src/sync_task.h
+++ b/src/sync_task.h
@@ -205,6 +205,12 @@ class SyncTask {
     /// the sink and updates the playtime estimate. No-op when no silence is pending.
     void send_pending_silence(SyncContext& sync_context);
 
+    /// @brief Bridges an encoded-chunk underflow while aligning (startup/post-seek) by feeding the
+    /// sink silence (up to UNDERFLOW_SILENCE_KEEPALIVE_MS) instead of letting the DAC run dry,
+    /// bailing the instant a chunk lands or a lifecycle command fires. Only called while aligning;
+    /// see handle_load_chunk().
+    void fill_underflow_silence(SyncContext& sync_context);
+
     /// @brief Transfers pending silence (if any) then the decoded chunk to the sink
     /// Returns true when all data has been sent, false if more transfers are needed.
     bool transfer_audio(SyncContext& sync_context);

From 2522a8f1f0417ba56f63eaaaa43c9207cdc34543 Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kahrendt@gmail.com>
Date: Thu, 11 Jun 2026 10:51:35 -0400
Subject: [PATCH 2/2] Rephrase two code comments for clarity

---
 src/sync_task.cpp | 10 ++++++----
 src/sync_task.h   |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/sync_task.cpp b/src/sync_task.cpp
index 9cf5111..967dcb1 100644
--- a/src/sync_task.cpp
+++ b/src/sync_task.cpp
@@ -48,8 +48,9 @@ static constexpr uint32_t WAIT_FOR_TIME_SYNC_MS = 15U;
 /// @brief Timeout (ms) for receiving the next encoded audio chunk from the ring buffer
 static constexpr uint32_t ENCODED_CHUNK_RECEIVE_TIMEOUT_MS = 15U;
 
-/// @brief Silence (ms) fed to the sink per retry on a ring-buffer underflow, to keep the DAC fed
-/// instead of running dry. Larger than the load-wait timeout so each retry outpaces the drain.
+/// @brief Silence (ms) queued per encoded-chunk underflow to keep the DAC fed between chunks. A bit
+/// above ENCODED_CHUNK_RECEIVE_TIMEOUT_MS so it spans one more load wait, though not a strict
+/// bound: the fill bails as soon as a chunk lands and is paced by sink backpressure.
 static constexpr uint32_t UNDERFLOW_SILENCE_KEEPALIVE_MS = 20U;
 
 /// @brief Timeout (ms) for on_audio_write pushes; bounds how long the sync task blocks on the
@@ -393,8 +394,9 @@ void SyncTask::fill_underflow_silence(SyncContext& sync_context) {
         sync_context.silence_remaining = frame_aligned_silence_bytes(
             sync_context.current_stream_info, UNDERFLOW_SILENCE_KEEPALIVE_MS);
     }
-    // Drain the window block by block; send_pending_silence() blocks on sink backpressure, and we
-    // bail the instant a chunk lands or a lifecycle command fires.
+    // Drain the window block by block; send_pending_silence() blocks on sink backpressure. The loop
+    // re-checks between blocks, so it stops after the current write once a chunk lands or a
+    // lifecycle command fires.
     while (
         (sync_context.silence_remaining > 0) &&
         (this->encoded_ring_buffer_->chunks_waiting() == 0) &&
diff --git a/src/sync_task.h b/src/sync_task.h
index 1d76413..e07d4ed 100644
--- a/src/sync_task.h
+++ b/src/sync_task.h
@@ -207,8 +207,8 @@ class SyncTask {
 
     /// @brief Bridges an encoded-chunk underflow while aligning (startup/post-seek) by feeding the
     /// sink silence (up to UNDERFLOW_SILENCE_KEEPALIVE_MS) instead of letting the DAC run dry,
-    /// bailing the instant a chunk lands or a lifecycle command fires. Only called while aligning;
-    /// see handle_load_chunk().
+    /// stopping after the current silence write once a chunk lands or a lifecycle command fires.
+    /// Only called while aligning; see handle_load_chunk().
     void fill_underflow_silence(SyncContext& sync_context);
 
     /// @brief Transfers pending silence (if any) then the decoded chunk to the sink