diff --git a/extras/havpe-relay/firmware/voice-chronicle.yaml b/extras/havpe-relay/firmware/voice-chronicle.yaml index 387af022..c1140331 100644 --- a/extras/havpe-relay/firmware/voice-chronicle.yaml +++ b/extras/havpe-relay/firmware/voice-chronicle.yaml @@ -322,8 +322,7 @@ voice_kit: id: voice_kit_component i2c_id: internal_i2c reset_pin: GPIO4 - channel_0_stage: AGC # Speech-ready output (AEC→IC→NS→AGC) - channel_1_stage: NS # Pre-AGC tap (useful for wake word or analysis) + # No channel_0_stage/channel_1_stage — use XMOS defaults (matches official HA firmware) firmware: url: https://github.com/esphome/voice-kit-xmos-firmware/releases/download/v1.3.1/ffva_v1.3.1_upgrade.bin version: "1.3.1" @@ -361,20 +360,16 @@ i2s_audio: number: GPIO13 # ─────────── Microphone (TCP streaming to relay) ─────────── -# The XMOS voice kit outputs stereo 32-bit I2S (secondary mode). Channel 0 -# (right) carries the fully-processed speech signal (AEC→IC→NS→AGC), while -# channel 1 (left) is a pre-AGC tap useful for wake-word engines. +# The XMOS voice kit outputs stereo 32-bit I2S (secondary mode). +# ESPHome interleaves stereo as [L, R, L, R, ...] where channel 0 = left (even) +# and channel 1 = right (odd). +# Source: esphome/components/microphone/microphone_source.h +# https://github.com/esphome/esphome/blob/dev/esphome/components/microphone/microphone_source.h # -# We match the official HA Voice PE microphone config (stereo, 32-bit) and -# extract+downconvert in the on_data lambda before sending over TCP. 
-# -# References: -# Official Voice PE config: -# https://github.com/esphome/home-assistant-voice-pe/blob/dev/home-assistant-voice.yaml -# XMOS voice kit audio pipeline: -# https://deepwiki.com/esphome/home-assistant-voice-pe/3.2-audio-processing-pipeline -# I2S microphone component docs: -# https://esphome.io/components/microphone/i2s_audio/ +# The official HA Voice PE firmware uses channels: 0 (left) for voice assistant +# and channels: 1 (right) with gain_factor: 4 for wake word. +# Source: home-assistant-voice.yaml +# https://github.com/esphome/home-assistant-voice-pe/blob/dev/home-assistant-voice.yaml microphone: - platform: i2s_audio id: mic_in @@ -417,10 +412,8 @@ microphone: } // Extract channel 0 (AGC-processed speech) and convert 32→16 bit. - // Input: interleaved stereo 32-bit samples [L0, R0, L1, R1, ...] - // XMOS channel 0 (AGC) maps to right (odd indices), - // channel 1 (NS/pre-AGC) maps to left (even indices). - // Output: mono 16-bit samples + // ESPHome stereo interleave: channel 0 = left (even), channel 1 = right (odd). + // XMOS voice_kit stages are left at firmware defaults (no channel_0_stage/channel_1_stage override); this assumes the default still puts the AGC output on channel 0 → left — TODO confirm. const int32_t* samples = reinterpret_cast<const int32_t*>(x.data()); size_t total_samples = x.size() / sizeof(int32_t); // total L+R samples size_t num_frames = total_samples / 2; // stereo frames @@ -432,11 +425,8 @@ microphone: } for (size_t i = 0; i < num_frames; i++) { - int32_t right = samples[i * 2 + 1]; // channel 0 (right) = AGC output - int32_t s16 = right >> 14; // 32→16 bit with 4x gain (>> 14 vs >> 16) - if (s16 > 32767) s16 = 32767; - if (s16 < -32768) s16 = -32768; - mono_buf[i] = (int16_t)s16; + int32_t left = samples[i * 2]; // channel 0 (left) = AGC output + mono_buf[i] = (int16_t)(left >> 16); // 32→16 bit: take upper 16 bits } size_t mono_bytes = num_frames * sizeof(int16_t);