diff --git a/extras/speaker-recognition/.env.template b/extras/speaker-recognition/.env.template index 63da518e..bc264b44 100644 --- a/extras/speaker-recognition/.env.template +++ b/extras/speaker-recognition/.env.template @@ -19,7 +19,8 @@ COMPUTE_MODE=cpu SIMILARITY_THRESHOLD=0.15 # Service Configuration -SPEAKER_SERVICE_HOST=speaker-service +# SPEAKER_SERVICE_HOST: Interface to bind to (0.0.0.0 = all interfaces, allows cross-network access) +SPEAKER_SERVICE_HOST=0.0.0.0 SPEAKER_SERVICE_PORT=8085 SPEAKER_SERVICE_URL=http://speaker-service:8085 diff --git a/extras/speaker-recognition/src/simple_speaker_recognition/core/audio_backend.py b/extras/speaker-recognition/src/simple_speaker_recognition/core/audio_backend.py index f6e50d0a..040c8ac8 100644 --- a/extras/speaker-recognition/src/simple_speaker_recognition/core/audio_backend.py +++ b/extras/speaker-recognition/src/simple_speaker_recognition/core/audio_backend.py @@ -77,10 +77,20 @@ def diarize(self, path: Path, min_speakers: Optional[int] = None, max_speakers: kwargs['min_speakers'] = min_speakers if max_speakers is not None: kwargs['max_speakers'] = max_speakers - - diarization = self.diar(str(path), **kwargs) - logger.info(f"Diarization: {diarization}") - + + output = self.diar(str(path), **kwargs) + logger.info(f"Diarization output: {output}") + + # In pyannote.audio 4.0+, the pipeline returns a DiarizeOutput object + # We need to access .speaker_diarization to get the Annotation object + if hasattr(output, 'speaker_diarization'): + diarization = output.speaker_diarization + logger.info(f"Using speaker_diarization from output (pyannote 4.0+)") + else: + # Fallback for older versions (3.x) that return Annotation directly + diarization = output + logger.info(f"Using output directly as Annotation (pyannote 3.x)") + # Apply PyAnnote's built-in gap filling using support() method with configurable collar # This fills gaps shorter than collar seconds between segments from same speaker diarization = diarization.support(collar=collar)