From 316ac2ba550f786d67161669550a6e05003c9c23 Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:31:23 +0000 Subject: [PATCH] eleven labs support --- CLAUDE.md | 7 +- backends/advanced/.env.template | 8 +- .../advanced/Docs/elevenlabs-integration.md | 920 ++++++++++++++++++ backends/advanced/init.py | 27 +- .../models/transcription.py | 2 +- .../services/transcription/__init__.py | 15 + .../services/transcription/elevenlabs.py | 173 ++++ extras/speaker-recognition/.env.template | 1 + extras/speaker-recognition/init.py | 10 + .../api/routers/__init__.py | 6 +- .../api/routers/elevenlabs_wrapper.py | 391 ++++++++ .../simple_speaker_recognition/api/service.py | 9 +- .../utils/elevenlabs_parser.py | 221 +++++ .../src/components/ProcessingModeSelector.tsx | 23 +- .../webui/src/services/elevenlabs.ts | 185 ++++ .../src/services/speakerIdentification.ts | 66 +- wizard.py | 6 + 17 files changed, 2037 insertions(+), 33 deletions(-) create mode 100644 backends/advanced/Docs/elevenlabs-integration.md create mode 100644 backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py create mode 100644 extras/speaker-recognition/src/simple_speaker_recognition/api/routers/elevenlabs_wrapper.py create mode 100644 extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py create mode 100644 extras/speaker-recognition/webui/src/services/elevenlabs.ts diff --git a/CLAUDE.md b/CLAUDE.md index 9c06c934..76dca3dd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -286,10 +286,9 @@ Friend-Lite supports multiple transcription services: TRANSCRIPTION_PROVIDER=deepgram DEEPGRAM_API_KEY=your-deepgram-key-here -# Option 2: Mistral (Voxtral models) -TRANSCRIPTION_PROVIDER=mistral -MISTRAL_API_KEY=your-mistral-key-here -MISTRAL_MODEL=voxtral-mini-2507 +# Option 2: ElevenLabs (99 languages, speaker diarization) +TRANSCRIPTION_PROVIDER=elevenlabs +ELEVENLABS_API_KEY=your-elevenlabs-key-here # 
Option 3: Local ASR (Parakeet) PARAKEET_ASR_URL=http://host.docker.internal:8767 diff --git a/backends/advanced/.env.template b/backends/advanced/.env.template index 44a88de6..afd04f56 100644 --- a/backends/advanced/.env.template +++ b/backends/advanced/.env.template @@ -50,10 +50,14 @@ OPENAI_MODEL=gpt-4o-mini # Option 1: Deepgram (recommended for best transcription quality) DEEPGRAM_API_KEY= -# Option 2: Parakeet ASR service from extras/asr-services +# Option 2: ElevenLabs (high quality with 99 language support) +# Get your API key from: https://elevenlabs.io/app/settings/api-keys +# ELEVENLABS_API_KEY= + +# Option 3: Parakeet ASR service from extras/asr-services # PARAKEET_ASR_URL=http://host.docker.internal:8767 -# Optional: Specify which provider to use ('deepgram' or 'parakeet') +# Optional: Specify which provider to use ('deepgram', 'elevenlabs', or 'parakeet') # If not set, will auto-select based on available configuration (Deepgram preferred) # TRANSCRIPTION_PROVIDER= diff --git a/backends/advanced/Docs/elevenlabs-integration.md b/backends/advanced/Docs/elevenlabs-integration.md new file mode 100644 index 00000000..a927edec --- /dev/null +++ b/backends/advanced/Docs/elevenlabs-integration.md @@ -0,0 +1,920 @@ +# ElevenLabs Speech-to-Text Integration Guide + +## Overview + +This document outlines the integration of ElevenLabs Speech-to-Text (Scribe v1 model) as a transcription provider for Friend-Lite. + +## ElevenLabs Capabilities + +### Core Features +- **Model**: Scribe v1 with state-of-the-art accuracy +- **API Endpoint**: `https://api.elevenlabs.io/v1/speech-to-text` +- **Authentication**: API key via `xi-api-key` header +- **Languages**: 99 languages with automatic detection +- **Speaker Diarization**: Up to 32 speakers +- **Word-Level Timestamps**: Precise timing for each word +- **Audio Events**: Optional detection of laughter, applause, etc. 
+ +### Technical Specifications +- **Mode**: Batch processing only (no streaming support) +- **Format**: Multipart/form-data file upload +- **Max File Size**: 3 GB +- **Max Duration**: 10 hours +- **Supported Formats**: 18+ audio formats (AAC, MP3, WAV, FLAC, Opus, WebM, etc.) + +### Output Format +```json +{ + "text": "Full transcript text", + "language_code": "en", + "language_probability": 0.95, + "words": [ + { + "text": "word", + "start": 0.5, + "end": 1.2, + "type": "word", + "speaker_id": "speaker_1", + "logprob": -0.05 + } + ] +} +``` + +## Pricing + +| Tier | Price/Month | Hours Included | Cost per Hour | +|------|-------------|----------------|---------------| +| Starter | $5 | 12.5 | $0.40 | +| Creator | $22 | 62.85 | $0.35 | +| Pro | $99 | 300 | $0.33 | +| Scale | $330 | 1,100 | $0.30 | + +**Comparison**: Deepgram Nova-3 costs ~$0.36/hour (pay-as-you-go) + +## Integration Architecture + +### Provider Implementation + +Create `backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py`: + +```python +""" +ElevenLabs transcription provider implementation. + +Provides batch transcription using ElevenLabs Scribe v1 model. +""" + +import io +import logging +from typing import Dict, Optional + +import httpx + +from advanced_omi_backend.models.transcription import BatchTranscriptionProvider + +logger = logging.getLogger(__name__) + + +class ElevenLabsProvider(BatchTranscriptionProvider): + """ElevenLabs batch transcription provider using Scribe v1 model.""" + + def __init__(self, api_key: str): + self.api_key = api_key + self.url = "https://api.elevenlabs.io/v1/speech-to-text" + + @property + def name(self) -> str: + return "elevenlabs" + + async def transcribe(self, audio_data: bytes, sample_rate: int, diarize: bool = False) -> dict: + """Transcribe audio using ElevenLabs REST API. 
+ + Args: + audio_data: Raw audio bytes (will be converted to WAV format) + sample_rate: Audio sample rate + diarize: Whether to enable speaker diarization + """ + try: + # Convert raw PCM to WAV format for ElevenLabs + wav_data = self._pcm_to_wav(audio_data, sample_rate) + + # Prepare multipart form data + files = { + 'file': ('audio.wav', io.BytesIO(wav_data), 'audio/wav') + } + + data = { + 'model_id': 'scribe_v1', + 'diarize': 'true' if diarize else 'false', + 'timestamps_granularity': 'word', + 'tag_audio_events': 'false', # Optional: set to true for laughter/applause detection + } + + headers = { + 'xi-api-key': self.api_key + } + + logger.info(f"Sending {len(audio_data)} bytes to ElevenLabs API (diarize={diarize})") + + # Calculate timeout based on audio duration + estimated_duration = len(audio_data) / (sample_rate * 2) # 16-bit mono + processing_timeout = max(120, int(estimated_duration * 5)) # 5x audio duration + + timeout_config = httpx.Timeout( + connect=30.0, + read=processing_timeout, + write=180.0, + pool=10.0, + ) + + logger.info( + f"Estimated audio duration: {estimated_duration:.1f}s, timeout: {processing_timeout}s" + ) + + async with httpx.AsyncClient(timeout=timeout_config) as client: + response = await client.post( + self.url, + headers=headers, + data=data, + files=files + ) + + if response.status_code == 200: + result = response.json() + logger.debug(f"ElevenLabs response: {result}") + + # Parse ElevenLabs response format + transcript = result.get('text', '').strip() + + # Extract word-level data + words = [] + segments = [] + + if 'words' in result: + # Map ElevenLabs format to Friend-Lite format + for word_obj in result['words']: + if word_obj.get('type') == 'word': # Skip spacing/audio_events + words.append({ + 'word': word_obj.get('text', ''), + 'start': word_obj.get('start', 0), + 'end': word_obj.get('end', 0), + 'confidence': 1.0 - abs(word_obj.get('logprob', 0)), # Convert logprob to confidence + 'speaker': word_obj.get('speaker_id'), 
+ }) + + # Extract speaker segments if diarization is enabled + if diarize and words: + segments = self._create_speaker_segments(words) + + logger.info( + f"ElevenLabs transcription successful: {len(transcript)} chars, " + f"{len(words)} words, {len(segments)} segments" + ) + + return { + "text": transcript, + "words": words, + "segments": segments, + } + else: + logger.error(f"ElevenLabs API error: {response.status_code} - {response.text}") + return {"text": "", "words": [], "segments": []} + + except httpx.TimeoutException as e: + logger.error(f"Timeout during ElevenLabs API call: {e}") + return {"text": "", "words": [], "segments": []} + except Exception as e: + logger.error(f"Error calling ElevenLabs API: {e}") + return {"text": "", "words": [], "segments": []} + + def _pcm_to_wav(self, pcm_data: bytes, sample_rate: int) -> bytes: + """Convert raw PCM data to WAV format.""" + import wave + import io + + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wav_file: + wav_file.setnchannels(1) # Mono + wav_file.setsampwidth(2) # 16-bit + wav_file.setframerate(sample_rate) + wav_file.writeframes(pcm_data) + + return wav_buffer.getvalue() + + def _create_speaker_segments(self, words: list) -> list: + """Group consecutive words by speaker into segments.""" + segments = [] + current_speaker = None + current_segment = None + + for word in words: + speaker = word.get('speaker') + if speaker is None: + continue + + if speaker == current_speaker and current_segment: + # Extend current segment + current_segment['text'] += ' ' + word['word'] + current_segment['end'] = word['end'] + else: + # Save previous segment and start new one + if current_segment: + segments.append(current_segment) + current_segment = { + 'text': word['word'], + 'speaker': f"Speaker {speaker}", + 'start': word['start'], + 'end': word['end'], + 'confidence': word.get('confidence'), + } + current_speaker = speaker + + # Don't forget the last segment + if current_segment: + 
segments.append(current_segment) + + return segments +``` + +### Factory Integration + +Update `backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py`: + +#### 1. Add Import +```python +from advanced_omi_backend.services.transcription.elevenlabs import ElevenLabsProvider +``` + +#### 2. Update `get_transcription_provider()` Function + +Add after line 46: +```python +def get_transcription_provider( + provider_name: Optional[str] = None, + mode: Optional[str] = None, +) -> Optional[BaseTranscriptionProvider]: + deepgram_key = os.getenv("DEEPGRAM_API_KEY") + parakeet_url = os.getenv("PARAKEET_ASR_URL") + elevenlabs_key = os.getenv("ELEVENLABS_API_KEY") # Add this line + + # ... existing code ... + + # Add after parakeet/offline sections (around line 88): + elif provider_name == "elevenlabs": + if not elevenlabs_key: + raise RuntimeError( + "ElevenLabs transcription provider requested but ELEVENLABS_API_KEY not configured" + ) + logger.info(f"Using ElevenLabs transcription provider in {mode} mode") + if mode == "streaming": + raise RuntimeError("ElevenLabs does not support streaming mode - use batch mode") + return ElevenLabsProvider(elevenlabs_key) +``` + +#### 3. 
Update `__all__` Export +```python +__all__ = [ + "get_transcription_provider", + "DeepgramProvider", + "DeepgramStreamingProvider", + "DeepgramStreamConsumer", + "ParakeetProvider", + "ParakeetStreamingProvider", + "ElevenLabsProvider", # Add this +] +``` + +### Model Update + +Update `backends/advanced/src/advanced_omi_backend/models/transcription.py`: + +```python +class TranscriptionProvider(Enum): + """Available transcription providers for audio stream routing.""" + DEEPGRAM = "deepgram" + PARAKEET = "parakeet" + MISTRAL = "mistral" + ELEVENLABS = "elevenlabs" # Add this line +``` + +## Configuration + +### Environment Variables + +Update `backends/advanced/.env.template` (around line 48): + +```bash +# ======================================== +# SPEECH-TO-TEXT CONFIGURATION (Choose one) +# ======================================== + +# Option 1: Deepgram (recommended for best transcription quality) +DEEPGRAM_API_KEY= + +# Option 2: ElevenLabs (high quality with 99 language support) +# Get your API key from: https://elevenlabs.io/app/settings/api-keys +# ELEVENLABS_API_KEY= + +# Option 3: Mistral (Voxtral models) +# MISTRAL_API_KEY= +# MISTRAL_MODEL=voxtral-mini-2507 + +# Option 4: Parakeet ASR service from extras/asr-services +# PARAKEET_ASR_URL=http://host.docker.internal:8767 + +# Optional: Specify which provider to use ('deepgram', 'elevenlabs', 'mistral', or 'parakeet') +# If not set, will auto-select based on available configuration (Deepgram preferred) +# TRANSCRIPTION_PROVIDER=elevenlabs +``` + +### Usage Example + +```bash +# In .env file +ELEVENLABS_API_KEY=sk_your_api_key_here +TRANSCRIPTION_PROVIDER=elevenlabs + +# Start the backend +docker compose up --build -d +``` + +## Implementation Checklist + +- [ ] Create `elevenlabs.py` provider implementation +- [ ] Update `__init__.py` factory function +- [ ] Add `ELEVENLABS` to `TranscriptionProvider` enum +- [ ] Update `.env.template` with configuration +- [ ] Update `CLAUDE.md` documentation +- [ ] Run 
integration tests +- [ ] Update API documentation + +## Testing + +### Unit Tests +```bash +cd backends/advanced +uv run pytest tests/test_transcription_providers.py -k elevenlabs +``` + +### Integration Tests +```bash +cd backends/advanced + +# Set environment variables +export ELEVENLABS_API_KEY=sk_your_key_here +export TRANSCRIPTION_PROVIDER=elevenlabs + +# Run full integration test +./run-test.sh +``` + +### Manual Testing +```bash +# Test with audio file upload +curl -X POST http://localhost:8000/api/audio/upload \ + -H "Authorization: Bearer YOUR_JWT_TOKEN" \ + -F "audio_file=@test_audio.wav" \ + -F "client_id=test-client" +``` + +## Key Implementation Notes + +### 1. File Format Conversion +ElevenLabs requires proper audio file formats (not raw bytes). The provider converts raw PCM to WAV: +```python +def _pcm_to_wav(self, pcm_data: bytes, sample_rate: int) -> bytes: + """Convert raw PCM data to WAV format.""" +``` + +### 2. Batch-Only Processing +ElevenLabs does not support streaming transcription. All audio must be sent as complete files. + +### 3. Confidence Score Mapping +ElevenLabs returns `logprob` (log probability, always <= 0), which the provider maps to an approximate confidence score. This mapping is a heuristic rather than a true probability: it goes negative whenever `logprob < -1`, whereas `exp(logprob)` would always yield a value in the 0-1 range: +```python +'confidence': 1.0 - abs(word_obj.get('logprob', 0)) +``` + +### 4. Speaker Diarization +Automatic speaker identification is built-in and returns `speaker_id` in word-level data. The provider groups consecutive words from the same speaker into segments. + +### 5.
Timeout Configuration +Processing timeout is dynamically calculated based on audio duration: +```python +processing_timeout = max(120, int(estimated_duration * 5)) # 5x audio duration +``` + +## Comparison with Other Providers + +| Feature | ElevenLabs | Deepgram | Parakeet | +|---------|------------|----------|----------| +| **Streaming** | ❌ No | ✅ Yes | ✅ Yes | +| **Batch** | ✅ Yes | ✅ Yes | ✅ Yes | +| **Languages** | 99 languages | Multi-language | English-focused | +| **Diarization** | ✅ 32 speakers | ✅ Yes | ❌ No | +| **Word Timestamps** | ✅ Yes | ✅ Yes | ✅ Yes | +| **Cost** | $0.30-0.40/hr | ~$0.36/hr | Free (self-hosted) | +| **Setup** | API key | API key | Docker service | + +## Limitations + +1. **No Streaming**: Real-time transcription not supported +2. **File Size**: Maximum 3 GB per file +3. **Duration**: Maximum 10 hours per file +4. **API Dependency**: Requires internet connection and ElevenLabs service availability +5. **Confidence Scores**: Log probability conversion may not be as accurate as native confidence scores + +## Future Enhancements + +1. **Async Processing**: Use webhooks for long audio files +2. **Audio Events**: Enable `tag_audio_events` for laughter/applause detection +3. **Multi-Channel**: Support `use_multi_channel` for separate channel transcription +4. **Custom Formats**: Support additional output formats (SRT, DOCX, PDF) + +## Speaker Recognition Service Integration + +The speaker recognition service (`extras/speaker-recognition`) also needs ElevenLabs support to enhance transcriptions with speaker identification. + +### Current Architecture + +The speaker recognition service: +1. Acts as a proxy/wrapper for transcription services (currently Deepgram only) +2. Forwards transcription requests to the ASR provider +3. Enhances responses with speaker identification using enrolled speakers +4. Returns enriched transcripts with `identified_speaker_id` and `identified_speaker_name` + +### Integration Components Required + +#### 1. 
ElevenLabs Parser (`elevenlabs_parser.py`) +Similar to `deepgram_parser.py`, this will: +- Parse ElevenLabs JSON response format +- Extract speaker segments from word-level data +- Group consecutive words by speaker_id +- Convert logprob to confidence scores +- Provide speaker statistics + +#### 2. ElevenLabs Wrapper Endpoint (`elevenlabs_wrapper.py`) +Similar to `deepgram_wrapper.py`, this will: +- Accept audio file uploads +- Forward to ElevenLabs API with diarization enabled +- Extract speaker segments from response +- Identify speakers using enrolled voice embeddings +- Return enhanced response with speaker identification + +#### 3. Service Configuration Updates +- Add `elevenlabs_api_key` to Settings +- Register ElevenLabs router +- Update environment templates + +### Key Differences from Deepgram Integration + +| Aspect | Deepgram | ElevenLabs | +|--------|----------|------------| +| **Speaker Field** | `speaker` (integer) | `speaker_id` (string) | +| **Confidence** | Native `confidence` field | Derived from `logprob` | +| **Word Filtering** | All words included | Filter by `type == "word"` | +| **API Endpoint** | `/v1/listen` | `/v1/speech-to-text` | +| **Auth Header** | `Authorization: Token` | `xi-api-key` | +| **Request Format** | Query params + raw audio | Multipart form data | + +### Implementation Plan + +1. **Create Parser** (`utils/elevenlabs_parser.py`) + - Parse JSON response + - Group words by `speaker_id` + - Filter word-type entries only + - Convert logprob to confidence + +2. **Create Wrapper** (`api/routers/elevenlabs_wrapper.py`) + - `/elevenlabs/v1/transcribe` endpoint + - Forward to ElevenLabs API + - Enhance with speaker identification + - Return enriched response + +3. **Update Configuration** + - Add `ELEVENLABS_API_KEY` to `.env.template` + - Add field to Settings class + - Register router in main app + +4. 
**Testing** + - Unit tests for parser + - Integration tests for wrapper + - End-to-end with real audio + +### Usage Example + +```bash +# Transcribe with speaker identification +# (query parameters must be part of the quoted request URL) +curl -X POST "http://localhost:8085/elevenlabs/v1/transcribe?user_id=1&enhance_speakers=true&speaker_confidence_threshold=0.15" \ + -H "xi-api-key: YOUR_ELEVENLABS_KEY" \ + -F "file=@audio.wav" \ + -F "diarize=true" \ + -F "model_id=scribe_v1" +``` + +Expected response with enhancement: +```json +{ + "text": "Hello, how are you today?", + "language_code": "en", + "words": [ + { + "text": "Hello", + "start": 0.1, + "end": 0.5, + "type": "word", + "speaker_id": "speaker_1", + "identified_speaker_id": 42, + "identified_speaker_name": "John Doe", + "speaker_identification_confidence": 0.87, + "speaker_status": "IDENTIFIED" + } + ], + "speaker_enhancement": { + "enabled": true, + "provider": "elevenlabs", + "user_id": 1, + "identified_speakers": { + "speaker_1": { + "speaker_id": 42, + "speaker_name": "John Doe", + "confidence": 0.87 + } + }, + "total_segments": 3, + "identified_segments": 2 + } +} +``` + +## Part 3: Web UI Integration (Batch Inference) + +### Overview + +The Speaker Recognition Web UI provides a batch inference page where users can upload audio files and process them with different transcription providers. This integration adds ElevenLabs as a processing mode option alongside existing Deepgram modes. + +**Note**: ElevenLabs is **batch-only** and does not support real-time streaming. The live inference pages will continue to use Deepgram WebSocket for real-time transcription. + +### Architecture + +``` +User uploads audio file + ↓ +[Inference Page UI] + ↓ +Select "ElevenLabs Transcribe" mode + ↓ +Frontend service: elevenlabs.ts + ↓ +POST /elevenlabs/v1/transcribe + ↓ +Backend wrapper → ElevenLabs API + ↓ +Enhanced response with speaker IDs + ↓ +Display results with transcription + speakers +``` + +### Implementation Details + +#### 1.
ElevenLabs Service (`webui/src/services/elevenlabs.ts`) + +Create new service module similar to `deepgram.ts`: + +```typescript +// Type definitions +export interface ElevenLabsTranscriptionOptions { + model_id?: string // Default: 'scribe_v1' + diarize?: boolean // Default: true + timestamps_granularity?: string // Default: 'word' + tag_audio_events?: boolean // Default: false + enhanceSpeakers?: boolean // Enable speaker identification + userId?: number // For speaker enhancement + speakerConfidenceThreshold?: number +} + +export interface ElevenLabsWord { + text: string + start: number + end: number + type: 'word' | 'spacing' | 'audio_event' + speaker_id?: string + logprob?: number + identified_speaker_id?: string + identified_speaker_name?: string + speaker_identification_confidence?: number + speaker_status?: string +} + +export interface ElevenLabsResponse { + text: string + language_code?: string + language_probability?: number + words: ElevenLabsWord[] + speaker_enhancement?: { + enabled: boolean + provider: string + user_id?: number + identified_speakers: Record + total_segments: number + } +} + +// Main transcription function +export async function transcribeWithElevenLabs( + file: File | Blob, + options: ElevenLabsTranscriptionOptions = {} +): Promise { + const formData = new FormData() + formData.append('file', file) + formData.append('model_id', options.model_id || 'scribe_v1') + formData.append('diarize', String(options.diarize !== false)) + formData.append('timestamps_granularity', options.timestamps_granularity || 'word') + formData.append('tag_audio_events', String(options.tag_audio_events || false)) + + const params = new URLSearchParams() + if (options.enhanceSpeakers && options.userId) { + params.append('user_id', String(options.userId)) + params.append('enhance_speakers', 'true') + params.append('speaker_confidence_threshold', String(options.speakerConfidenceThreshold || 0.15)) + } + + const response = await apiService.post( + 
`/elevenlabs/v1/transcribe?${params}`, + formData + ) + return response.data +} + +// Process response into segments +export function processElevenLabsResponse(response: ElevenLabsResponse) { + // Filter only words (skip spacing and audio events) + const words = response.words.filter(w => w.type === 'word') + + // Group consecutive words by speaker_id + const segments = [] + let currentSegment = null + + for (const word of words) { + if (!word.speaker_id) continue + + if (currentSegment && currentSegment.speaker_id === word.speaker_id) { + // Extend current segment + currentSegment.text += ' ' + word.text + currentSegment.end = word.end + } else { + // Save previous and start new + if (currentSegment) segments.push(currentSegment) + currentSegment = { + speaker_id: word.speaker_id, + speaker: word.identified_speaker_name || `Speaker ${word.speaker_id}`, + text: word.text, + start: word.start, + end: word.end, + confidence: 1.0 - Math.abs(word.logprob || 0), + identifiedSpeakerId: word.identified_speaker_id, + identifiedSpeakerName: word.identified_speaker_name, + speakerIdentificationConfidence: word.speaker_identification_confidence, + speakerStatus: word.speaker_status + } + } + } + if (currentSegment) segments.push(currentSegment) + + return segments +} + +// Calculate confidence summary +export function calculateConfidenceSummary(segments) { + const total = segments.length + const high = segments.filter(s => s.confidence >= 0.8).length + const medium = segments.filter(s => s.confidence >= 0.5 && s.confidence < 0.8).length + const low = segments.filter(s => s.confidence < 0.5).length + + return { + total_segments: total, + high_confidence: high, + medium_confidence: medium, + low_confidence: low + } +} +``` + +#### 2. 
Speaker Identification Service Updates + +**File**: `webui/src/services/speakerIdentification.ts` + +Add ElevenLabs processing mode: + +```typescript +// Update type +export type ProcessingMode = + | 'diarization-only' + | 'speaker-identification' + | 'deepgram-enhanced' + | 'deepgram-transcript-internal-speakers' + | 'diarize-identify-match' + | 'elevenlabs-enhanced' // NEW + +// Add processing method +private async processWithElevenLabs( + audioFile: File | Blob, + options: ProcessingOptions +): Promise { + try { + const filename = audioFile instanceof File ? audioFile.name : 'Audio' + + const elevenlabsResponse = await transcribeWithElevenLabs(audioFile, { + enhanceSpeakers: options.enhanceSpeakers !== false, + userId: options.userId, + speakerConfidenceThreshold: options.confidenceThreshold || 0.15, + }) + + const elevenlabsSegments = processElevenLabsResponse(elevenlabsResponse) + + const speakerSegments: SpeakerSegment[] = elevenlabsSegments.map(segment => ({ + start: segment.start, + end: segment.end, + speaker_id: segment.speaker_id, + speaker_name: segment.identifiedSpeakerName || segment.speaker, + confidence: segment.confidence, + text: segment.text, + identified_speaker_id: segment.identifiedSpeakerId, + identified_speaker_name: segment.identifiedSpeakerName, + speaker_identification_confidence: segment.speakerIdentificationConfidence, + speaker_status: segment.speakerStatus + })) + + const confidenceSummary = calculateConfidenceSummary(elevenlabsSegments) + + return { + id: Math.random().toString(36), + filename, + duration: this.estimateDuration(speakerSegments), + status: 'completed', + created_at: new Date().toISOString(), + mode: 'elevenlabs-enhanced', + speakers: speakerSegments, + confidence_summary: confidenceSummary, + } + } catch (error) { + throw new Error(`ElevenLabs processing failed: ${error.message}`) + } +} + +// Update processAudio switch +async processAudio(audioFile: File | Blob, options: ProcessingOptions): Promise { + const 
startTime = Date.now() + + try { + let result: ProcessingResult + + switch (options.mode) { + case 'elevenlabs-enhanced': // NEW + result = await this.processWithElevenLabs(audioFile, options) + break + case 'deepgram-enhanced': + result = await this.processWithDeepgram(audioFile, options) + break + // ... other cases + } + + result.processing_time = Date.now() - startTime + return result + } catch (error) { + // ... error handling + } +} +``` + +#### 3. Processing Mode Selector Updates + +**File**: `webui/src/components/ProcessingModeSelector.tsx` + +Add ElevenLabs to modes array: + +```typescript +const PROCESSING_MODES: ProcessingModeConfig[] = [ + { + mode: 'speaker-identification', + name: 'Speaker Identification', + description: 'Diarization + speaker recognition only', + icon: '🎯', + color: 'bg-blue-600 hover:bg-blue-700', + features: ['Speaker diarization', 'Speaker identification', 'Confidence scoring'] + }, + { + mode: 'deepgram-enhanced', + name: 'Transcribe + Identify', + description: 'Full transcription with enhanced speaker ID', + icon: '🚀', + color: 'bg-green-600 hover:bg-green-700', + requirements: ['Deepgram API key'], + features: ['High-quality transcription', 'Speaker diarization', 'Enhanced speaker identification', 'Word-level timing'] + }, + { + mode: 'elevenlabs-enhanced', // NEW + name: 'ElevenLabs Transcribe', + description: '99 languages with speaker diarization', + icon: '🌐', + color: 'bg-indigo-600 hover:bg-indigo-700', + requirements: ['ElevenLabs API key (configured in backend)'], + features: [ + '99 language support', + 'Built-in speaker diarization', + 'Enhanced speaker identification', + 'Word-level timestamps' + ] + }, + // ... other modes +] +``` + +### Usage Flow + +1. **User Navigation**: Navigate to `/inference` page +2. **Audio Input**: Upload an audio file or record audio +3. **Mode Selection**: Select "🌐 ElevenLabs Transcribe" from dropdown +4. **Processing**: Click "Process Audio" button +5. 
**Backend Flow**: + - Frontend → `POST /elevenlabs/v1/transcribe` + - Backend wrapper → ElevenLabs API + - Speaker enhancement adds `identified_speaker_name` fields +6. **Results Display**: View transcription with speaker identification + +### Limitations + +**Batch-Only Processing:** +- ✅ Works: `/inference` page (batch file upload) +- ❌ Doesn't work: `/infer-live-simple` and `/infer-live` (require WebSocket streaming) +- ElevenLabs does not support real-time streaming, so live inference pages will continue using Deepgram + +**Language Support:** +- Automatic language detection (99 languages) +- No need to specify language code + +**Speaker Diarization:** +- Maximum 32 speakers +- Automatic speaker detection +- No manual speaker count configuration needed + +### Testing + +```bash +# 1. Ensure backend is running with ElevenLabs configured +cd extras/speaker-recognition +docker compose up -d + +# 2. Navigate to web UI +open https://your-host:8444/inference + +# 3. Test workflow: +# - Upload a WAV file with multiple speakers +# - Select "ElevenLabs Transcribe" mode +# - Set user_id for speaker identification +# - Click "Process Audio" +# - Verify transcription and speaker names appear + +# 4. 
Check network requests: +# POST /elevenlabs/v1/transcribe +# Response should include speaker_enhancement metadata +``` + +### Integration Checklist + +**Advanced Backend:** +- [x] Create `services/transcription/elevenlabs.py` +- [x] Update `services/transcription/__init__.py` +- [x] Update `models/transcription.py` enum +- [x] Update `.env.template` +- [x] Update `init.py` wizard +- [x] Update root `wizard.py` +- [x] Configure API key in `.env` +- [x] Update CLAUDE.md +- [ ] Run integration tests + +**Speaker Recognition Service (Backend):** +- [x] Create `utils/elevenlabs_parser.py` +- [x] Create `api/routers/elevenlabs_wrapper.py` +- [x] Update `api/service.py` Settings class +- [x] Register ElevenLabs router in main app +- [x] Update `.env.template` +- [x] Update `init.py` wizard +- [x] Configure API key in `.env` +- [ ] Add parser unit tests +- [ ] Add wrapper integration tests + +**Speaker Recognition Web UI:** +- [x] Create `webui/src/services/elevenlabs.ts` +- [x] Update `webui/src/services/speakerIdentification.ts` +- [x] Update `webui/src/components/ProcessingModeSelector.tsx` +- [ ] Test batch inference with ElevenLabs mode + +## References + +- [ElevenLabs Speech-to-Text Docs](https://elevenlabs.io/docs/capabilities/speech-to-text) +- [API Reference](https://elevenlabs.io/docs/api-reference/speech-to-text/convert) +- [Quickstart Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/quickstart) +- [Friend-Lite Transcription Architecture](./transcription-architecture.md) diff --git a/backends/advanced/init.py b/backends/advanced/init.py index 4e345ec1..8523f32a 100644 --- a/backends/advanced/init.py +++ b/backends/advanced/init.py @@ -135,14 +135,14 @@ def setup_authentication(self): def setup_transcription(self): """Configure transcription provider""" self.print_section("Speech-to-Text Configuration") - + choices = { "1": "Deepgram (recommended - high quality, requires API key)", - "2": "Mistral (Voxtral models - requires API key)", + "2": 
"ElevenLabs (99 languages, speaker diarization - requires API key)", "3": "Offline (Parakeet ASR - requires GPU, runs locally)", "4": "None (skip transcription setup)" } - + choice = self.prompt_choice("Choose your transcription provider:", choices, "1") if choice == "1": @@ -167,26 +167,23 @@ def setup_transcription(self): self.console.print("[yellow][WARNING][/yellow] No API key provided - transcription will not work") elif choice == "2": - self.config["TRANSCRIPTION_PROVIDER"] = "mistral" - self.console.print("[blue][INFO][/blue] Mistral selected") - self.console.print("Get your API key from: https://console.mistral.ai/") + self.console.print("[blue][INFO][/blue] ElevenLabs selected") + self.console.print("Get your API key from: https://elevenlabs.io/app/settings/api-keys") # Check for existing API key - existing_key = self.read_existing_env_value("MISTRAL_API_KEY") - if existing_key and existing_key not in ['your_mistral_api_key_here', 'your-mistral-key-here']: + existing_key = self.read_existing_env_value("ELEVENLABS_API_KEY") + if existing_key and existing_key not in ['your_elevenlabs_api_key_here', 'your-elevenlabs-key-here']: masked_key = self.mask_api_key(existing_key) - prompt_text = f"Mistral API key ({masked_key}) [press Enter to reuse, or enter new]" + prompt_text = f"ElevenLabs API key ({masked_key}) [press Enter to reuse, or enter new]" api_key_input = self.prompt_value(prompt_text, "") api_key = api_key_input if api_key_input else existing_key else: - api_key = self.prompt_value("Mistral API key (leave empty to skip)", "") - - model = self.prompt_value("Mistral model", "voxtral-mini-2507") + api_key = self.prompt_value("ElevenLabs API key (leave empty to skip)", "") if api_key: - self.config["MISTRAL_API_KEY"] = api_key - self.config["MISTRAL_MODEL"] = model - self.console.print("[green][SUCCESS][/green] Mistral configured") + self.config["TRANSCRIPTION_PROVIDER"] = "elevenlabs" + self.config["ELEVENLABS_API_KEY"] = api_key + 
self.console.print("[green][SUCCESS][/green] ElevenLabs configured") else: self.console.print("[yellow][WARNING][/yellow] No API key provided - transcription will not work") diff --git a/backends/advanced/src/advanced_omi_backend/models/transcription.py b/backends/advanced/src/advanced_omi_backend/models/transcription.py index 13893a68..ef19b0c9 100644 --- a/backends/advanced/src/advanced_omi_backend/models/transcription.py +++ b/backends/advanced/src/advanced_omi_backend/models/transcription.py @@ -36,7 +36,7 @@ class TranscriptionProvider(Enum): """Available transcription providers for audio stream routing.""" DEEPGRAM = "deepgram" PARAKEET = "parakeet" - MISTRAL = "mistral" + ELEVENLABS = "elevenlabs" class BaseTranscriptionProvider(abc.ABC): diff --git a/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py b/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py index 9036aa61..72d7c153 100644 --- a/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py +++ b/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py @@ -20,6 +20,9 @@ ParakeetProvider, ParakeetStreamingProvider, ) +from advanced_omi_backend.services.transcription.elevenlabs import ( + ElevenLabsProvider, +) logger = logging.getLogger(__name__) @@ -44,6 +47,7 @@ def get_transcription_provider( """ deepgram_key = os.getenv("DEEPGRAM_API_KEY") parakeet_url = os.getenv("PARAKEET_ASR_URL") + elevenlabs_key = os.getenv("ELEVENLABS_API_KEY") if provider_name: provider_name = provider_name.lower() @@ -87,6 +91,16 @@ def get_transcription_provider( else: return ParakeetProvider(parakeet_url) + elif provider_name == "elevenlabs": + if not elevenlabs_key: + raise RuntimeError( + "ElevenLabs transcription provider requested but ELEVENLABS_API_KEY not configured" + ) + logger.info(f"Using ElevenLabs transcription provider in {mode} mode") + if mode == "streaming": + raise RuntimeError("ElevenLabs does not support 
"""
ElevenLabs transcription provider implementation.

Provides batch transcription using ElevenLabs Scribe v1 model.
"""

import io
import logging
import wave

import httpx

from advanced_omi_backend.models.transcription import BatchTranscriptionProvider

logger = logging.getLogger(__name__)


class ElevenLabsProvider(BatchTranscriptionProvider):
    """ElevenLabs batch transcription provider using the Scribe v1 model.

    Raw 16-bit mono PCM is wrapped in a WAV container, posted to the
    ElevenLabs speech-to-text endpoint, and the response is mapped to the
    ``{"text", "words", "segments"}`` dict this backend consumes.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.url = "https://api.elevenlabs.io/v1/speech-to-text"

    @property
    def name(self) -> str:
        return "elevenlabs"

    async def transcribe(self, audio_data: bytes, sample_rate: int, diarize: bool = False) -> dict:
        """Transcribe audio using ElevenLabs REST API.

        Args:
            audio_data: Raw audio bytes (will be converted to WAV format)
            sample_rate: Audio sample rate
            diarize: Whether to enable speaker diarization

        Returns:
            Dict with ``text`` (str), ``words`` (list of word dicts) and
            ``segments`` (per-speaker segments, only filled when ``diarize``
            is true). On any API or transport failure the error is logged and
            an empty result is returned instead of raising.
        """
        try:
            # Convert raw PCM to WAV format for ElevenLabs
            wav_data = self._pcm_to_wav(audio_data, sample_rate)

            # Prepare multipart form data
            files = {
                'file': ('audio.wav', io.BytesIO(wav_data), 'audio/wav')
            }
            data = {
                'model_id': 'scribe_v1',
                'diarize': 'true' if diarize else 'false',
                'timestamps_granularity': 'word',
                'tag_audio_events': 'false',  # Optional: set to true for laughter/applause detection
            }
            headers = {
                'xi-api-key': self.api_key
            }

            logger.info(f"Sending {len(audio_data)} bytes to ElevenLabs API (diarize={diarize})")

            # Scale the read timeout with the audio length (16-bit mono PCM),
            # never going below two minutes.
            estimated_duration = len(audio_data) / (sample_rate * 2)
            processing_timeout = max(120, int(estimated_duration * 5))  # 5x audio duration

            timeout_config = httpx.Timeout(
                connect=30.0,
                read=processing_timeout,
                write=180.0,
                pool=10.0,
            )

            logger.info(
                f"Estimated audio duration: {estimated_duration:.1f}s, timeout: {processing_timeout}s"
            )

            async with httpx.AsyncClient(timeout=timeout_config) as client:
                response = await client.post(
                    self.url,
                    headers=headers,
                    data=data,
                    files=files
                )

            if response.status_code != 200:
                logger.error(f"ElevenLabs API error: {response.status_code} - {response.text}")
                return self._empty_result()

            result = response.json()
            logger.debug(f"ElevenLabs response: {result}")
            return self._parse_response(result, diarize)

        except httpx.TimeoutException as e:
            logger.error(f"Timeout during ElevenLabs API call: {e}")
            return self._empty_result()
        except Exception as e:
            # Best-effort contract: callers expect an empty result, not an
            # exception. Keep the traceback in the log so failures are debuggable.
            logger.exception(f"Error calling ElevenLabs API: {e}")
            return self._empty_result()

    @staticmethod
    def _empty_result() -> dict:
        """Return the result shape used for every failure path."""
        return {"text": "", "words": [], "segments": []}

    @staticmethod
    def _logprob_to_confidence(logprob) -> float:
        """Map an ElevenLabs log-probability to a confidence in [0, 1].

        A missing or null value is treated like ``0`` (confidence 1.0), which
        is what the previous ``.get('logprob', 0)`` default produced. Values
        below -1 are clamped so the confidence never goes negative.
        """
        if logprob is None:
            return 1.0
        return 1.0 - min(abs(logprob), 1.0)

    def _parse_response(self, result: dict, diarize: bool) -> dict:
        """Map an ElevenLabs JSON response to the Friend-Lite result format."""
        # ``or`` guards against explicit nulls, which ``.get(key, default)`` lets through.
        transcript = (result.get('text') or '').strip()

        words = [
            {
                'word': word_obj.get('text', ''),
                'start': word_obj.get('start', 0),
                'end': word_obj.get('end', 0),
                'confidence': self._logprob_to_confidence(word_obj.get('logprob')),
                'speaker': word_obj.get('speaker_id'),
            }
            for word_obj in (result.get('words') or [])
            if word_obj.get('type') == 'word'  # Skip spacing/audio_events
        ]

        # Extract speaker segments if diarization is enabled
        segments = self._create_speaker_segments(words) if diarize and words else []

        logger.info(
            f"ElevenLabs transcription successful: {len(transcript)} chars, "
            f"{len(words)} words, {len(segments)} segments"
        )

        return {
            "text": transcript,
            "words": words,
            "segments": segments,
        }

    def _pcm_to_wav(self, pcm_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM data to WAV format."""
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)

        return wav_buffer.getvalue()

    def _create_speaker_segments(self, words: list) -> list:
        """Group consecutive words by speaker into segments.

        Words without a speaker are skipped and do not split a segment. Each
        segment's ``confidence`` is the mean confidence of its words (``None``
        when no word carried one).
        """
        segments = []
        current_speaker = None
        current_segment = None
        scores = []

        def close_segment():
            # Finalize the running segment, if any, and add it to the output.
            if current_segment is not None:
                current_segment['confidence'] = sum(scores) / len(scores) if scores else None
                segments.append(current_segment)

        for word in words:
            speaker = word.get('speaker')
            if speaker is None:
                continue

            if current_segment is not None and speaker == current_speaker:
                # Extend current segment
                current_segment['text'] += ' ' + word['word']
                current_segment['end'] = word['end']
            else:
                # Save previous segment and start new one
                close_segment()
                current_segment = {
                    'text': word['word'],
                    'speaker': f"Speaker {speaker}",
                    'start': word['start'],
                    'end': word['end'],
                    'confidence': None,
                }
                current_speaker = speaker
                scores = []

            word_confidence = word.get('confidence')
            if word_confidence is not None:
                scores.append(word_confidence)

        # Don't forget the last segment
        close_segment()

        return segments
"""ElevenLabs API wrapper endpoints with speaker enhancement."""

import io
import json
import logging
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

import aiohttp
from fastapi import APIRouter, File, Form, Header, HTTPException, Query, UploadFile

from simple_speaker_recognition.api.core.utils import (
    safe_format_confidence,
    validate_confidence
)
from simple_speaker_recognition.core.models import SpeakerStatus
from simple_speaker_recognition.utils.audio_processing import get_audio_info
from simple_speaker_recognition.utils.elevenlabs_parser import ElevenLabsParser

router = APIRouter()
log = logging.getLogger("speaker_service")


# Dependency functions - will be resolved during integration.
# The service module is imported lazily inside each accessor.
async def get_db():
    """Get speaker database dependency."""
    from .. import service
    return await service.get_db()


def get_audio_backend():
    """Get audio backend."""
    from .. import service
    return service.audio_backend


def get_speaker_db():
    """Get speaker database."""
    from .. import service
    return service.speaker_db


def get_auth():
    """Get auth settings."""
    from .. import service
    return service.auth


async def forward_to_elevenlabs(
    audio_data: bytes,
    params: Dict[str, Any],
    elevenlabs_api_key: str
) -> Dict[str, Any]:
    """Forward audio to ElevenLabs API and return response.

    Args:
        audio_data: Audio file bytes, uploaded as ``audio.wav``.
        params: Extra form fields; ``None`` values are omitted and booleans
            are sent as lowercase ``"true"``/``"false"``.
        elevenlabs_api_key: Value for the ``xi-api-key`` header.

    Returns:
        The decoded JSON body of the ElevenLabs response.

    Raises:
        HTTPException: With the upstream status code when ElevenLabs does not
            answer 200.
    """
    url = "https://api.elevenlabs.io/v1/speech-to-text"

    headers = {
        "xi-api-key": elevenlabs_api_key
    }

    # Prepare multipart form data
    form_data = aiohttp.FormData()
    form_data.add_field('file', io.BytesIO(audio_data), filename='audio.wav', content_type='audio/wav')

    # Add text form fields
    for key, value in params.items():
        if value is None:
            continue
        # Convert boolean to lowercase string
        text_value = str(value).lower() if isinstance(value, bool) else str(value)
        form_data.add_field(key, text_value)

    async with aiohttp.ClientSession() as session:
        async with session.post(
            url,
            headers=headers,
            data=form_data
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                log.error(f"ElevenLabs API error: {response.status} - {error_text}")
                raise HTTPException(
                    status_code=response.status,
                    detail=f"ElevenLabs API error: {error_text}"
                )

            result = await response.json()
            log.info("Successfully received ElevenLabs response")
            return result


def _build_speaker_segments(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group consecutive diarized words into per-speaker segments.

    Only entries with ``type == 'word'`` and a non-null ``speaker_id`` take
    part. ``word_indices`` holds positions in ``words`` itself, so callers can
    address the matching entries directly without searching.
    """
    segments: List[Dict[str, Any]] = []
    current: Optional[Dict[str, Any]] = None

    for index, word in enumerate(words):
        # Skip spacing and audio events
        if word.get('type') != 'word':
            continue
        speaker_id = word.get('speaker_id')
        if speaker_id is None:
            continue

        if current is None or current['speaker_id'] != speaker_id:
            # Save previous segment and start a new one
            if current is not None:
                segments.append(current)
            current = {
                'speaker_id': speaker_id,
                'start_time': word.get('start', 0.0),
                'end_time': word.get('end', 0.0),
                'word_indices': [index]
            }
        else:
            # Extend current segment
            current['end_time'] = word.get('end', 0.0)
            current['word_indices'].append(index)

    # Don't forget the last segment
    if current is not None:
        segments.append(current)

    return segments


def _tag_words(
    enhanced_words: List[Dict[str, Any]],
    word_indices: List[int],
    identification: Dict[str, Any]
) -> None:
    """Write one segment's identification result onto each of its words."""
    fields = {
        "identified_speaker_id": identification["speaker_id"],
        "identified_speaker_name": identification["speaker_name"],
        "speaker_identification_confidence": identification["confidence"],
        "speaker_status": identification["status"]
    }
    for word_idx in word_indices:
        enhanced_words[word_idx].update(fields)


async def enhance_elevenlabs_response_with_speaker_id(
    audio_data: bytes,
    elevenlabs_response: Dict[str, Any],
    user_id: Optional[int],
    confidence_threshold: float = 0.15
) -> Dict[str, Any]:
    """Extract speaker segments and identify speakers from ElevenLabs response.

    Args:
        audio_data: The audio bytes that were transcribed.
        elevenlabs_response: Decoded ElevenLabs JSON response. It is not
            modified; word entries are copied before being annotated.
        user_id: Owner of the enrolled speakers; when falsy, identification is
            skipped.
        confidence_threshold: Minimum confidence for a match to count as
            identified.

    Returns:
        A copy of the response with per-word identification fields and a
        ``speaker_enhancement`` summary. Failures are reported inside
        ``speaker_enhancement`` rather than raised.
    """
    enhanced_response = elevenlabs_response.copy()

    if not user_id:
        log.warning("No user_id provided, skipping speaker identification")
        enhanced_response["speaker_enhancement"] = {
            "enabled": False,
            "provider": "elevenlabs",
            "reason": "No user_id provided for speaker identification"
        }
        return enhanced_response

    try:
        # Create temporary file for audio processing. The path is captured
        # before writing so the file is removed even if the write fails.
        tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = Path(tmp_file.name)

        try:
            with tmp_file:
                tmp_file.write(audio_data)

            # Get actual audio duration for boundary validation
            audio_info = get_audio_info(str(tmp_path))
            audio_duration = audio_info["duration_seconds"]
            log.info(f"Audio file duration: {audio_duration:.6f}s")

            # Extract words from ElevenLabs response
            words = elevenlabs_response.get("words", [])

            if not words:
                log.warning("No words found in ElevenLabs response")
                enhanced_response["speaker_enhancement"] = {
                    "enabled": False,
                    "provider": "elevenlabs",
                    "reason": "No words found in response"
                }
                return enhanced_response

            # Group consecutive words by speaker_id to create segments
            speaker_segments = _build_speaker_segments(words)
            log.info(f"Found {len(speaker_segments)} speaker segments to identify")

            # Copy each word so annotating it does not mutate the caller's response
            enhanced_words = [dict(word) for word in words]

            # Get audio backend and speaker DB
            audio_backend = get_audio_backend()
            speaker_db = get_speaker_db()

            # Identify each segment
            for segment_idx, segment_info in enumerate(speaker_segments):
                try:
                    start_time = segment_info["start_time"]
                    end_time = segment_info["end_time"]

                    # Validate segment boundaries
                    if start_time >= audio_duration:
                        log.warning(f"Segment {segment_idx} start_time {start_time:.6f}s >= audio duration {audio_duration:.6f}s, skipping")
                        continue
                    if end_time > audio_duration:
                        log.warning(f"Segment {segment_idx} end_time {end_time:.6f}s > audio duration {audio_duration:.6f}s, clamping to {audio_duration:.6f}s")
                        end_time = audio_duration

                    # Load and extract segment
                    wav = audio_backend.load_wave(tmp_path, start_time, end_time)

                    # Get embedding
                    emb = await audio_backend.async_embed(wav)

                    # Identify speaker
                    found, speaker_info, confidence = await speaker_db.identify(emb, user_id=user_id)
                    confidence = validate_confidence(confidence, "elevenlabs_enhancement")

                    if found and confidence >= confidence_threshold:
                        segment_result = {
                            "speaker_id": speaker_info["id"],
                            "speaker_name": speaker_info["name"],
                            "confidence": confidence,
                            "status": SpeakerStatus.IDENTIFIED.value
                        }
                        confidence_str = safe_format_confidence(confidence, "elevenlabs_speaker_identification")
                        log.info(f"Identified segment {segment_idx} (speaker_id {segment_info['speaker_id']}) as {speaker_info['name']} (confidence: {confidence_str})")
                    else:
                        segment_result = {
                            "speaker_id": None,
                            "speaker_name": None,
                            "confidence": confidence if confidence is not None else 0.0,
                            "status": SpeakerStatus.UNKNOWN.value
                        }
                        confidence_str = safe_format_confidence(confidence, "elevenlabs_speaker_unknown")
                        log.info(f"Segment {segment_idx} (speaker_id {segment_info['speaker_id']}) not identified (confidence: {confidence_str})")

                    # Apply identification to all words in this segment
                    _tag_words(enhanced_words, segment_info["word_indices"], segment_result)

                    # Store result for summary
                    segment_info["identification"] = segment_result

                except Exception as e:
                    log.warning(f"Error identifying segment {segment_idx}: {e}")
                    # Apply error status to all words in this segment; the
                    # segment is left out of the summary.
                    _tag_words(enhanced_words, segment_info["word_indices"], {
                        "speaker_id": None,
                        "speaker_name": None,
                        "confidence": 0.0,
                        "status": SpeakerStatus.ERROR.value
                    })

            # Update the response with enhanced words
            enhanced_response["words"] = enhanced_words

            # Collect unique identified speakers, keyed by the ElevenLabs
            # speaker_id; only the first identified segment per speaker is kept.
            identified_speakers = {}
            identified_segment_count = 0
            for segment in speaker_segments:
                result = segment.get("identification")
                if result is None or result["status"] != SpeakerStatus.IDENTIFIED.value:
                    continue
                identified_segment_count += 1
                identified_speakers.setdefault(str(segment["speaker_id"]), result)

            # Add speaker enhancement metadata
            enhanced_response["speaker_enhancement"] = {
                "enabled": True,
                "provider": "elevenlabs",
                "user_id": user_id,
                "confidence_threshold": confidence_threshold,
                "identified_speakers": identified_speakers,
                "total_segments": len(speaker_segments),
                "identified_segments": identified_segment_count,
                "total_speakers": len(set(s["speaker_id"] for s in speaker_segments)),
                "identified_count": len(identified_speakers)
            }

        finally:
            # Clean up temporary file
            tmp_path.unlink(missing_ok=True)

    except Exception as e:
        log.error(f"Error during speaker identification: {e}")
        # Add error info to response but don't fail the request
        enhanced_response["speaker_enhancement"] = {
            "enabled": True,
            "provider": "elevenlabs",
            "error": str(e),
            "status": "failed"
        }

    return enhanced_response


@router.post("/elevenlabs/v1/transcribe")
async def elevenlabs_transcription_with_speaker_id(
    file: UploadFile = File(..., description="Audio file to transcribe"),
    # ElevenLabs API parameters
    model_id: str = Form(default="scribe_v1", description="Model to use for transcription"),
    diarize: bool = Form(default=True, description="Enable speaker diarization"),
    timestamps_granularity: str = Form(default="word", description="Timestamp granularity"),
    tag_audio_events: bool = Form(default=False, description="Tag audio events like laughter"),
    # Speaker identification parameters
    user_id: Optional[int] = Query(default=None, description="User ID for speaker identification"),
    enhance_speakers: bool = Query(default=True, description="Enable speaker identification enhancement"),
    speaker_confidence_threshold: float = Query(default=0.15, ge=0.0, le=1.0, description="Minimum confidence threshold for speaker identification"),
    # API key
    xi_api_key: Optional[str] = Header(default=None, description="ElevenLabs API key")
):
    """
    Transcribe audio using ElevenLabs with speaker identification.

    This endpoint forwards the audio to ElevenLabs API for transcription with
    speaker diarization, then enhances the response with speaker identification
    from enrolled speakers.

    **Authentication**: Provide `xi-api-key` header with your ElevenLabs API key.

    **Speaker Enhancement**: If `user_id` and `enhance_speakers=true`, the service will:
    1. Forward audio to ElevenLabs for transcription with diarization
    2. Extract speaker segments from the diarized response
    3. Identify each speaker using enrolled voice embeddings
    4. Add `identified_speaker_name` and related fields to each word

    **Response Format**: Returns ElevenLabs JSON format with additional `speaker_enhancement` metadata.
    """
    # Get ElevenLabs API key from header or settings
    auth = get_auth()
    api_key = xi_api_key or auth.elevenlabs_api_key

    if not api_key:
        raise HTTPException(
            status_code=401,
            detail="ElevenLabs API key required (provide via xi-api-key header or ELEVENLABS_API_KEY env var)"
        )

    try:
        # Read audio data
        audio_data = await file.read()
        log.info(f"Received audio file: {file.filename}, size: {len(audio_data)} bytes")

        # Prepare ElevenLabs API parameters
        elevenlabs_params = {
            "model_id": model_id,
            "diarize": diarize,
            "timestamps_granularity": timestamps_granularity,
            "tag_audio_events": tag_audio_events
        }

        # Forward to ElevenLabs
        log.info(f"Forwarding to ElevenLabs API with params: {elevenlabs_params}")
        elevenlabs_response = await forward_to_elevenlabs(audio_data, elevenlabs_params, api_key)

        # Enhance with speaker identification if requested
        if enhance_speakers and user_id:
            log.info(f"Enhancing response with speaker identification for user_id={user_id}")
            return await enhance_elevenlabs_response_with_speaker_id(
                audio_data,
                elevenlabs_response,
                user_id,
                speaker_confidence_threshold
            )

        # Return ElevenLabs response without enhancement
        if not enhance_speakers:
            log.info("Speaker enhancement disabled")
        elif not user_id:
            log.info("No user_id provided, skipping speaker identification")

        elevenlabs_response["speaker_enhancement"] = {
            "enabled": False,
            "provider": "elevenlabs",
            "reason": "Enhancement not requested or no user_id provided"
        }
        return elevenlabs_response

    except HTTPException:
        # Already carries the right status (e.g. the upstream ElevenLabs error)
        raise
    except Exception as e:
        log.error(f"Error processing request: {e}")
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
a/extras/speaker-recognition/src/simple_speaker_recognition/api/service.py +++ b/extras/speaker-recognition/src/simple_speaker_recognition/api/service.py @@ -31,6 +31,7 @@ class Settings(BaseSettings): max_file_seconds: int = Field(default=180, description="Maximum file duration in seconds") deepgram_api_key: Optional[str] = Field(default=None, description="Deepgram API key for wrapper service") deepgram_base_url: str = Field(default="https://api.deepgram.com", description="Deepgram API base URL") + elevenlabs_api_key: Optional[str] = Field(default=None, description="ElevenLabs API key for wrapper service") hf_token: Optional[str] = Field(default=None, description="Hugging Face token for Pyannote models") class Config: @@ -52,6 +53,10 @@ class Config: if os.getenv("DEEPGRAM_API_KEY"): auth.deepgram_api_key = os.getenv("DEEPGRAM_API_KEY") +# Override ElevenLabs API key from environment if available +if os.getenv("ELEVENLABS_API_KEY"): + auth.elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY") + # Set HF token in auth settings for consistency auth.hf_token = hf_token @@ -132,15 +137,17 @@ async def lifespan(app: FastAPI): enrollment_router, identification_router, deepgram_router, + elevenlabs_router, websocket_router ) # Include routers with appropriate tags and prefixes app.include_router(users_router, tags=["users"]) -app.include_router(speakers_router, tags=["speakers"]) +app.include_router(speakers_router, tags=["speakers"]) app.include_router(enrollment_router, tags=["enrollment"]) app.include_router(identification_router, tags=["identification"]) app.include_router(deepgram_router, tags=["deepgram"]) +app.include_router(elevenlabs_router, tags=["elevenlabs"]) app.include_router(websocket_router, tags=["websocket"]) diff --git a/extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py b/extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py new file mode 100644 index 00000000..6f4aa35e --- /dev/null +++ 
"""Parser for ElevenLabs JSON transcription output."""

import json
import logging
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


class ElevenLabsParser:
    """Parse ElevenLabs JSON transcription output and extract speaker segments."""

    def __init__(self, min_segment_duration: float = 0.5):
        """Initialize parser.

        Args:
            min_segment_duration: Minimum duration for a segment in seconds
        """
        self.min_segment_duration = min_segment_duration

    def parse_elevenlabs_json(self, json_path: str) -> Dict[str, Any]:
        """Parse ElevenLabs JSON file and extract transcript data.

        Args:
            json_path: Path to ElevenLabs JSON file

        Returns:
            Parsed data with metadata and segments
        """
        # JSON is UTF-8 by specification; do not depend on the platform
        # locale, since transcripts may be in any language.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return self.parse_elevenlabs_data(data)

    def parse_elevenlabs_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract transcript data from an already-decoded ElevenLabs response.

        Args:
            data: ElevenLabs speech-to-text response as a dict

        Returns:
            Dict with ``metadata``, ``transcript``, ``confidence``,
            ``segments``, ``unique_speakers`` and ``total_duration``
        """
        # Extract text and language info
        transcript = data.get('text', '')
        language_code = data.get('language_code', '')
        language_probability = data.get('language_probability', 0.0)

        # Filter only actual words (skip spacing and audio events)
        words = data.get('words') or []
        filtered_words = [w for w in words if w.get('type') == 'word']

        # Group words into speaker segments
        segments = self._group_words_by_speaker(filtered_words)

        # Extract unique speakers
        unique_speakers = self._extract_unique_speakers(segments)

        # Calculate total duration from last word end time
        total_duration = 0.0
        if filtered_words:
            total_duration = filtered_words[-1].get('end') or 0.0

        return {
            'metadata': {
                'language_code': language_code,
                'language_probability': language_probability,
                'duration': total_duration
            },
            'transcript': transcript,
            'confidence': self._calculate_avg_confidence(filtered_words),
            'segments': segments,
            'unique_speakers': unique_speakers,
            'total_duration': total_duration
        }

    def _group_words_by_speaker(self, words: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Group consecutive words by speaker into segments.

        Words without a ``speaker_id`` are skipped. Segments shorter than
        ``min_segment_duration`` are dropped.

        Args:
            words: List of word dictionaries from ElevenLabs

        Returns:
            List of speaker segments
        """
        segments: List[Dict[str, Any]] = []
        current: Optional[Dict[str, Any]] = None

        for word in words:
            speaker_id = word.get('speaker_id')
            if speaker_id is None:
                continue

            speaker_label = f"speaker_{speaker_id}"
            # ``or`` also covers explicit nulls in the response
            start = word.get('start') or 0.0
            end = word.get('end') or 0.0
            word_confidence = self._logprob_to_confidence(word.get('logprob', 0.0))

            if current is None or current['elevenlabs_speaker_label'] != speaker_label:
                # Speaker changed: close the running segment, start a new one
                self._append_if_long_enough(segments, current)
                current = {
                    'elevenlabs_speaker_label': speaker_label,
                    'start_time': start,
                    'end_time': end,
                    'text': word.get('text', ''),
                    'words': [word],
                    'confidence': word_confidence
                }
            else:
                # Extend current segment
                current['end_time'] = end
                current['text'] += ' ' + word.get('text', '')
                current['words'].append(word)
                # Update the running average confidence
                count = len(current['words'])
                current['confidence'] = (
                    (current['confidence'] * (count - 1) + word_confidence) / count
                )

        # Don't forget the last segment
        self._append_if_long_enough(segments, current)

        return segments

    def _append_if_long_enough(
        self,
        segments: List[Dict[str, Any]],
        segment: Optional[Dict[str, Any]]
    ) -> None:
        """Append ``segment`` to ``segments`` if it meets the minimum duration."""
        if segment is None:
            return
        duration = segment['end_time'] - segment['start_time']
        if duration >= self.min_segment_duration:
            segments.append(segment)

    def _extract_unique_speakers(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Extract unique speakers and their statistics from segments.

        Args:
            segments: List of speaker segments

        Returns:
            List of unique speakers with statistics
        """
        speaker_stats = defaultdict(lambda: {
            'total_duration': 0.0,
            'segment_count': 0,
            'word_count': 0
        })

        for segment in segments:
            stats = speaker_stats[segment['elevenlabs_speaker_label']]
            stats['total_duration'] += segment['end_time'] - segment['start_time']
            stats['segment_count'] += 1
            stats['word_count'] += len(segment['words'])

        unique_speakers = [
            {'speaker': speaker, **stats}
            for speaker, stats in speaker_stats.items()
        ]

        # Sort by total duration (most active speaker first)
        unique_speakers.sort(key=lambda x: x['total_duration'], reverse=True)

        return unique_speakers

    def _logprob_to_confidence(self, logprob: Optional[float]) -> float:
        """Convert ElevenLabs logprob to confidence score (0-1).

        Args:
            logprob: Log probability from ElevenLabs; ``None`` is treated like
                a missing value (``0.0``)

        Returns:
            Confidence score between 0 and 1
        """
        if logprob is None:
            logprob = 0.0
        # Linear mapping: logprob 0 -> 1.0, logprob <= -1 -> 0.0
        return 1.0 - min(abs(logprob), 1.0)

    def _calculate_avg_confidence(self, words: List[Dict[str, Any]]) -> float:
        """Calculate average confidence from word list.

        Args:
            words: List of word dictionaries

        Returns:
            Average confidence score
        """
        if not words:
            return 0.0

        total_confidence = sum(
            self._logprob_to_confidence(w.get('logprob', 0.0))
            for w in words
        )
        return total_confidence / len(words)

    def extract_speaker_segments_for_identification(
        self,
        segments: List[Dict[str, Any]],
        audio_path: str
    ) -> List[Dict[str, Any]]:
        """Extract speaker segment info for identification.

        Args:
            segments: Parsed segments from parse_elevenlabs_json
            audio_path: Path to the audio file

        Returns:
            List of segment info dicts for speaker identification
        """
        return [
            {
                'segment_id': i,
                'start_time': segment['start_time'],
                'end_time': segment['end_time'],
                'text': segment['text'],
                'audio_path': audio_path,
                'elevenlabs_speaker_label': segment['elevenlabs_speaker_label'],
                'confidence': segment['confidence']
            }
            for i, segment in enumerate(segments)
        ]

Choose Processing Mode

- + {/* Grid Layout for Modes */} -
- {PROCESSING_MODES.slice(0, 2).map((config) => ( +
+ {PROCESSING_MODES.slice(0, 3).map((config) => (
onModeChange(config.mode)} @@ -457,7 +466,7 @@ export const ProcessingModeSelector: React.FC = ({
{config.name}

{config.description}

- + {/* Requirements */} {config.requirements && (
@@ -466,7 +475,7 @@ export const ProcessingModeSelector: React.FC = ({
)} - + {/* Features */}
diff --git a/extras/speaker-recognition/webui/src/services/elevenlabs.ts b/extras/speaker-recognition/webui/src/services/elevenlabs.ts
new file mode 100644
index 00000000..75a89994
--- /dev/null
+++ b/extras/speaker-recognition/webui/src/services/elevenlabs.ts
@@ -0,0 +1,185 @@
+/**
+ * ElevenLabs Service - Transcription and speaker diarization via ElevenLabs API
+ * Provides batch transcription with speaker diarization (up to 32 speakers)
+ * Word-level timestamps with confidence scores
+ */
+
+import { apiService } from './api'
+
+export interface ElevenLabsWord {
+  text: string
+  start: number
+  end: number
+  type: 'word' | 'spacing' | 'audio_event' | 'punctuation'
+  logprob: number
+  speaker_id?: string
+}
+
+export interface ElevenLabsResponse {
+  language_code: string
+  language_probability: number
+  text: string
+  words: ElevenLabsWord[]
+  transcription_id: string
+}
+
+export interface ElevenLabsSegment {
+  start: number
+  end: number
+  speaker: number
+  speakerId?: string
+  speakerName?: string
+  confidence: number
+  text: string
+  identifiedSpeakerId?: string
+  identifiedSpeakerName?: string
+  speakerIdentificationConfidence?: number
+  speakerStatus?: string
+}
+
+export interface ElevenLabsOptions {
+  enhanceSpeakers?: boolean
+  userId?: number
+  speakerConfidenceThreshold?: number
+  model?: string
+  language?: string
+  numSpeakers?: number
+}
+
+/**
+ * Transcribe audio using ElevenLabs Scribe API with optional speaker enhancement.
+ * `enhanceSpeakers` routes through the backend wrapper; otherwise the public
+ * ElevenLabs endpoint is called directly, which names its form fields
+ * `language_code` / `diarize`.
+ */
+export async function transcribeWithElevenLabs(
+  audioFile: File | Blob,
+  options: ElevenLabsOptions = {}
+): Promise<ElevenLabsResponse> {
+  try {
+    // Determine endpoint based on enhancement option
+    const endpoint = options.enhanceSpeakers
+      ? '/elevenlabs/v1/transcribe'
+      : 'https://api.elevenlabs.io/v1/speech-to-text'
+    const isDirect = !options.enhanceSpeakers
+    const formData = new FormData()
+    formData.append('file', audioFile, audioFile instanceof File ? audioFile.name : 'audio.wav')
+    formData.append('model_id', options.model || 'scribe_v1')
+    if (options.language) {
+      formData.append(isDirect ? 'language_code' : 'language', options.language)
+    }
+    // Enable speaker diarization
+    formData.append(isDirect ? 'diarize' : 'enable_speaker_diarization', 'true')
+    if (options.numSpeakers) {
+      formData.append('num_speakers', options.numSpeakers.toString())
+    }
+
+    // Add speaker enhancement parameters if using wrapper endpoint
+    const params: Record<string, string> = {}
+    if (options.enhanceSpeakers) {
+      params.enhance_speakers = 'true'
+      if (options.userId) {
+        params.user_id = options.userId.toString()
+      }
+      if (options.speakerConfidenceThreshold !== undefined) {
+        params.similarity_threshold = options.speakerConfidenceThreshold.toString()
+      }
+    }
+
+    const response = await apiService.post(endpoint, formData, {
+      headers: {
+        'Content-Type': 'multipart/form-data',
+        // NOTE(security): VITE_* values are embedded in the client bundle
+        ...(isDirect && {
+          'xi-api-key': import.meta.env.VITE_ELEVENLABS_API_KEY || ''
+        })
+      },
+      params,
+      timeout: 180000
+    })
+    return response.data as ElevenLabsResponse
+  } catch (error) {
+    throw new Error(`ElevenLabs transcription failed: ${error instanceof Error ? error.message : 'Unknown error'}`)
+  }
+}
+
+/**
+ * Merge consecutive 'word' entries of one speaker into segments (confidence = mean of word confidences)
+ */
+export function processElevenLabsResponse(response: ElevenLabsResponse): ElevenLabsSegment[] {
+  const segments: ElevenLabsSegment[] = []
+
+  // Filter to only word types and group by speaker_id
+  const words = (response.words || []).filter(w => w.type === 'word')
+
+  if (words.length === 0) {
+    return segments
+  }
+
+  let currentSegment: ElevenLabsSegment | null = null
+  let wordsInSegment = 0
+  for (const word of words) {
+    const speakerId = word.speaker_id || 'speaker_0'
+    const speakerNum = parseInt(speakerId.replace('speaker_', ''), 10) || 0
+
+    // Convert logprob to confidence (logprob is typically 0 to -1, where 0 is highest confidence)
+    const confidence = logprobToConfidence(word.logprob)
+
+    if (!currentSegment || currentSegment.speaker !== speakerNum) {
+      // Start new segment
+      if (currentSegment) {
+        segments.push(currentSegment)
+      }
+      wordsInSegment = 1
+      currentSegment = {
+        start: word.start,
+        end: word.end,
+        speaker: speakerNum,
+        speakerId: speakerId,
+        speakerName: `Speaker ${speakerNum}`,
+        confidence: confidence,
+        text: word.text
+      }
+    } else {
+      wordsInSegment += 1
+      currentSegment.end = word.end
+      currentSegment.text += ' ' + word.text // 'spacing' entries were filtered out above
+      // Incremental mean so every word in the segment carries equal weight
+      currentSegment.confidence += (confidence - currentSegment.confidence) / wordsInSegment
+    }
+  }
+
+  // Push last segment
+  if (currentSegment) {
+    segments.push(currentSegment)
+  }
+
+  return segments
+}
+
+/**
+ * Convert ElevenLabs logprob to confidence score (0-1)
+ * logprob is typically in range [0, -1] where 0 is highest confidence
+ */
+function logprobToConfidence(logprob: number): number {
+  // ElevenLabs logprob: 0 = perfect confidence, negative = lower confidence
+  // Convert to 0-1 scale where 1 = perfect confidence
+  return 1.0 - Math.min(Math.abs(logprob), 1.0)
+}
+
+/**
+ * Calculate confidence summary statistics for segments
+ */
+export function calculateConfidenceSummary(segments: ElevenLabsSegment[]): {
+  total_segments: number
+  high_confidence: number
+  medium_confidence: number
+  low_confidence: number
+} {
+  return {
+    total_segments: segments.length,
+    high_confidence: segments.filter(s => s.confidence >= 0.8).length,
+    medium_confidence: segments.filter(s => s.confidence >= 0.6 && s.confidence < 0.8).length,
+    low_confidence: segments.filter(s => s.confidence >= 0.4 && s.confidence < 0.6).length
+  }
+}
diff --git a/extras/speaker-recognition/webui/src/services/speakerIdentification.ts b/extras/speaker-recognition/webui/src/services/speakerIdentification.ts index 59939a42..60165594 100644 --- a/extras/speaker-recognition/webui/src/services/speakerIdentification.ts +++ b/extras/speaker-recognition/webui/src/services/speakerIdentification.ts @@ -6,8 +6,9 @@ import { apiService } from './api' import { transcribeWithDeepgram, processDeepgramResponse, calculateConfidenceSummary, DeepgramResponse } from './deepgram' +import { transcribeWithElevenLabs, processElevenLabsResponse, calculateConfidenceSummary as calculateElevenLabsConfidenceSummary, ElevenLabsResponse } from './elevenlabs' -export type ProcessingMode = 'diarization-only' | 'speaker-identification' | 'deepgram-enhanced' | 'deepgram-transcript-internal-speakers' | 'diarize-identify-match' +export type ProcessingMode = 'diarization-only' | 'speaker-identification' | 'deepgram-enhanced' | 'deepgram-transcript-internal-speakers' | 'diarize-identify-match' | 'elevenlabs-enhanced' export interface SpeakerSegment { start: number @@ -51,6 +52,7 @@ export interface ProcessingResult { low_confidence: number } deepgram_response?: DeepgramResponse + elevenlabs_response?: ElevenLabsResponse processing_time?: number error?: string } @@ -94,6 +96,9 @@ export class SpeakerIdentificationService { case 'deepgram-transcript-internal-speakers': result = await this.processWithHybrid(audioFile, options) break + case 'elevenlabs-enhanced': + result = await this.processWithElevenLabs(audioFile, options) + break case
'diarization-only': result = await this.processWithDiarizationOnly(audioFile, options) break
@@ -220,6 +225,60 @@ export class SpeakerIdentificationService { } }
+  /**
+   * Transcribe with ElevenLabs, group words into speaker segments and wrap them as a ProcessingResult
+   */
+  private async processWithElevenLabs(
+    audioFile: File | Blob,
+    options: ProcessingOptions
+  ): Promise<ProcessingResult> {
+    try {
+      const filename = audioFile instanceof File ? audioFile.name : 'Audio'
+
+      // Use ElevenLabs service (speaker enhancement stays on unless explicitly disabled)
+      const elevenlabsResponse = await transcribeWithElevenLabs(audioFile, {
+        enhanceSpeakers: options.enhanceSpeakers !== false,
+        userId: options.userId,
+        speakerConfidenceThreshold: options.confidenceThreshold ?? 0.15,
+        numSpeakers: options.maxSpeakers
+      })
+
+      // Process response using ElevenLabs service
+      const elevenlabsSegments = processElevenLabsResponse(elevenlabsResponse)
+
+      // Convert to SpeakerSegment format
+      const speakerSegments: SpeakerSegment[] = elevenlabsSegments.map(segment => ({
+        start: segment.start,
+        end: segment.end,
+        speaker_id: segment.speakerId || `speaker_${segment.speaker}`,
+        speaker_name: segment.speakerName || `Speaker ${segment.speaker}`,
+        confidence: segment.confidence,
+        text: segment.text,
+        identified_speaker_id: segment.identifiedSpeakerId,
+        identified_speaker_name: segment.identifiedSpeakerName,
+        speaker_identification_confidence: segment.speakerIdentificationConfidence,
+        speaker_status: segment.speakerStatus
+      }))
+
+      // Calculate confidence summary
+      const confidenceSummary = calculateElevenLabsConfidenceSummary(elevenlabsSegments)
+
+      return {
+        id: Math.random().toString(36),
+        filename,
+        duration: this.estimateDuration(speakerSegments),
+        status: 'completed',
+        created_at: new Date().toISOString(),
+        mode: 'elevenlabs-enhanced',
+        speakers: speakerSegments,
+        confidence_summary: confidenceSummary,
+        elevenlabs_response: elevenlabsResponse
+      }
+    } catch (error) {
+      throw new Error(`ElevenLabs processing failed: ${error instanceof Error ? error.message : 'Unknown error'}`)
+    }
+  }
+
/** * Process with hybrid
mode (Deepgram transcription + internal diarization) */ @@ -515,6 +574,11 @@ export class SpeakerIdentificationService { name: 'Deepgram Enhanced', description: 'Deepgram transcription + diarization + replace speakers with enrolled IDs' }, + { + mode: 'elevenlabs-enhanced', + name: 'ElevenLabs Enhanced', + description: 'ElevenLabs transcription (99 languages) + diarization + speaker identification' + }, { mode: 'deepgram-transcript-internal-speakers', name: 'Deepgram Transcript + Internal Speakers', diff --git a/wizard.py b/wizard.py index cfc5b861..15609ce2 100755 --- a/wizard.py +++ b/wizard.py @@ -183,6 +183,12 @@ def run_service_setup(service_name, selected_services, https_enabled=False, serv cmd.extend(['--deepgram-api-key', deepgram_key]) console.print("[blue][INFO][/blue] Found existing DEEPGRAM_API_KEY from backend config, reusing") + # Pass ElevenLabs API key from backend if available + elevenlabs_key = read_env_value(backend_env_path, 'ELEVENLABS_API_KEY') + if elevenlabs_key and not is_placeholder(elevenlabs_key, 'your_elevenlabs_api_key_here', 'your-elevenlabs-api-key-here'): + cmd.extend(['--elevenlabs-api-key', elevenlabs_key]) + console.print("[blue][INFO][/blue] Found existing ELEVENLABS_API_KEY from backend config, reusing") + # Pass HF Token from existing speaker recognition .env if available speaker_env_path = 'extras/speaker-recognition/.env' hf_token = read_env_value(speaker_env_path, 'HF_TOKEN')