From 316ac2ba550f786d67161669550a6e05003c9c23 Mon Sep 17 00:00:00 2001
From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com>
Date: Tue, 21 Oct 2025 16:31:23 +0000
Subject: [PATCH] eleven labs support

---
 CLAUDE.md                                     |   7 +-
 backends/advanced/.env.template               |   8 +-
 .../advanced/Docs/elevenlabs-integration.md   | 920 ++++++++++++++++++
 backends/advanced/init.py                     |  27 +-
 .../models/transcription.py                   |   2 +-
 .../services/transcription/__init__.py        |  15 +
 .../services/transcription/elevenlabs.py      | 173 ++++
 extras/speaker-recognition/.env.template      |   1 +
 extras/speaker-recognition/init.py            |  10 +
 .../api/routers/__init__.py                   |   6 +-
 .../api/routers/elevenlabs_wrapper.py         | 391 ++++++++
 .../simple_speaker_recognition/api/service.py |   9 +-
 .../utils/elevenlabs_parser.py                | 221 +++++
 .../src/components/ProcessingModeSelector.tsx |  23 +-
 .../webui/src/services/elevenlabs.ts          | 185 ++++
 .../src/services/speakerIdentification.ts     |  66 +-
 wizard.py                                     |   6 +
 17 files changed, 2037 insertions(+), 33 deletions(-)
 create mode 100644 backends/advanced/Docs/elevenlabs-integration.md
 create mode 100644 backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py
 create mode 100644 extras/speaker-recognition/src/simple_speaker_recognition/api/routers/elevenlabs_wrapper.py
 create mode 100644 extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py
 create mode 100644 extras/speaker-recognition/webui/src/services/elevenlabs.ts

diff --git a/CLAUDE.md b/CLAUDE.md
index 9c06c934..76dca3dd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -286,10 +286,9 @@ Friend-Lite supports multiple transcription services:
 TRANSCRIPTION_PROVIDER=deepgram
 DEEPGRAM_API_KEY=your-deepgram-key-here
 
-# Option 2: Mistral (Voxtral models)
-TRANSCRIPTION_PROVIDER=mistral
-MISTRAL_API_KEY=your-mistral-key-here
-MISTRAL_MODEL=voxtral-mini-2507
+# Option 2: ElevenLabs (99 languages, speaker diarization)
+TRANSCRIPTION_PROVIDER=elevenlabs
+ELEVENLABS_API_KEY=your-elevenlabs-key-here
 
 # Option 3: Local ASR (Parakeet)
 PARAKEET_ASR_URL=http://host.docker.internal:8767
diff --git a/backends/advanced/.env.template b/backends/advanced/.env.template
index 44a88de6..afd04f56 100644
--- a/backends/advanced/.env.template
+++ b/backends/advanced/.env.template
@@ -50,10 +50,14 @@ OPENAI_MODEL=gpt-4o-mini
 # Option 1: Deepgram (recommended for best transcription quality)
 DEEPGRAM_API_KEY=
 
-# Option 2: Parakeet ASR service from extras/asr-services
+# Option 2: ElevenLabs (high quality with 99 language support)
+# Get your API key from: https://elevenlabs.io/app/settings/api-keys
+# ELEVENLABS_API_KEY=
+
+# Option 3: Parakeet ASR service from extras/asr-services
 # PARAKEET_ASR_URL=http://host.docker.internal:8767
 
-# Optional: Specify which provider to use ('deepgram' or 'parakeet')
+# Optional: Specify which provider to use ('deepgram', 'elevenlabs', or 'parakeet')
 # If not set, will auto-select based on available configuration (Deepgram preferred)
 # TRANSCRIPTION_PROVIDER=
 
diff --git a/backends/advanced/Docs/elevenlabs-integration.md b/backends/advanced/Docs/elevenlabs-integration.md
new file mode 100644
index 00000000..a927edec
--- /dev/null
+++ b/backends/advanced/Docs/elevenlabs-integration.md
@@ -0,0 +1,920 @@
+# ElevenLabs Speech-to-Text Integration Guide
+
+## Overview
+
+This document outlines the integration of ElevenLabs Speech-to-Text (Scribe v1 model) as a transcription provider for Friend-Lite.
+
+## ElevenLabs Capabilities
+
+### Core Features
+- **Model**: Scribe v1 with state-of-the-art accuracy
+- **API Endpoint**: `https://api.elevenlabs.io/v1/speech-to-text`
+- **Authentication**: API key via `xi-api-key` header
+- **Languages**: 99 languages with automatic detection
+- **Speaker Diarization**: Up to 32 speakers
+- **Word-Level Timestamps**: Precise timing for each word
+- **Audio Events**: Optional detection of laughter, applause, etc.
+
+### Technical Specifications
+- **Mode**: Batch processing only (no streaming support)
+- **Format**: Multipart/form-data file upload
+- **Max File Size**: 3 GB
+- **Max Duration**: 10 hours
+- **Supported Formats**: 18+ audio formats (AAC, MP3, WAV, FLAC, Opus, WebM, etc.)
+
+### Output Format
+```json
+{
+  "text": "Full transcript text",
+  "language_code": "en",
+  "language_probability": 0.95,
+  "words": [
+    {
+      "text": "word",
+      "start": 0.5,
+      "end": 1.2,
+      "type": "word",
+      "speaker_id": "speaker_1",
+      "logprob": -0.05
+    }
+  ]
+}
+```
+
+## Pricing
+
+| Tier | Price/Month | Hours Included | Cost per Hour |
+|------|-------------|----------------|---------------|
+| Starter | $5 | 12.5 | $0.40 |
+| Creator | $22 | 62.85 | $0.35 |
+| Pro | $99 | 300 | $0.33 |
+| Scale | $330 | 1,100 | $0.30 |
+
+**Comparison**: Deepgram Nova-3 costs ~$0.36/hour (pay-as-you-go)
+
+## Integration Architecture
+
+### Provider Implementation
+
+Create `backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py`:
+
+```python
+"""
+ElevenLabs transcription provider implementation.
+
+Provides batch transcription using ElevenLabs Scribe v1 model.
+"""
+
+import io
+import logging
+from typing import Dict, Optional
+
+import httpx
+
+from advanced_omi_backend.models.transcription import BatchTranscriptionProvider
+
+logger = logging.getLogger(__name__)
+
+
+class ElevenLabsProvider(BatchTranscriptionProvider):
+    """ElevenLabs batch transcription provider using Scribe v1 model."""
+
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.url = "https://api.elevenlabs.io/v1/speech-to-text"
+
+    @property
+    def name(self) -> str:
+        return "elevenlabs"
+
+    async def transcribe(self, audio_data: bytes, sample_rate: int, diarize: bool = False) -> dict:
+        """Transcribe audio using ElevenLabs REST API.
+
+        Args:
+            audio_data: Raw audio bytes (will be converted to WAV format)
+            sample_rate: Audio sample rate
+            diarize: Whether to enable speaker diarization
+        """
+        try:
+            # Convert raw PCM to WAV format for ElevenLabs
+            wav_data = self._pcm_to_wav(audio_data, sample_rate)
+
+            # Prepare multipart form data
+            files = {
+                'file': ('audio.wav', io.BytesIO(wav_data), 'audio/wav')
+            }
+
+            data = {
+                'model_id': 'scribe_v1',
+                'diarize': 'true' if diarize else 'false',
+                'timestamps_granularity': 'word',
+                'tag_audio_events': 'false',  # Optional: set to true for laughter/applause detection
+            }
+
+            headers = {
+                'xi-api-key': self.api_key
+            }
+
+            logger.info(f"Sending {len(audio_data)} bytes to ElevenLabs API (diarize={diarize})")
+
+            # Calculate timeout based on audio duration
+            estimated_duration = len(audio_data) / (sample_rate * 2)  # 16-bit mono
+            processing_timeout = max(120, int(estimated_duration * 5))  # 5x audio duration
+
+            timeout_config = httpx.Timeout(
+                connect=30.0,
+                read=processing_timeout,
+                write=180.0,
+                pool=10.0,
+            )
+
+            logger.info(
+                f"Estimated audio duration: {estimated_duration:.1f}s, timeout: {processing_timeout}s"
+            )
+
+            async with httpx.AsyncClient(timeout=timeout_config) as client:
+                response = await client.post(
+                    self.url,
+                    headers=headers,
+                    data=data,
+                    files=files
+                )
+
+                if response.status_code == 200:
+                    result = response.json()
+                    logger.debug(f"ElevenLabs response: {result}")
+
+                    # Parse ElevenLabs response format
+                    transcript = result.get('text', '').strip()
+
+                    # Extract word-level data
+                    words = []
+                    segments = []
+
+                    if 'words' in result:
+                        # Map ElevenLabs format to Friend-Lite format
+                        for word_obj in result['words']:
+                            if word_obj.get('type') == 'word':  # Skip spacing/audio_events
+                                words.append({
+                                    'word': word_obj.get('text', ''),
+                                    'start': word_obj.get('start', 0),
+                                    'end': word_obj.get('end', 0),
+                                    'confidence': 1.0 - abs(word_obj.get('logprob', 0)),  # Convert logprob to confidence
+                                    'speaker': word_obj.get('speaker_id'),
+                                })
+
+                    # Extract speaker segments if diarization is enabled
+                    if diarize and words:
+                        segments = self._create_speaker_segments(words)
+
+                    logger.info(
+                        f"ElevenLabs transcription successful: {len(transcript)} chars, "
+                        f"{len(words)} words, {len(segments)} segments"
+                    )
+
+                    return {
+                        "text": transcript,
+                        "words": words,
+                        "segments": segments,
+                    }
+                else:
+                    logger.error(f"ElevenLabs API error: {response.status_code} - {response.text}")
+                    return {"text": "", "words": [], "segments": []}
+
+        except httpx.TimeoutException as e:
+            logger.error(f"Timeout during ElevenLabs API call: {e}")
+            return {"text": "", "words": [], "segments": []}
+        except Exception as e:
+            logger.error(f"Error calling ElevenLabs API: {e}")
+            return {"text": "", "words": [], "segments": []}
+
+    def _pcm_to_wav(self, pcm_data: bytes, sample_rate: int) -> bytes:
+        """Convert raw PCM data to WAV format."""
+        import wave
+        import io
+
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(pcm_data)
+
+        return wav_buffer.getvalue()
+
+    def _create_speaker_segments(self, words: list) -> list:
+        """Group consecutive words by speaker into segments."""
+        segments = []
+        current_speaker = None
+        current_segment = None
+
+        for word in words:
+            speaker = word.get('speaker')
+            if speaker is None:
+                continue
+
+            if speaker == current_speaker and current_segment:
+                # Extend current segment
+                current_segment['text'] += ' ' + word['word']
+                current_segment['end'] = word['end']
+            else:
+                # Save previous segment and start new one
+                if current_segment:
+                    segments.append(current_segment)
+                current_segment = {
+                    'text': word['word'],
+                    'speaker': f"Speaker {speaker}",
+                    'start': word['start'],
+                    'end': word['end'],
+                    'confidence': word.get('confidence'),
+                }
+                current_speaker = speaker
+
+        # Don't forget the last segment
+        if current_segment:
+            segments.append(current_segment)
+
+        return segments
+```
+
+### Factory Integration
+
+Update `backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py`:
+
+#### 1. Add Import
+```python
+from advanced_omi_backend.services.transcription.elevenlabs import ElevenLabsProvider
+```
+
+#### 2. Update `get_transcription_provider()` Function
+
+Add after line 46:
+```python
+def get_transcription_provider(
+    provider_name: Optional[str] = None,
+    mode: Optional[str] = None,
+) -> Optional[BaseTranscriptionProvider]:
+    deepgram_key = os.getenv("DEEPGRAM_API_KEY")
+    parakeet_url = os.getenv("PARAKEET_ASR_URL")
+    elevenlabs_key = os.getenv("ELEVENLABS_API_KEY")  # Add this line
+
+    # ... existing code ...
+
+    # Add after parakeet/offline sections (around line 88):
+    elif provider_name == "elevenlabs":
+        if not elevenlabs_key:
+            raise RuntimeError(
+                "ElevenLabs transcription provider requested but ELEVENLABS_API_KEY not configured"
+            )
+        logger.info(f"Using ElevenLabs transcription provider in {mode} mode")
+        if mode == "streaming":
+            raise RuntimeError("ElevenLabs does not support streaming mode - use batch mode")
+        return ElevenLabsProvider(elevenlabs_key)
+```
+
+#### 3. Update `__all__` Export
+```python
+__all__ = [
+    "get_transcription_provider",
+    "DeepgramProvider",
+    "DeepgramStreamingProvider",
+    "DeepgramStreamConsumer",
+    "ParakeetProvider",
+    "ParakeetStreamingProvider",
+    "ElevenLabsProvider",  # Add this
+]
+```
+
+### Model Update
+
+Update `backends/advanced/src/advanced_omi_backend/models/transcription.py`:
+
+```python
+class TranscriptionProvider(Enum):
+    """Available transcription providers for audio stream routing."""
+    DEEPGRAM = "deepgram"
+    PARAKEET = "parakeet"
+    MISTRAL = "mistral"
+    ELEVENLABS = "elevenlabs"  # Add this line
+```
+
+## Configuration
+
+### Environment Variables
+
+Update `backends/advanced/.env.template` (around line 48):
+
+```bash
+# ========================================
+# SPEECH-TO-TEXT CONFIGURATION (Choose one)
+# ========================================
+
+# Option 1: Deepgram (recommended for best transcription quality)
+DEEPGRAM_API_KEY=
+
+# Option 2: ElevenLabs (high quality with 99 language support)
+# Get your API key from: https://elevenlabs.io/app/settings/api-keys
+# ELEVENLABS_API_KEY=
+
+# Option 3: Mistral (Voxtral models)
+# MISTRAL_API_KEY=
+# MISTRAL_MODEL=voxtral-mini-2507
+
+# Option 4: Parakeet ASR service from extras/asr-services
+# PARAKEET_ASR_URL=http://host.docker.internal:8767
+
+# Optional: Specify which provider to use ('deepgram', 'elevenlabs', 'mistral', or 'parakeet')
+# If not set, will auto-select based on available configuration (Deepgram preferred)
+# TRANSCRIPTION_PROVIDER=elevenlabs
+```
+
+### Usage Example
+
+```bash
+# In .env file
+ELEVENLABS_API_KEY=sk_your_api_key_here
+TRANSCRIPTION_PROVIDER=elevenlabs
+
+# Start the backend
+docker compose up --build -d
+```
+
+## Implementation Checklist
+
+- [ ] Create `elevenlabs.py` provider implementation
+- [ ] Update `__init__.py` factory function
+- [ ] Add `ELEVENLABS` to `TranscriptionProvider` enum
+- [ ] Update `.env.template` with configuration
+- [ ] Update `CLAUDE.md` documentation
+- [ ] Run integration tests
+- [ ] Update API documentation
+
+## Testing
+
+### Unit Tests
+```bash
+cd backends/advanced
+uv run pytest tests/test_transcription_providers.py -k elevenlabs
+```
+
+### Integration Tests
+```bash
+cd backends/advanced
+
+# Set environment variables
+export ELEVENLABS_API_KEY=sk_your_key_here
+export TRANSCRIPTION_PROVIDER=elevenlabs
+
+# Run full integration test
+./run-test.sh
+```
+
+### Manual Testing
+```bash
+# Test with audio file upload
+curl -X POST http://localhost:8000/api/audio/upload \
+  -H "Authorization: Bearer YOUR_JWT_TOKEN" \
+  -F "audio_file=@test_audio.wav" \
+  -F "client_id=test-client"
+```
+
+## Key Implementation Notes
+
+### 1. File Format Conversion
+ElevenLabs requires proper audio file formats (not raw bytes). The provider converts raw PCM to WAV:
+```python
+def _pcm_to_wav(self, pcm_data: bytes, sample_rate: int) -> bytes:
+    """Convert raw PCM data to WAV format."""
+```
+
+### 2. Batch-Only Processing
+ElevenLabs does not support streaming transcription. All audio must be sent as complete files.
+
+### 3. Confidence Score Mapping
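+ElevenLabs returns `logprob` (log probability, always ≤ 0) rather than a confidence score. The provider approximates confidence with the heuristic below; note that it drops below 0 when `logprob < -1`, whereas the exact probability would be `exp(logprob)`: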
+```python
+'confidence': 1.0 - abs(word_obj.get('logprob', 0))
+```
+
+### 4. Speaker Diarization
+Speaker diarization is built in: when `diarize` is enabled, each word in the response carries a `speaker_id` label (an anonymous label, not an enrolled identity). The provider groups consecutive words from the same speaker into segments.
+
+### 5. Timeout Configuration
+Processing timeout is dynamically calculated based on audio duration:
+```python
+processing_timeout = max(120, int(estimated_duration * 5))  # 5x audio duration
+```
+
+## Comparison with Other Providers
+
+| Feature | ElevenLabs | Deepgram | Parakeet |
+|---------|------------|----------|----------|
+| **Streaming** | ❌ No | ✅ Yes | ✅ Yes |
+| **Batch** | ✅ Yes | ✅ Yes | ✅ Yes |
+| **Languages** | 99 languages | Multi-language | English-focused |
+| **Diarization** | ✅ 32 speakers | ✅ Yes | ❌ No |
+| **Word Timestamps** | ✅ Yes | ✅ Yes | ✅ Yes |
+| **Cost** | $0.30-0.40/hr | ~$0.36/hr | Free (self-hosted) |
+| **Setup** | API key | API key | Docker service |
+
+## Limitations
+
+1. **No Streaming**: Real-time transcription not supported
+2. **File Size**: Maximum 3 GB per file
+3. **Duration**: Maximum 10 hours per file
+4. **API Dependency**: Requires internet connection and ElevenLabs service availability
+5. **Confidence Scores**: Log probability conversion may not be as accurate as native confidence scores
+
+## Future Enhancements
+
+1. **Async Processing**: Use webhooks for long audio files
+2. **Audio Events**: Enable `tag_audio_events` for laughter/applause detection
+3. **Multi-Channel**: Support `use_multi_channel` for separate channel transcription
+4. **Custom Formats**: Support additional output formats (SRT, DOCX, PDF)
+
+## Speaker Recognition Service Integration
+
+The speaker recognition service (`extras/speaker-recognition`) also needs ElevenLabs support to enhance transcriptions with speaker identification.
+
+### Current Architecture
+
+The speaker recognition service:
+1. Acts as a proxy/wrapper for transcription services (currently Deepgram only)
+2. Forwards transcription requests to the ASR provider
+3. Enhances responses with speaker identification using enrolled speakers
+4. Returns enriched transcripts with `identified_speaker_id` and `identified_speaker_name`
+
+### Integration Components Required
+
+#### 1. ElevenLabs Parser (`elevenlabs_parser.py`)
+Similar to `deepgram_parser.py`, this will:
+- Parse ElevenLabs JSON response format
+- Extract speaker segments from word-level data
+- Group consecutive words by speaker_id
+- Convert logprob to confidence scores
+- Provide speaker statistics
+
+#### 2. ElevenLabs Wrapper Endpoint (`elevenlabs_wrapper.py`)
+Similar to `deepgram_wrapper.py`, this will:
+- Accept audio file uploads
+- Forward to ElevenLabs API with diarization enabled
+- Extract speaker segments from response
+- Identify speakers using enrolled voice embeddings
+- Return enhanced response with speaker identification
+
+#### 3. Service Configuration Updates
+- Add `elevenlabs_api_key` to Settings
+- Register ElevenLabs router
+- Update environment templates
+
+### Key Differences from Deepgram Integration
+
+| Aspect | Deepgram | ElevenLabs |
+|--------|----------|------------|
+| **Speaker Field** | `speaker` (integer) | `speaker_id` (string) |
+| **Confidence** | Native `confidence` field | Derived from `logprob` |
+| **Word Filtering** | All words included | Filter by `type == "word"` |
+| **API Endpoint** | `/v1/listen` | `/v1/speech-to-text` |
+| **Auth Header** | `Authorization: Token` | `xi-api-key` |
+| **Request Format** | Query params + raw audio | Multipart form data |
+
+### Implementation Plan
+
+1. **Create Parser** (`utils/elevenlabs_parser.py`)
+   - Parse JSON response
+   - Group words by `speaker_id`
+   - Filter word-type entries only
+   - Convert logprob to confidence
+
+2. **Create Wrapper** (`api/routers/elevenlabs_wrapper.py`)
+   - `/elevenlabs/v1/transcribe` endpoint
+   - Forward to ElevenLabs API
+   - Enhance with speaker identification
+   - Return enriched response
+
+3. **Update Configuration**
+   - Add `ELEVENLABS_API_KEY` to `.env.template`
+   - Add field to Settings class
+   - Register router in main app
+
+4. **Testing**
+   - Unit tests for parser
+   - Integration tests for wrapper
+   - End-to-end with real audio
+
+### Usage Example
+
+```bash
+# Transcribe with speaker identification
+curl -X POST \
+  "http://localhost:8085/elevenlabs/v1/transcribe?user_id=1&enhance_speakers=true&speaker_confidence_threshold=0.15" \
+  -H "xi-api-key: YOUR_ELEVENLABS_KEY" \
+  -F "file=@audio.wav" \
+  -F "diarize=true" \
+  -F "model_id=scribe_v1"
+```
+
+Expected response with enhancement:
+```json
+{
+  "text": "Hello, how are you today?",
+  "language_code": "en",
+  "words": [
+    {
+      "text": "Hello",
+      "start": 0.1,
+      "end": 0.5,
+      "type": "word",
+      "speaker_id": "speaker_1",
+      "identified_speaker_id": 42,
+      "identified_speaker_name": "John Doe",
+      "speaker_identification_confidence": 0.87,
+      "speaker_status": "IDENTIFIED"
+    }
+  ],
+  "speaker_enhancement": {
+    "enabled": true,
+    "provider": "elevenlabs",
+    "user_id": 1,
+    "identified_speakers": {
+      "speaker_1": {
+        "speaker_id": 42,
+        "speaker_name": "John Doe",
+        "confidence": 0.87
+      }
+    },
+    "total_segments": 3,
+    "identified_segments": 2
+  }
+}
+```
+
+## Web UI Integration (Batch Inference)
+
+### Overview
+
+The Speaker Recognition Web UI provides a batch inference page where users can upload audio files and process them with different transcription providers. This integration adds ElevenLabs as a processing mode option alongside existing Deepgram modes.
+
+**Note**: ElevenLabs is **batch-only** and does not support real-time streaming. The live inference pages will continue to use Deepgram WebSocket for real-time transcription.
+
+### Architecture
+
+```
+User uploads audio file
+        ↓
+[Inference Page UI]
+        ↓
+Select "ElevenLabs Transcribe" mode
+        ↓
+Frontend service: elevenlabs.ts
+        ↓
+POST /elevenlabs/v1/transcribe
+        ↓
+Backend wrapper → ElevenLabs API
+        ↓
+Enhanced response with speaker IDs
+        ↓
+Display results with transcription + speakers
+```
+
+### Implementation Details
+
+#### 1. ElevenLabs Service (`webui/src/services/elevenlabs.ts`)
+
+Create new service module similar to `deepgram.ts`:
+
+```typescript
+// Type definitions
+export interface ElevenLabsTranscriptionOptions {
+  model_id?: string               // Default: 'scribe_v1'
+  diarize?: boolean               // Default: true
+  timestamps_granularity?: string // Default: 'word'
+  tag_audio_events?: boolean      // Default: false
+  enhanceSpeakers?: boolean       // Enable speaker identification
+  userId?: number                 // For speaker enhancement
+  speakerConfidenceThreshold?: number
+}
+
+export interface ElevenLabsWord {
+  text: string
+  start: number
+  end: number
+  type: 'word' | 'spacing' | 'audio_event'
+  speaker_id?: string
+  logprob?: number
+  identified_speaker_id?: string
+  identified_speaker_name?: string
+  speaker_identification_confidence?: number
+  speaker_status?: string
+}
+
+export interface ElevenLabsResponse {
+  text: string
+  language_code?: string
+  language_probability?: number
+  words: ElevenLabsWord[]
+  speaker_enhancement?: {
+    enabled: boolean
+    provider: string
+    user_id?: number
+    identified_speakers: Record<string, any>
+    total_segments: number
+  }
+}
+
+// Main transcription function
+export async function transcribeWithElevenLabs(
+  file: File | Blob,
+  options: ElevenLabsTranscriptionOptions = {}
+): Promise<ElevenLabsResponse> {
+  const formData = new FormData()
+  formData.append('file', file)
+  formData.append('model_id', options.model_id || 'scribe_v1')
+  formData.append('diarize', String(options.diarize !== false))
+  formData.append('timestamps_granularity', options.timestamps_granularity || 'word')
+  formData.append('tag_audio_events', String(options.tag_audio_events || false))
+
+  const params = new URLSearchParams()
+  if (options.enhanceSpeakers && options.userId) {
+    params.append('user_id', String(options.userId))
+    params.append('enhance_speakers', 'true')
+    params.append('speaker_confidence_threshold', String(options.speakerConfidenceThreshold || 0.15))
+  }
+
+  const response = await apiService.post(
+    `/elevenlabs/v1/transcribe?${params}`,
+    formData
+  )
+  return response.data
+}
+
+// Process response into segments
+export function processElevenLabsResponse(response: ElevenLabsResponse) {
+  // Filter only words (skip spacing and audio events)
+  const words = response.words.filter(w => w.type === 'word')
+
+  // Group consecutive words by speaker_id
+  const segments = []
+  let currentSegment = null
+
+  for (const word of words) {
+    if (!word.speaker_id) continue
+
+    if (currentSegment && currentSegment.speaker_id === word.speaker_id) {
+      // Extend current segment
+      currentSegment.text += ' ' + word.text
+      currentSegment.end = word.end
+    } else {
+      // Save previous and start new
+      if (currentSegment) segments.push(currentSegment)
+      currentSegment = {
+        speaker_id: word.speaker_id,
+        speaker: word.identified_speaker_name || `Speaker ${word.speaker_id}`,
+        text: word.text,
+        start: word.start,
+        end: word.end,
+        confidence: 1.0 - Math.abs(word.logprob || 0),
+        identifiedSpeakerId: word.identified_speaker_id,
+        identifiedSpeakerName: word.identified_speaker_name,
+        speakerIdentificationConfidence: word.speaker_identification_confidence,
+        speakerStatus: word.speaker_status
+      }
+    }
+  }
+  if (currentSegment) segments.push(currentSegment)
+
+  return segments
+}
+
+// Calculate confidence summary
+export function calculateConfidenceSummary(segments) {
+  const total = segments.length
+  const high = segments.filter(s => s.confidence >= 0.8).length
+  const medium = segments.filter(s => s.confidence >= 0.5 && s.confidence < 0.8).length
+  const low = segments.filter(s => s.confidence < 0.5).length
+
+  return {
+    total_segments: total,
+    high_confidence: high,
+    medium_confidence: medium,
+    low_confidence: low
+  }
+}
+```
+
+#### 2. Speaker Identification Service Updates
+
+**File**: `webui/src/services/speakerIdentification.ts`
+
+Add ElevenLabs processing mode:
+
+```typescript
+// Update type
+export type ProcessingMode =
+  | 'diarization-only'
+  | 'speaker-identification'
+  | 'deepgram-enhanced'
+  | 'deepgram-transcript-internal-speakers'
+  | 'diarize-identify-match'
+  | 'elevenlabs-enhanced'  // NEW
+
+// Add processing method
+private async processWithElevenLabs(
+  audioFile: File | Blob,
+  options: ProcessingOptions
+): Promise<ProcessingResult> {
+  try {
+    const filename = audioFile instanceof File ? audioFile.name : 'Audio'
+
+    const elevenlabsResponse = await transcribeWithElevenLabs(audioFile, {
+      enhanceSpeakers: options.enhanceSpeakers !== false,
+      userId: options.userId,
+      speakerConfidenceThreshold: options.confidenceThreshold || 0.15,
+    })
+
+    const elevenlabsSegments = processElevenLabsResponse(elevenlabsResponse)
+
+    const speakerSegments: SpeakerSegment[] = elevenlabsSegments.map(segment => ({
+      start: segment.start,
+      end: segment.end,
+      speaker_id: segment.speaker_id,
+      speaker_name: segment.identifiedSpeakerName || segment.speaker,
+      confidence: segment.confidence,
+      text: segment.text,
+      identified_speaker_id: segment.identifiedSpeakerId,
+      identified_speaker_name: segment.identifiedSpeakerName,
+      speaker_identification_confidence: segment.speakerIdentificationConfidence,
+      speaker_status: segment.speakerStatus
+    }))
+
+    const confidenceSummary = calculateConfidenceSummary(elevenlabsSegments)
+
+    return {
+      id: Math.random().toString(36),
+      filename,
+      duration: this.estimateDuration(speakerSegments),
+      status: 'completed',
+      created_at: new Date().toISOString(),
+      mode: 'elevenlabs-enhanced',
+      speakers: speakerSegments,
+      confidence_summary: confidenceSummary,
+    }
+  } catch (error) {
+    throw new Error(`ElevenLabs processing failed: ${error.message}`)
+  }
+}
+
+// Update processAudio switch
+async processAudio(audioFile: File | Blob, options: ProcessingOptions): Promise<ProcessingResult> {
+  const startTime = Date.now()
+
+  try {
+    let result: ProcessingResult
+
+    switch (options.mode) {
+      case 'elevenlabs-enhanced':  // NEW
+        result = await this.processWithElevenLabs(audioFile, options)
+        break
+      case 'deepgram-enhanced':
+        result = await this.processWithDeepgram(audioFile, options)
+        break
+      // ... other cases
+    }
+
+    result.processing_time = Date.now() - startTime
+    return result
+  } catch (error) {
+    // ... error handling
+  }
+}
+```
+
+#### 3. Processing Mode Selector Updates
+
+**File**: `webui/src/components/ProcessingModeSelector.tsx`
+
+Add ElevenLabs to modes array:
+
+```typescript
+const PROCESSING_MODES: ProcessingModeConfig[] = [
+  {
+    mode: 'speaker-identification',
+    name: 'Speaker Identification',
+    description: 'Diarization + speaker recognition only',
+    icon: '🎯',
+    color: 'bg-blue-600 hover:bg-blue-700',
+    features: ['Speaker diarization', 'Speaker identification', 'Confidence scoring']
+  },
+  {
+    mode: 'deepgram-enhanced',
+    name: 'Transcribe + Identify',
+    description: 'Full transcription with enhanced speaker ID',
+    icon: '🚀',
+    color: 'bg-green-600 hover:bg-green-700',
+    requirements: ['Deepgram API key'],
+    features: ['High-quality transcription', 'Speaker diarization', 'Enhanced speaker identification', 'Word-level timing']
+  },
+  {
+    mode: 'elevenlabs-enhanced',  // NEW
+    name: 'ElevenLabs Transcribe',
+    description: '99 languages with speaker diarization',
+    icon: '🌐',
+    color: 'bg-indigo-600 hover:bg-indigo-700',
+    requirements: ['ElevenLabs API key (configured in backend)'],
+    features: [
+      '99 language support',
+      'Built-in speaker diarization',
+      'Enhanced speaker identification',
+      'Word-level timestamps'
+    ]
+  },
+  // ... other modes
+]
+```
+
+### Usage Flow
+
+1. **User Navigation**: Navigate to `/inference` page
+2. **Audio Input**: Upload an audio file or record audio
+3. **Mode Selection**: Select "🌐 ElevenLabs Transcribe" from dropdown
+4. **Processing**: Click "Process Audio" button
+5. **Backend Flow**:
+   - Frontend → `POST /elevenlabs/v1/transcribe`
+   - Backend wrapper → ElevenLabs API
+   - Speaker enhancement adds `identified_speaker_name` fields
+6. **Results Display**: View transcription with speaker identification
+
+### Limitations
+
+**Batch-Only Processing:**
+- ✅ Works: `/inference` page (batch file upload)
+- ❌ Doesn't work: `/infer-live-simple` and `/infer-live` (require WebSocket streaming)
+- ElevenLabs does not support real-time streaming, so live inference pages will continue using Deepgram
+
+**Language Support:**
+- Automatic language detection (99 languages)
+- No need to specify language code
+
+**Speaker Diarization:**
+- Maximum 32 speakers
+- Automatic speaker detection
+- No manual speaker count configuration needed
+
+### Testing
+
+```bash
+# 1. Ensure backend is running with ElevenLabs configured
+cd extras/speaker-recognition
+docker compose up -d
+
+# 2. Navigate to web UI
+open https://your-host:8444/inference
+
+# 3. Test workflow:
+# - Upload a WAV file with multiple speakers
+# - Select "ElevenLabs Transcribe" mode
+# - Set user_id for speaker identification
+# - Click "Process Audio"
+# - Verify transcription and speaker names appear
+
+# 4. Check network requests:
+# POST /elevenlabs/v1/transcribe
+# Response should include speaker_enhancement metadata
+```
+
+### Integration Checklist
+
+**Advanced Backend:**
+- [x] Create `services/transcription/elevenlabs.py`
+- [x] Update `services/transcription/__init__.py`
+- [x] Update `models/transcription.py` enum
+- [x] Update `.env.template`
+- [x] Update `init.py` wizard
+- [x] Update root `wizard.py`
+- [x] Configure API key in `.env`
+- [x] Update CLAUDE.md
+- [ ] Run integration tests
+
+**Speaker Recognition Service (Backend):**
+- [x] Create `utils/elevenlabs_parser.py`
+- [x] Create `api/routers/elevenlabs_wrapper.py`
+- [x] Update `api/service.py` Settings class
+- [x] Register ElevenLabs router in main app
+- [x] Update `.env.template`
+- [x] Update `init.py` wizard
+- [x] Configure API key in `.env`
+- [ ] Add parser unit tests
+- [ ] Add wrapper integration tests
+
+**Speaker Recognition Web UI:**
+- [x] Create `webui/src/services/elevenlabs.ts`
+- [x] Update `webui/src/services/speakerIdentification.ts`
+- [x] Update `webui/src/components/ProcessingModeSelector.tsx`
+- [ ] Test batch inference with ElevenLabs mode
+
+## References
+
+- [ElevenLabs Speech-to-Text Docs](https://elevenlabs.io/docs/capabilities/speech-to-text)
+- [API Reference](https://elevenlabs.io/docs/api-reference/speech-to-text/convert)
+- [Quickstart Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/quickstart)
+- [Friend-Lite Transcription Architecture](./transcription-architecture.md)
diff --git a/backends/advanced/init.py b/backends/advanced/init.py
index 4e345ec1..8523f32a 100644
--- a/backends/advanced/init.py
+++ b/backends/advanced/init.py
@@ -135,14 +135,14 @@ def setup_authentication(self):
     def setup_transcription(self):
         """Configure transcription provider"""
         self.print_section("Speech-to-Text Configuration")
-        
+
         choices = {
             "1": "Deepgram (recommended - high quality, requires API key)",
-            "2": "Mistral (Voxtral models - requires API key)", 
+            "2": "ElevenLabs (99 languages, speaker diarization - requires API key)",
             "3": "Offline (Parakeet ASR - requires GPU, runs locally)",
             "4": "None (skip transcription setup)"
         }
-        
+
         choice = self.prompt_choice("Choose your transcription provider:", choices, "1")
 
         if choice == "1":
@@ -167,26 +167,23 @@ def setup_transcription(self):
                 self.console.print("[yellow][WARNING][/yellow] No API key provided - transcription will not work")
 
         elif choice == "2":
-            self.config["TRANSCRIPTION_PROVIDER"] = "mistral"
-            self.console.print("[blue][INFO][/blue] Mistral selected")
-            self.console.print("Get your API key from: https://console.mistral.ai/")
+            self.console.print("[blue][INFO][/blue] ElevenLabs selected")
+            self.console.print("Get your API key from: https://elevenlabs.io/app/settings/api-keys")
 
             # Check for existing API key
-            existing_key = self.read_existing_env_value("MISTRAL_API_KEY")
-            if existing_key and existing_key not in ['your_mistral_api_key_here', 'your-mistral-key-here']:
+            existing_key = self.read_existing_env_value("ELEVENLABS_API_KEY")
+            if existing_key and existing_key not in ['your_elevenlabs_api_key_here', 'your-elevenlabs-key-here']:
                 masked_key = self.mask_api_key(existing_key)
-                prompt_text = f"Mistral API key ({masked_key}) [press Enter to reuse, or enter new]"
+                prompt_text = f"ElevenLabs API key ({masked_key}) [press Enter to reuse, or enter new]"
                 api_key_input = self.prompt_value(prompt_text, "")
                 api_key = api_key_input if api_key_input else existing_key
             else:
-                api_key = self.prompt_value("Mistral API key (leave empty to skip)", "")
-
-            model = self.prompt_value("Mistral model", "voxtral-mini-2507")
+                api_key = self.prompt_value("ElevenLabs API key (leave empty to skip)", "")
 
             if api_key:
-                self.config["MISTRAL_API_KEY"] = api_key
-                self.config["MISTRAL_MODEL"] = model
-                self.console.print("[green][SUCCESS][/green] Mistral configured")
+                self.config["TRANSCRIPTION_PROVIDER"] = "elevenlabs"
+                self.config["ELEVENLABS_API_KEY"] = api_key
+                self.console.print("[green][SUCCESS][/green] ElevenLabs configured")
             else:
                 self.console.print("[yellow][WARNING][/yellow] No API key provided - transcription will not work")
 
diff --git a/backends/advanced/src/advanced_omi_backend/models/transcription.py b/backends/advanced/src/advanced_omi_backend/models/transcription.py
index 13893a68..ef19b0c9 100644
--- a/backends/advanced/src/advanced_omi_backend/models/transcription.py
+++ b/backends/advanced/src/advanced_omi_backend/models/transcription.py
@@ -36,7 +36,7 @@ class TranscriptionProvider(Enum):
     """Available transcription providers for audio stream routing."""
     DEEPGRAM = "deepgram"
     PARAKEET = "parakeet"
-    MISTRAL = "mistral"
+    ELEVENLABS = "elevenlabs"
 
 
 class BaseTranscriptionProvider(abc.ABC):
diff --git a/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py b/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py
index 9036aa61..72d7c153 100644
--- a/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py
+++ b/backends/advanced/src/advanced_omi_backend/services/transcription/__init__.py
@@ -20,6 +20,9 @@
     ParakeetProvider,
     ParakeetStreamingProvider,
 )
+from advanced_omi_backend.services.transcription.elevenlabs import (
+    ElevenLabsProvider,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -44,6 +47,7 @@ def get_transcription_provider(
     """
     deepgram_key = os.getenv("DEEPGRAM_API_KEY")
     parakeet_url = os.getenv("PARAKEET_ASR_URL")
+    elevenlabs_key = os.getenv("ELEVENLABS_API_KEY")
 
     if provider_name:
         provider_name = provider_name.lower()
@@ -87,6 +91,16 @@ def get_transcription_provider(
         else:
             return ParakeetProvider(parakeet_url)
 
+    elif provider_name == "elevenlabs":
+        if not elevenlabs_key:
+            raise RuntimeError(
+                "ElevenLabs transcription provider requested but ELEVENLABS_API_KEY not configured"
+            )
+        logger.info(f"Using ElevenLabs transcription provider in {mode} mode")
+        if mode == "streaming":
+            raise RuntimeError("ElevenLabs does not support streaming mode - use batch mode")
+        return ElevenLabsProvider(elevenlabs_key)
+
     # Auto-select provider based on available configuration (when provider_name is None)
     if provider_name is None:
         # Check TRANSCRIPTION_PROVIDER environment variable first
@@ -124,4 +138,5 @@ def get_transcription_provider(
     "DeepgramStreamConsumer",
     "ParakeetProvider",
     "ParakeetStreamingProvider",
+    "ElevenLabsProvider",
 ]
diff --git a/backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py b/backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py
new file mode 100644
index 00000000..698e0bad
--- /dev/null
+++ b/backends/advanced/src/advanced_omi_backend/services/transcription/elevenlabs.py
@@ -0,0 +1,173 @@
+"""
+ElevenLabs transcription provider implementation.
+
+Provides batch transcription using ElevenLabs Scribe v1 model.
+"""
+
+import io
+import logging
+import wave
+
+import httpx
+
+from advanced_omi_backend.models.transcription import BatchTranscriptionProvider
+
+logger = logging.getLogger(__name__)
+
+
+class ElevenLabsProvider(BatchTranscriptionProvider):
+    """ElevenLabs batch transcription provider using Scribe v1 model."""
+
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.url = "https://api.elevenlabs.io/v1/speech-to-text"
+
+    @property
+    def name(self) -> str:
+        return "elevenlabs"
+
+    async def transcribe(self, audio_data: bytes, sample_rate: int, diarize: bool = False) -> dict:
+        """Transcribe audio using ElevenLabs REST API.
+
+        Args:
+            audio_data: Raw audio bytes (will be converted to WAV format)
+            sample_rate: Audio sample rate
+            diarize: Whether to enable speaker diarization
+        """
+        try:
+            # Convert raw PCM to WAV format for ElevenLabs
+            wav_data = self._pcm_to_wav(audio_data, sample_rate)
+
+            # Prepare multipart form data
+            files = {
+                'file': ('audio.wav', io.BytesIO(wav_data), 'audio/wav')
+            }
+
+            data = {
+                'model_id': 'scribe_v1',
+                'diarize': 'true' if diarize else 'false',
+                'timestamps_granularity': 'word',
+                'tag_audio_events': 'false',  # Optional: set to true for laughter/applause detection
+            }
+
+            headers = {
+                'xi-api-key': self.api_key
+            }
+
+            logger.info(f"Sending {len(audio_data)} bytes to ElevenLabs API (diarize={diarize})")
+
+            # Calculate timeout based on audio duration
+            estimated_duration = len(audio_data) / (sample_rate * 2)  # 16-bit mono
+            processing_timeout = max(120, int(estimated_duration * 5))  # 5x audio duration
+
+            timeout_config = httpx.Timeout(
+                connect=30.0,
+                read=processing_timeout,
+                write=180.0,
+                pool=10.0,
+            )
+
+            logger.info(
+                f"Estimated audio duration: {estimated_duration:.1f}s, timeout: {processing_timeout}s"
+            )
+
+            async with httpx.AsyncClient(timeout=timeout_config) as client:
+                response = await client.post(
+                    self.url,
+                    headers=headers,
+                    data=data,
+                    files=files
+                )
+
+                if response.status_code == 200:
+                    result = response.json()
+                    logger.debug(f"ElevenLabs response: {result}")
+
+                    # Parse ElevenLabs response format
+                    transcript = result.get('text', '').strip()
+
+                    # Extract word-level data
+                    words = []
+                    segments = []
+
+                    if 'words' in result:
+                        # Map ElevenLabs format to Friend-Lite format
+                        for word_obj in result['words']:
+                            if word_obj.get('type') == 'word':  # Skip spacing/audio_events
+                                words.append({
+                                    'word': word_obj.get('text', ''),
+                                    'start': word_obj.get('start', 0),
+                                    'end': word_obj.get('end', 0),
+                                    'confidence': 1.0 - abs(word_obj.get('logprob', 0)),  # Convert logprob to confidence
+                                    'speaker': word_obj.get('speaker_id'),
+                                })
+
+                    # Extract speaker segments if diarization is enabled
+                    if diarize and words:
+                        segments = self._create_speaker_segments(words)
+
+                    logger.info(
+                        f"ElevenLabs transcription successful: {len(transcript)} chars, "
+                        f"{len(words)} words, {len(segments)} segments"
+                    )
+
+                    return {
+                        "text": transcript,
+                        "words": words,
+                        "segments": segments,
+                    }
+                else:
+                    logger.error(f"ElevenLabs API error: {response.status_code} - {response.text}")
+                    return {"text": "", "words": [], "segments": []}
+
+        except httpx.TimeoutException as e:
+            logger.error(f"Timeout during ElevenLabs API call: {e}")
+            return {"text": "", "words": [], "segments": []}
+        except Exception as e:
+            logger.error(f"Error calling ElevenLabs API: {e}")
+            return {"text": "", "words": [], "segments": []}
+
+    def _pcm_to_wav(self, pcm_data: bytes, sample_rate: int) -> bytes:
+        """Convert raw PCM data to WAV format."""
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(pcm_data)
+
+        return wav_buffer.getvalue()
+
+    def _create_speaker_segments(self, words: list) -> list:
+        """Group consecutive words by speaker into segments."""
+        segments = []
+        current_speaker = None
+        current_segment = None
+
+        for word in words:
+            speaker = word.get('speaker')
+            if speaker is None:
+                continue
+
+            if speaker == current_speaker and current_segment:
+                # Extend current segment
+                current_segment['text'] += ' ' + word['word']
+                current_segment['end'] = word['end']
+            else:
+                # Save previous segment and start new one
+                if current_segment:
+                    segments.append(current_segment)
+                current_segment = {
+                    'text': word['word'],
+                    'speaker': f"Speaker {speaker}",
+                    'start': word['start'],
+                    'end': word['end'],
+                    'confidence': word.get('confidence'),
+                }
+                current_speaker = speaker
+
+        # Don't forget the last segment
+        if current_segment:
+            segments.append(current_segment)
+
+        return segments
diff --git a/extras/speaker-recognition/.env.template b/extras/speaker-recognition/.env.template
index 2ba3a1c7..7f8e203d 100644
--- a/extras/speaker-recognition/.env.template
+++ b/extras/speaker-recognition/.env.template
@@ -39,6 +39,7 @@ REACT_UI_HTTPS=false
 
 # Optional: External Services
 DEEPGRAM_API_KEY=your_deepgram_api_key_here
+ELEVENLABS_API_KEY=your_elevenlabs_api_key_here
 GROQ_API_KEY=your_groq_api_key_here
 
 # Test Configuration (for docker-compose-test.yml)
diff --git a/extras/speaker-recognition/init.py b/extras/speaker-recognition/init.py
index a5354394..b636ddff 100755
--- a/extras/speaker-recognition/init.py
+++ b/extras/speaker-recognition/init.py
@@ -248,6 +248,13 @@ def setup_deepgram(self):
             self.config["DEEPGRAM_API_KEY"] = self.args.deepgram_api_key
             self.console.print("[green][SUCCESS][/green] Deepgram API key configured from command line")
 
+    def setup_elevenlabs(self):
+        """Configure ElevenLabs API key if provided"""
+        # Only set if provided via command line
+        if hasattr(self.args, 'elevenlabs_api_key') and self.args.elevenlabs_api_key:
+            self.config["ELEVENLABS_API_KEY"] = self.args.elevenlabs_api_key
+            self.console.print("[green][SUCCESS][/green] ElevenLabs API key configured from command line")
+
     def setup_https(self):
         """Configure HTTPS settings"""
         # Check if HTTPS configuration provided via command line
@@ -413,6 +420,7 @@ def run(self):
             self.setup_hf_token()
             self.setup_compute_mode()
             self.setup_deepgram()
+            self.setup_elevenlabs()
             self.setup_https()
 
             # Generate files
@@ -446,6 +454,8 @@ def main():
                        help="Compute mode: cpu or gpu (default: prompt user)")
     parser.add_argument("--deepgram-api-key",
                        help="Deepgram API key (optional)")
+    parser.add_argument("--elevenlabs-api-key",
+                       help="ElevenLabs API key (optional)")
     parser.add_argument("--enable-https", action="store_true",
                        help="Enable HTTPS configuration (default: prompt user)")
     parser.add_argument("--server-ip",
diff --git a/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/__init__.py b/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/__init__.py
index c9c77c02..d603ed8e 100644
--- a/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/__init__.py
+++ b/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/__init__.py
@@ -3,15 +3,17 @@
 from .users import router as users_router
 from .speakers import router as speakers_router
 from .enrollment import router as enrollment_router
-from .identification import router as identification_router  
+from .identification import router as identification_router
 from .deepgram_wrapper import router as deepgram_router
+from .elevenlabs_wrapper import router as elevenlabs_router
 from .websocket_wrapper import router as websocket_router
 
 __all__ = [
     "users_router",
-    "speakers_router", 
+    "speakers_router",
     "enrollment_router",
     "identification_router",
     "deepgram_router",
+    "elevenlabs_router",
     "websocket_router"
 ]
\ No newline at end of file
diff --git a/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/elevenlabs_wrapper.py b/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/elevenlabs_wrapper.py
new file mode 100644
index 00000000..7c4f6019
--- /dev/null
+++ b/extras/speaker-recognition/src/simple_speaker_recognition/api/routers/elevenlabs_wrapper.py
@@ -0,0 +1,391 @@
+"""ElevenLabs API wrapper endpoints with speaker enhancement."""
+
+import io
+import json
+import logging
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import aiohttp
+from fastapi import APIRouter, File, Form, Header, HTTPException, Query, UploadFile
+
+from simple_speaker_recognition.api.core.utils import (
+    safe_format_confidence,
+    validate_confidence
+)
+from simple_speaker_recognition.core.models import SpeakerStatus
+from simple_speaker_recognition.utils.audio_processing import get_audio_info
+from simple_speaker_recognition.utils.elevenlabs_parser import ElevenLabsParser
+
+router = APIRouter()
+log = logging.getLogger("speaker_service")
+
+
+# Dependency functions - will be resolved during integration
+async def get_db():
+    """Return the awaited result of ``service.get_db()``; ``service`` is imported at call time."""
+    from .. import service
+    return await service.get_db()
+
+
+def get_audio_backend():
+    """Return ``service.audio_backend``; ``service`` is imported at call time."""
+    from .. import service
+    return service.audio_backend
+
+
+def get_speaker_db():
+    """Return ``service.speaker_db``; ``service`` is imported at call time."""
+    from .. import service
+    return service.speaker_db
+
+
+def get_auth():
+    """Return ``service.auth`` (the settings object); ``service`` is imported at call time."""
+    from .. import service
+    return service.auth
+
+
+async def forward_to_elevenlabs(
+    audio_data: bytes,
+    params: Dict[str, Any],
+    elevenlabs_api_key: str
+) -> Dict[str, Any]:
+    """Forward audio to ElevenLabs API and return response."""
+    url = "https://api.elevenlabs.io/v1/speech-to-text"
+
+    headers = {
+        "xi-api-key": elevenlabs_api_key
+    }
+
+    # Prepare multipart form data
+    form_data = aiohttp.FormData()
+    form_data.add_field('file', io.BytesIO(audio_data), filename='audio.wav', content_type='audio/wav')
+
+    # Add text form fields
+    for key, value in params.items():
+        if value is not None:
+            # Convert boolean to lowercase string
+            if isinstance(value, bool):
+                form_data.add_field(key, str(value).lower())
+            else:
+                form_data.add_field(key, str(value))
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(
+            url,
+            headers=headers,
+            data=form_data
+        ) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                log.error(f"ElevenLabs API error: {response.status} - {error_text}")
+                raise HTTPException(
+                    status_code=response.status,
+                    detail=f"ElevenLabs API error: {error_text}"
+                )
+
+            result = await response.json()
+            log.info("Successfully received ElevenLabs response")
+            return result
+
+
+async def enhance_elevenlabs_response_with_speaker_id(
+    audio_data: bytes,
+    elevenlabs_response: Dict[str, Any],
+    user_id: Optional[int],
+    confidence_threshold: float = 0.15
+) -> Dict[str, Any]:
+    """Extract speaker segments and identify speakers from ElevenLabs response."""
+    enhanced_response = elevenlabs_response.copy()
+
+    if not user_id:
+        log.warning("No user_id provided, skipping speaker identification")
+        enhanced_response["speaker_enhancement"] = {
+            "enabled": False,
+            "provider": "elevenlabs",
+            "reason": "No user_id provided for speaker identification"
+        }
+        return enhanced_response
+
+    try:
+        # Create temporary file for audio processing
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            tmp_file.write(audio_data)
+            tmp_path = Path(tmp_file.name)
+
+        try:
+            # Get actual audio duration for boundary validation
+            audio_info = get_audio_info(str(tmp_path))
+            audio_duration = audio_info["duration_seconds"]
+            log.info(f"Audio file duration: {audio_duration:.6f}s")
+
+            # Extract words from ElevenLabs response
+            words = elevenlabs_response.get("words", [])
+
+            if not words:
+                log.warning("No words found in ElevenLabs response")
+                enhanced_response["speaker_enhancement"] = {
+                    "enabled": False,
+                    "provider": "elevenlabs",
+                    "reason": "No words found in response"
+                }
+                return enhanced_response
+
+            # Filter only actual words (skip spacing and audio events)
+            filtered_words = [w for w in words if w.get('type') == 'word']
+
+            # Group consecutive words by speaker_id to create segments
+            speaker_segments = []
+            if filtered_words:
+                current_segment = None
+
+                for word in filtered_words:
+                    speaker_id = word.get('speaker_id')
+                    if speaker_id is None:
+                        continue
+
+                    if current_segment is None or current_segment['speaker_id'] != speaker_id:
+                        # Save previous segment
+                        if current_segment:
+                            speaker_segments.append(current_segment)
+
+                        # Start new segment
+                        current_segment = {
+                            'speaker_id': speaker_id,
+                            'start_time': word.get('start', 0.0),
+                            'end_time': word.get('end', 0.0),
+                            'word_indices': [filtered_words.index(word)]
+                        }
+                    else:
+                        # Extend current segment
+                        current_segment['end_time'] = word.get('end', 0.0)
+                        current_segment['word_indices'].append(filtered_words.index(word))
+
+                # Don't forget the last segment
+                if current_segment:
+                    speaker_segments.append(current_segment)
+
+            log.info(f"Found {len(speaker_segments)} speaker segments to identify")
+
+            # Create enhanced words list
+            enhanced_words = words.copy()
+
+            # Get audio backend and speaker DB
+            audio_backend = get_audio_backend()
+            speaker_db = get_speaker_db()
+
+            # Identify each segment
+            for segment_idx, segment_info in enumerate(speaker_segments):
+                try:
+                    start_time = segment_info["start_time"]
+                    end_time = segment_info["end_time"]
+
+                    # Validate segment boundaries
+                    if start_time >= audio_duration:
+                        log.warning(f"Segment {segment_idx} start_time {start_time:.6f}s >= audio duration {audio_duration:.6f}s, skipping")
+                        continue
+                    if end_time > audio_duration:
+                        log.warning(f"Segment {segment_idx} end_time {end_time:.6f}s > audio duration {audio_duration:.6f}s, clamping to {audio_duration:.6f}s")
+                        end_time = audio_duration
+
+                    # Load and extract segment
+                    wav = audio_backend.load_wave(tmp_path, start_time, end_time)
+
+                    # Get embedding
+                    emb = await audio_backend.async_embed(wav)
+
+                    # Identify speaker
+                    found, speaker_info, confidence = await speaker_db.identify(emb, user_id=user_id)
+                    confidence = validate_confidence(confidence, "elevenlabs_enhancement")
+
+                    # Store identification result for this segment
+                    segment_result = None
+
+                    if found and confidence >= confidence_threshold:
+                        segment_result = {
+                            "speaker_id": speaker_info["id"],
+                            "speaker_name": speaker_info["name"],
+                            "confidence": confidence,
+                            "status": SpeakerStatus.IDENTIFIED.value
+                        }
+                        confidence_str = safe_format_confidence(confidence, "elevenlabs_speaker_identification")
+                        log.info(f"Identified segment {segment_idx} (speaker_id {segment_info['speaker_id']}) as {speaker_info['name']} (confidence: {confidence_str})")
+                    else:
+                        segment_result = {
+                            "speaker_id": None,
+                            "speaker_name": None,
+                            "confidence": confidence if confidence is not None else 0.0,
+                            "status": SpeakerStatus.UNKNOWN.value
+                        }
+                        confidence_str = safe_format_confidence(confidence, "elevenlabs_speaker_unknown")
+                        log.info(f"Segment {segment_idx} (speaker_id {segment_info['speaker_id']}) not identified (confidence: {confidence_str})")
+
+                    # Apply identification to all words in this segment
+                    for word_idx in segment_info["word_indices"]:
+                        if word_idx < len(filtered_words):
+                            # Find the original index in enhanced_words
+                            original_word = filtered_words[word_idx]
+                            for i, w in enumerate(enhanced_words):
+                                if w is original_word or (w.get('start') == original_word.get('start') and w.get('text') == original_word.get('text')):
+                                    enhanced_words[i].update({
+                                        "identified_speaker_id": segment_result["speaker_id"],
+                                        "identified_speaker_name": segment_result["speaker_name"],
+                                        "speaker_identification_confidence": segment_result["confidence"],
+                                        "speaker_status": segment_result["status"]
+                                    })
+                                    break
+
+                    # Store result for summary
+                    segment_info["identification"] = segment_result
+
+                except Exception as e:
+                    log.warning(f"Error identifying segment {segment_idx}: {e}")
+                    # Apply error status to all words in this segment
+                    for word_idx in segment_info["word_indices"]:
+                        if word_idx < len(filtered_words):
+                            original_word = filtered_words[word_idx]
+                            for i, w in enumerate(enhanced_words):
+                                if w is original_word or (w.get('start') == original_word.get('start') and w.get('text') == original_word.get('text')):
+                                    enhanced_words[i].update({
+                                        "identified_speaker_id": None,
+                                        "identified_speaker_name": None,
+                                        "speaker_identification_confidence": 0.0,
+                                        "speaker_status": SpeakerStatus.ERROR.value
+                                    })
+                                    break
+
+            # Update the response with enhanced words
+            enhanced_response["words"] = enhanced_words
+
+            # Collect unique identified speakers
+            identified_speakers = {}
+            for segment in speaker_segments:
+                if "identification" in segment:
+                    result = segment["identification"]
+                    if result["status"] == SpeakerStatus.IDENTIFIED.value:
+                        # Use the ElevenLabs speaker_id as key
+                        speaker_key = str(segment["speaker_id"])
+                        # Only store the first occurrence of each identified speaker
+                        if speaker_key not in identified_speakers:
+                            identified_speakers[speaker_key] = result
+
+            # Add speaker enhancement metadata
+            enhanced_response["speaker_enhancement"] = {
+                "enabled": True,
+                "provider": "elevenlabs",
+                "user_id": user_id,
+                "confidence_threshold": confidence_threshold,
+                "identified_speakers": identified_speakers,
+                "total_segments": len(speaker_segments),
+                "identified_segments": len([s for s in speaker_segments if s.get("identification", {}).get("status") == SpeakerStatus.IDENTIFIED.value]),
+                "total_speakers": len(set(s["speaker_id"] for s in speaker_segments)),
+                "identified_count": len(identified_speakers)
+            }
+
+        finally:
+            # Clean up temporary file
+            tmp_path.unlink(missing_ok=True)
+
+    except Exception as e:
+        log.error(f"Error during speaker identification: {e}")
+        # Add error info to response but don't fail the request
+        enhanced_response["speaker_enhancement"] = {
+            "enabled": True,
+            "provider": "elevenlabs",
+            "error": str(e),
+            "status": "failed"
+        }
+
+    return enhanced_response
+
+
+@router.post("/elevenlabs/v1/transcribe")
+async def elevenlabs_transcription_with_speaker_id(
+    file: UploadFile = File(..., description="Audio file to transcribe"),
+    # ElevenLabs API parameters
+    model_id: str = Form(default="scribe_v1", description="Model to use for transcription"),
+    diarize: bool = Form(default=True, description="Enable speaker diarization"),
+    timestamps_granularity: str = Form(default="word", description="Timestamp granularity"),
+    tag_audio_events: bool = Form(default=False, description="Tag audio events like laughter"),
+    # Speaker identification parameters
+    user_id: Optional[int] = Query(default=None, description="User ID for speaker identification"),
+    enhance_speakers: bool = Query(default=True, description="Enable speaker identification enhancement"),
+    speaker_confidence_threshold: float = Query(default=0.15, ge=0.0, le=1.0, description="Minimum confidence threshold for speaker identification"),
+    # API key
+    xi_api_key: Optional[str] = Header(default=None, description="ElevenLabs API key")
+):
+    """
+    Transcribe audio using ElevenLabs with speaker identification.
+
+    This endpoint forwards the audio to ElevenLabs API for transcription with
+    speaker diarization, then enhances the response with speaker identification
+    from enrolled speakers.
+
+    **Authentication**: Provide `xi-api-key` header with your ElevenLabs API key.
+
+    **Speaker Enhancement**: If `user_id` and `enhance_speakers=true`, the service will:
+    1. Forward audio to ElevenLabs for transcription with diarization
+    2. Extract speaker segments from the diarized response
+    3. Identify each speaker using enrolled voice embeddings
+    4. Add `identified_speaker_name` and related fields to each word
+
+    **Response Format**: Returns ElevenLabs JSON format with additional `speaker_enhancement` metadata.
+    """
+    # Get ElevenLabs API key from header or settings
+    auth = get_auth()
+    api_key = xi_api_key or auth.elevenlabs_api_key
+
+    if not api_key:
+        raise HTTPException(
+            status_code=401,
+            detail="ElevenLabs API key required (provide via xi-api-key header or ELEVENLABS_API_KEY env var)"
+        )
+
+    try:
+        # Read audio data
+        audio_data = await file.read()
+        log.info(f"Received audio file: {file.filename}, size: {len(audio_data)} bytes")
+
+        # Prepare ElevenLabs API parameters
+        elevenlabs_params = {
+            "model_id": model_id,
+            "diarize": diarize,
+            "timestamps_granularity": timestamps_granularity,
+            "tag_audio_events": tag_audio_events
+        }
+
+        # Forward to ElevenLabs
+        log.info(f"Forwarding to ElevenLabs API with params: {elevenlabs_params}")
+        elevenlabs_response = await forward_to_elevenlabs(audio_data, elevenlabs_params, api_key)
+
+        # Enhance with speaker identification if requested
+        if enhance_speakers and user_id:
+            log.info(f"Enhancing response with speaker identification for user_id={user_id}")
+            enhanced_response = await enhance_elevenlabs_response_with_speaker_id(
+                audio_data,
+                elevenlabs_response,
+                user_id,
+                speaker_confidence_threshold
+            )
+            return enhanced_response
+        else:
+            # Return ElevenLabs response without enhancement
+            if not enhance_speakers:
+                log.info("Speaker enhancement disabled")
+            elif not user_id:
+                log.info("No user_id provided, skipping speaker identification")
+
+            elevenlabs_response["speaker_enhancement"] = {
+                "enabled": False,
+                "provider": "elevenlabs",
+                "reason": "Enhancement not requested or no user_id provided"
+            }
+            return elevenlabs_response
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        log.error(f"Error processing request: {e}")
+        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
diff --git a/extras/speaker-recognition/src/simple_speaker_recognition/api/service.py b/extras/speaker-recognition/src/simple_speaker_recognition/api/service.py
index e08823f5..15973c4a 100644
--- a/extras/speaker-recognition/src/simple_speaker_recognition/api/service.py
+++ b/extras/speaker-recognition/src/simple_speaker_recognition/api/service.py
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     max_file_seconds: int = Field(default=180, description="Maximum file duration in seconds")
     deepgram_api_key: Optional[str] = Field(default=None, description="Deepgram API key for wrapper service")
     deepgram_base_url: str = Field(default="https://api.deepgram.com", description="Deepgram API base URL")
+    elevenlabs_api_key: Optional[str] = Field(default=None, description="ElevenLabs API key for wrapper service")
     hf_token: Optional[str] = Field(default=None, description="Hugging Face token for Pyannote models")
 
     class Config:
@@ -52,6 +53,10 @@ class Config:
 if os.getenv("DEEPGRAM_API_KEY"):
     auth.deepgram_api_key = os.getenv("DEEPGRAM_API_KEY")
 
+# Override ElevenLabs API key from environment if available
+if os.getenv("ELEVENLABS_API_KEY"):
+    auth.elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
+
 # Set HF token in auth settings for consistency
 auth.hf_token = hf_token
 
@@ -132,15 +137,17 @@ async def lifespan(app: FastAPI):
     enrollment_router,
     identification_router,
     deepgram_router,
+    elevenlabs_router,
     websocket_router
 )
 
 # Include routers with appropriate tags and prefixes
 app.include_router(users_router, tags=["users"])
-app.include_router(speakers_router, tags=["speakers"])  
+app.include_router(speakers_router, tags=["speakers"])
 app.include_router(enrollment_router, tags=["enrollment"])
 app.include_router(identification_router, tags=["identification"])
 app.include_router(deepgram_router, tags=["deepgram"])
+app.include_router(elevenlabs_router, tags=["elevenlabs"])
 app.include_router(websocket_router, tags=["websocket"])
 
 
diff --git a/extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py b/extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py
new file mode 100644
index 00000000..6f4aa35e
--- /dev/null
+++ b/extras/speaker-recognition/src/simple_speaker_recognition/utils/elevenlabs_parser.py
@@ -0,0 +1,221 @@
+"""Parser for ElevenLabs JSON transcription output."""
+
+import json
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class ElevenLabsParser:
+    """Parse ElevenLabs JSON transcription output and extract speaker segments."""
+
+    def __init__(self, min_segment_duration: float = 0.5) -> None:
+        """Create a parser that drops speaker segments shorter than a threshold.
+
+        Args:
+            min_segment_duration: Shortest segment, in seconds, kept when grouping words
+        """
+        self.min_segment_duration = min_segment_duration
+
+    def parse_elevenlabs_json(self, json_path: str) -> Dict[str, Any]:
+        """Parse ElevenLabs JSON file and extract transcript data.
+
+        Args:
+            json_path: Path to a UTF-8 encoded ElevenLabs JSON file
+
+        Returns:
+            Dict with metadata, transcript, confidence, segments and speaker stats
+        """
+        with open(json_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        # Extract text and language info
+        transcript = data.get('text', '')
+        language_code = data.get('language_code', '')
+        language_probability = data.get('language_probability', 0.0)
+
+        # Extract words with speaker information ("words": null is treated as no words)
+        words = data.get('words') or []
+
+        # Filter only actual words (skip spacing and audio events)
+        filtered_words = [w for w in words if w.get('type') == 'word']
+
+        # Group words into speaker segments
+        segments = self._group_words_by_speaker(filtered_words)
+
+        # Extract unique speakers
+        unique_speakers = self._extract_unique_speakers(segments)
+
+        # Calculate total duration from last word end time
+        total_duration = 0.0
+        if filtered_words:
+            total_duration = filtered_words[-1].get('end', 0.0)
+
+        return {
+            'metadata': {
+                'language_code': language_code,
+                'language_probability': language_probability,
+                'duration': total_duration
+            },
+            'transcript': transcript,
+            'confidence': self._calculate_avg_confidence(filtered_words),
+            'segments': segments,
+            'unique_speakers': unique_speakers,
+            'total_duration': total_duration
+        }
+
+    def _group_words_by_speaker(self, words: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Group consecutive words by speaker into segments.
+
+        Args:
+            words: Word dicts from ElevenLabs; words without a speaker_id are skipped
+
+        Returns:
+            Speaker segments lasting at least min_segment_duration seconds
+        """
+        if not words:
+            return []
+
+        segments = []
+        current_segment = None
+
+        for word in words:
+            speaker_id = word.get('speaker_id')
+            if speaker_id is None:
+                continue
+            # Ids that already read "speaker_N" are kept as-is; only bare ids get the prefix
+            speaker_label = str(speaker_id) if str(speaker_id).startswith('speaker_') else f"speaker_{speaker_id}"
+
+            # Check if we need to start a new segment
+            if (current_segment is None or
+                current_segment['elevenlabs_speaker_label'] != speaker_label):
+
+                # Save previous segment if it exists and meets minimum duration
+                if current_segment is not None:
+                    duration = current_segment['end_time'] - current_segment['start_time']
+                    if duration >= self.min_segment_duration:
+                        segments.append(current_segment)
+
+                # Start new segment
+                current_segment = {
+                    'elevenlabs_speaker_label': speaker_label,
+                    'start_time': word.get('start', 0.0),
+                    'end_time': word.get('end', 0.0),
+                    'text': word.get('text', ''),
+                    'words': [word],
+                    'confidence': self._logprob_to_confidence(word.get('logprob', 0.0))
+                }
+            else:
+                # Extend current segment
+                current_segment['end_time'] = word.get('end', 0.0)
+                current_segment['text'] += ' ' + word.get('text', '')
+                current_segment['words'].append(word)
+                # Update average confidence
+                word_confidence = self._logprob_to_confidence(word.get('logprob', 0.0))
+                current_segment['confidence'] = (
+                    (current_segment['confidence'] * (len(current_segment['words']) - 1) + word_confidence) /
+                    len(current_segment['words'])
+                )
+
+        # Don't forget the last segment
+        if current_segment is not None:
+            duration = current_segment['end_time'] - current_segment['start_time']
+            if duration >= self.min_segment_duration:
+                segments.append(current_segment)
+
+        return segments
+
+    def _extract_unique_speakers(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Aggregate per-speaker totals across segments.
+
+        Args:
+            segments: Speaker segments carrying label, start/end times and words
+
+        Returns:
+            One dict per speaker label, ordered from longest to shortest speaking time
+        """
+        def _empty_totals() -> Dict[str, Any]:
+            # Fresh accumulator for a label seen for the first time
+            return {'total_duration': 0.0, 'segment_count': 0, 'word_count': 0}
+
+        totals_by_label = defaultdict(_empty_totals)
+
+        for seg in segments:
+            totals = totals_by_label[seg['elevenlabs_speaker_label']]
+            totals['total_duration'] += seg['end_time'] - seg['start_time']
+            totals['segment_count'] += 1
+            totals['word_count'] += len(seg['words'])
+
+        ranked = [
+            {'speaker': label, **label_totals}
+            for label, label_totals in totals_by_label.items()
+        ]
+
+        # Stable descending sort: speakers with equal totals keep first-seen order
+        ranked = sorted(
+            ranked,
+            key=lambda entry: entry['total_duration'],
+            reverse=True,
+        )
+        return ranked
+
+    def _logprob_to_confidence(self, logprob: float) -> float:
+        """Map an ElevenLabs log probability onto a 0-1 confidence score.
+
+        Args:
+            logprob: Log probability reported for a word
+
+        Returns:
+            1.0 for a logprob of 0, falling linearly to 0.0 once |logprob| reaches 1
+        """
+        # Linear clamp on the magnitude (not exp(logprob)): anything beyond 1 counts as 1
+        distance_from_zero = min(abs(logprob), 1.0)
+        return 1.0 - distance_from_zero
+
+    def _calculate_avg_confidence(self, words: List[Dict[str, Any]]) -> float:
+        """Average the per-word confidence over a word list.
+
+        Args:
+            words: Word dicts; a missing 'logprob' is treated as 0.0
+
+        Returns:
+            Mean confidence, or 0.0 when the list is empty
+        """
+        if not words:
+            return 0.0
+
+        # Score every word, then take the arithmetic mean
+        logprobs = (entry.get('logprob', 0.0) for entry in words)
+        scores = map(self._logprob_to_confidence, logprobs)
+        total_confidence = sum(scores)
+        return total_confidence / len(words)
+
+    def extract_speaker_segments_for_identification(
+        self,
+        segments: List[Dict[str, Any]],
+        audio_path: str
+    ) -> List[Dict[str, Any]]:
+        """Build the per-segment records used for speaker identification.
+
+        Args:
+            segments: Segments as returned under 'segments' by parse_elevenlabs_json
+            audio_path: Path to the audio file, copied unchanged into every record
+
+        Returns:
+            One dict per segment; 'segment_id' is the segment's index in the input
+        """
+        return [
+            {
+                'segment_id': position,
+                'start_time': seg['start_time'],
+                'end_time': seg['end_time'],
+                'text': seg['text'],
+                'audio_path': audio_path,
+                'elevenlabs_speaker_label': seg['elevenlabs_speaker_label'],
+                'confidence': seg['confidence']
+            }
+            for position, seg in enumerate(segments)
+        ]
diff --git a/extras/speaker-recognition/webui/src/components/ProcessingModeSelector.tsx b/extras/speaker-recognition/webui/src/components/ProcessingModeSelector.tsx
index df1ff021..c5b56cd9 100644
--- a/extras/speaker-recognition/webui/src/components/ProcessingModeSelector.tsx
+++ b/extras/speaker-recognition/webui/src/components/ProcessingModeSelector.tsx
@@ -59,6 +59,15 @@ const PROCESSING_MODES: ProcessingModeConfig[] = [
     requirements: ['Deepgram API key'],
     features: ['High-quality transcription', 'Speaker diarization', 'Enhanced speaker identification', 'Word-level timing']
   },
+  {
+    mode: 'elevenlabs-enhanced',
+    name: 'ElevenLabs Transcribe',
+    description: '99-language transcription with speaker diarization',
+    icon: '🌍',
+    color: 'bg-indigo-600 hover:bg-indigo-700',
+    requirements: ['ElevenLabs API key'],
+    features: ['99 language support', 'Speaker diarization (up to 32 speakers)', 'Word-level timestamps', 'Speaker identification']
+  },
   {
     mode: 'deepgram-transcript-internal-speakers',
     name: 'Hybrid Mode',
@@ -439,15 +448,15 @@ export const ProcessingModeSelector: React.FC<ProcessingModeSelectorProps> = ({
       {/* Mode Selection */}
       <div className="space-y-4">
         <h4 className="font-medium text-gray-900">Choose Processing Mode</h4>
-        
+
         {/* Grid Layout for Modes */}
-        <div className="grid md:grid-cols-2 gap-4">
-          {PROCESSING_MODES.slice(0, 2).map((config) => (
+        <div className="grid md:grid-cols-3 gap-4">
+          {PROCESSING_MODES.slice(0, 3).map((config) => (
             <div
               key={config.mode}
               className={`border rounded-lg p-4 cursor-pointer transition-colors ${
-                selectedMode === config.mode 
-                  ? 'border-blue-500 bg-blue-50 dark:bg-blue-900/20 dark:border-blue-400' 
+                selectedMode === config.mode
+                  ? 'border-blue-500 bg-blue-50 dark:bg-blue-900/20 dark:border-blue-400'
                   : 'border-gray-200 dark:border-gray-600 hover:border-gray-300 dark:hover:border-gray-500'
               }`}
               onClick={() => onModeChange(config.mode)}
@@ -457,7 +466,7 @@ export const ProcessingModeSelector: React.FC<ProcessingModeSelectorProps> = ({
                 <div className="flex-1">
                   <h5 className="font-medium text-gray-900 dark:text-gray-100">{config.name}</h5>
                   <p className="text-sm text-gray-600 dark:text-gray-300 mt-1">{config.description}</p>
-                  
+
                   {/* Requirements */}
                   {config.requirements && (
                     <div className="mt-2">
@@ -466,7 +475,7 @@ export const ProcessingModeSelector: React.FC<ProcessingModeSelectorProps> = ({
                       </span>
                     </div>
                   )}
-                  
+
                   {/* Features */}
                   <div className="mt-2">
                     <div className="flex flex-wrap gap-1">
diff --git a/extras/speaker-recognition/webui/src/services/elevenlabs.ts b/extras/speaker-recognition/webui/src/services/elevenlabs.ts
new file mode 100644
index 00000000..75a89994
--- /dev/null
+++ b/extras/speaker-recognition/webui/src/services/elevenlabs.ts
@@ -0,0 +1,185 @@
+/**
+ * ElevenLabs Service - Transcription and speaker diarization via ElevenLabs API
+ * Provides batch transcription with speaker diarization (up to 32 speakers)
+ * Word-level timestamps with confidence scores
+ */
+
+import { apiService } from './api'
+
+export interface ElevenLabsWord {
+  text: string
+  start: number
+  end: number
+  type: 'word' | 'spacing' | 'audio_event' | 'punctuation' // only 'word' entries are grouped into segments
+  logprob: number // turned into a 0-1 confidence by logprobToConfidence
+  speaker_id?: string // parsed as 'speaker_<n>'; a missing id is treated as 'speaker_0'
+}
+
+export interface ElevenLabsResponse {
+  language_code: string
+  language_probability: number
+  text: string
+  words: ElevenLabsWord[] // all token types; processElevenLabsResponse keeps only type 'word'
+  transcription_id: string
+}
+
+export interface ElevenLabsSegment {
+  start: number // start of the first word in the segment
+  end: number // end of the last word in the segment
+  speaker: number // numeric suffix parsed from speaker_id (0 when it cannot be parsed)
+  speakerId?: string // raw speaker id the segment was grouped under
+  speakerName?: string // display label of the form `Speaker <n>`
+  confidence: number // aggregated from the per-word confidences
+  text: string
+  identifiedSpeakerId?: string // identified* fields are not set by processElevenLabsResponse
+  identifiedSpeakerName?: string
+  speakerIdentificationConfidence?: number
+  speakerStatus?: string
+}
+
+export interface ElevenLabsOptions {
+  enhanceSpeakers?: boolean // true routes the request through the '/elevenlabs/v1/transcribe' wrapper
+  userId?: number // sent as the user_id query parameter when enhanceSpeakers is set
+  speakerConfidenceThreshold?: number // sent as the similarity_threshold query parameter
+  model?: string // defaults to 'scribe_v1'
+  language?: string // omitted from the request when empty
+  numSpeakers?: number // sent as num_speakers when truthy
+}
+
+/**
+ * Transcribe audio via the local wrapper endpoint (enhanceSpeakers) or the ElevenLabs API directly
+ */
+export async function transcribeWithElevenLabs(
+  audioFile: File | Blob,
+  options: ElevenLabsOptions = {}
+): Promise<ElevenLabsResponse> {
+  try {
+    const formData = new FormData()
+    formData.append('file', audioFile, audioFile instanceof File ? audioFile.name : 'audio.wav')
+    formData.append('model_id', options.model || 'scribe_v1')
+
+    if (options.language) {
+      formData.append('language_code', options.language)
+    }
+
+    // Enable speaker diarization (the speech-to-text field is named `diarize`)
+    formData.append('diarize', 'true')
+
+    if (options.numSpeakers) {
+      formData.append('num_speakers', options.numSpeakers.toString())
+    }
+
+    // Determine endpoint based on enhancement option
+    const endpoint = options.enhanceSpeakers
+      ? '/elevenlabs/v1/transcribe'
+      : 'https://api.elevenlabs.io/v1/speech-to-text'
+
+    // Query parameters are only sent to the wrapper endpoint
+    const params: Record<string, string> = {}
+    if (options.enhanceSpeakers) {
+      params.enhance_speakers = 'true'
+      if (options.userId) {
+        params.user_id = options.userId.toString()
+      }
+      if (options.speakerConfidenceThreshold !== undefined) {
+        params.similarity_threshold = options.speakerConfidenceThreshold.toString() // NOTE(review): confirm the wrapper accepts this name
+      }
+    }
+
+    const response = await apiService.post(endpoint, formData, {
+      headers: {
+        'Content-Type': 'multipart/form-data',
+        ...(endpoint.includes('elevenlabs.io') && {
+          'xi-api-key': import.meta.env.VITE_ELEVENLABS_API_KEY || ''
+        })
+      },
+      params,
+      timeout: 180000
+    })
+
+    return response.data as ElevenLabsResponse
+  } catch (error) {
+    throw new Error(`ElevenLabs transcription failed: ${error instanceof Error ? error.message : 'Unknown error'}`)
+  }
+}
+
+/**
+ * Process ElevenLabs response into speaker segments (consecutive words of one speaker)
+ */
+export function processElevenLabsResponse(response: ElevenLabsResponse): ElevenLabsSegment[] {
+  const segments: ElevenLabsSegment[] = []
+
+  // Filter to only word types and group by speaker_id (a missing words array yields no segments)
+  const words = (response.words || []).filter(w => w.type === 'word')
+
+  if (words.length === 0) {
+    return segments
+  }
+
+  let currentSegment: ElevenLabsSegment | null = null
+  let wordCount = 0 // words merged into currentSegment so far
+  for (const word of words) {
+    const speakerId = word.speaker_id || 'speaker_0'
+    const speakerNum = parseInt(speakerId.replace('speaker_', '')) || 0
+
+    // Convert logprob to confidence (logprob is typically 0 to -1, where 0 is highest confidence)
+    const confidence = logprobToConfidence(word.logprob)
+
+    if (!currentSegment || currentSegment.speaker !== speakerNum) {
+      // Start new segment
+      if (currentSegment) {
+        segments.push(currentSegment)
+      }
+      wordCount = 1
+      currentSegment = {
+        start: word.start,
+        end: word.end,
+        speaker: speakerNum,
+        speakerId: speakerId,
+        speakerName: `Speaker ${speakerNum}`,
+        confidence: confidence,
+        text: word.text
+      }
+    } else {
+      // Extend segment: re-insert the filtered-out spacing and keep a true running mean
+      wordCount += 1
+      currentSegment.end = word.end
+      currentSegment.text += ' ' + word.text
+      currentSegment.confidence += (confidence - currentSegment.confidence) / wordCount
+    }
+  }
+
+  // Push last segment
+  if (currentSegment) {
+    segments.push(currentSegment)
+  }
+
+  return segments
+}
+
+/**
+ * Map an ElevenLabs logprob onto a 0-1 confidence score
+ * Linear in |logprob|: 0 gives 1.0, and anything at or beyond magnitude 1 gives 0.0
+ */
+function logprobToConfidence(logprob: number): number {
+  // Clamp the magnitude to 1 first so the result never drops below 0
+  const magnitude = Math.min(Math.abs(logprob), 1.0)
+  return 1.0 - magnitude
+}
+
+/**
+ * Bucket segments by confidence: high >= 0.8, medium [0.6, 0.8), low [0.4, 0.6); lower scores only count in the total
+ */
+export function calculateConfidenceSummary(segments: ElevenLabsSegment[]): {
+  total_segments: number
+  high_confidence: number
+  medium_confidence: number
+  low_confidence: number
+} {
+  const scores = segments.map(segment => segment.confidence)
+  const high_confidence = scores.filter(score => score >= 0.8).length
+  const medium_confidence = scores.filter(score => score < 0.8 && score >= 0.6).length
+  const low_confidence = scores.filter(score => score < 0.6 && score >= 0.4).length
+  const total_segments = segments.length
+  return { total_segments, high_confidence, medium_confidence, low_confidence }
+}
diff --git a/extras/speaker-recognition/webui/src/services/speakerIdentification.ts b/extras/speaker-recognition/webui/src/services/speakerIdentification.ts
index 59939a42..60165594 100644
--- a/extras/speaker-recognition/webui/src/services/speakerIdentification.ts
+++ b/extras/speaker-recognition/webui/src/services/speakerIdentification.ts
@@ -6,8 +6,9 @@
 
 import { apiService } from './api'
 import { transcribeWithDeepgram, processDeepgramResponse, calculateConfidenceSummary, DeepgramResponse } from './deepgram'
+import { transcribeWithElevenLabs, processElevenLabsResponse, calculateConfidenceSummary as calculateElevenLabsConfidenceSummary, ElevenLabsResponse } from './elevenlabs'
 
-export type ProcessingMode = 'diarization-only' | 'speaker-identification' | 'deepgram-enhanced' | 'deepgram-transcript-internal-speakers' | 'diarize-identify-match'
+export type ProcessingMode = 'diarization-only' | 'speaker-identification' | 'deepgram-enhanced' | 'deepgram-transcript-internal-speakers' | 'diarize-identify-match' | 'elevenlabs-enhanced'
 
 export interface SpeakerSegment {
   start: number
@@ -51,6 +52,7 @@ export interface ProcessingResult {
     low_confidence: number
   }
   deepgram_response?: DeepgramResponse
+  elevenlabs_response?: ElevenLabsResponse
   processing_time?: number
   error?: string
 }
@@ -94,6 +96,9 @@ export class SpeakerIdentificationService {
         case 'deepgram-transcript-internal-speakers':
           result = await this.processWithHybrid(audioFile, options)
           break
+        case 'elevenlabs-enhanced':
+          result = await this.processWithElevenLabs(audioFile, options)
+          break
         case 'diarization-only':
           result = await this.processWithDiarizationOnly(audioFile, options)
           break
@@ -220,6 +225,60 @@ export class SpeakerIdentificationService {
     }
   }
 
+  /**
+   * Process with ElevenLabs transcription + speaker enhancement (on unless enhanceSpeakers is false)
+   */
+  private async processWithElevenLabs(
+    audioFile: File | Blob,
+    options: ProcessingOptions
+  ): Promise<ProcessingResult> {
+    try {
+      const filename = audioFile instanceof File ? audioFile.name : 'Audio'
+
+      // Use ElevenLabs service
+      const elevenlabsResponse = await transcribeWithElevenLabs(audioFile, {
+        enhanceSpeakers: options.enhanceSpeakers !== false,
+        userId: options.userId,
+        speakerConfidenceThreshold: options.confidenceThreshold || 0.15,
+        numSpeakers: options.maxSpeakers
+      })
+
+      // Process response using ElevenLabs service
+      const elevenlabsSegments = processElevenLabsResponse(elevenlabsResponse)
+
+      // NOTE(review): processElevenLabsResponse never sets the identified* fields, so they stay undefined here
+      const speakerSegments: SpeakerSegment[] = elevenlabsSegments.map(segment => ({
+        start: segment.start,
+        end: segment.end,
+        speaker_id: segment.speakerId || `speaker_${segment.speaker}`,
+        speaker_name: segment.speakerName || `Speaker ${segment.speaker}`,
+        confidence: segment.confidence,
+        text: segment.text,
+        identified_speaker_id: segment.identifiedSpeakerId,
+        identified_speaker_name: segment.identifiedSpeakerName,
+        speaker_identification_confidence: segment.speakerIdentificationConfidence,
+        speaker_status: segment.speakerStatus
+      }))
+
+      // Calculate confidence summary
+      const confidenceSummary = calculateElevenLabsConfidenceSummary(elevenlabsSegments)
+
+      return {
+        id: Math.random().toString(36),
+        filename,
+        duration: this.estimateDuration(speakerSegments),
+        status: 'completed',
+        created_at: new Date().toISOString(),
+        mode: 'elevenlabs-enhanced',
+        speakers: speakerSegments,
+        confidence_summary: confidenceSummary,
+        elevenlabs_response: elevenlabsResponse
+      }
+    } catch (error) {
+      throw new Error(`ElevenLabs processing failed: ${error instanceof Error ? error.message : 'Unknown error'}`)
+    }
+  }
+
   /**
    * Process with hybrid mode (Deepgram transcription + internal diarization)
    */
@@ -515,6 +574,11 @@ export class SpeakerIdentificationService {
         name: 'Deepgram Enhanced',
         description: 'Deepgram transcription + diarization + replace speakers with enrolled IDs'
       },
+      {
+        mode: 'elevenlabs-enhanced',
+        name: 'ElevenLabs Enhanced',
+        description: 'ElevenLabs transcription (99 languages) + diarization + speaker identification'
+      },
       {
         mode: 'deepgram-transcript-internal-speakers',
         name: 'Deepgram Transcript + Internal Speakers',
diff --git a/wizard.py b/wizard.py
index cfc5b861..15609ce2 100755
--- a/wizard.py
+++ b/wizard.py
@@ -183,6 +183,12 @@ def run_service_setup(service_name, selected_services, https_enabled=False, serv
                 cmd.extend(['--deepgram-api-key', deepgram_key])
                 console.print("[blue][INFO][/blue] Found existing DEEPGRAM_API_KEY from backend config, reusing")
 
+            # Pass ElevenLabs API key from backend if available
+            elevenlabs_key = read_env_value(backend_env_path, 'ELEVENLABS_API_KEY')
+            if elevenlabs_key and not is_placeholder(elevenlabs_key, 'your_elevenlabs_api_key_here', 'your-elevenlabs-api-key-here'):
+                cmd.extend(['--elevenlabs-api-key', elevenlabs_key])
+                console.print("[blue][INFO][/blue] Found existing ELEVENLABS_API_KEY from backend config, reusing")
+
             # Pass HF Token from existing speaker recognition .env if available
             speaker_env_path = 'extras/speaker-recognition/.env'
             hf_token = read_env_value(speaker_env_path, 'HF_TOKEN')