Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion backends/advanced/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@
!webui
!ssl
!nginx.conf
!nginx.conf.template
!nginx.conf.template
!start.sh
!start-workers.sh
6 changes: 0 additions & 6 deletions backends/advanced/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,6 @@ services:
# - ./data/neo4j_data:/data
# - ./data/neo4j_logs:/logs
# restart: unless-stopped
# proxy:
# image: nginx:alpine
# depends_on: [friend-backend, streamlit]
# volumes:
# - ./nginx.conf:/etc/nginx/nginx.conf:ro
# ports: ["80:80"] # publish once; ngrok points here

# ollama:
# image: ollama/ollama:latest
Expand Down
Empty file modified backends/advanced/start-workers.sh
100644 → 100755
Empty file.
8 changes: 1 addition & 7 deletions extras/speaker-recognition/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,7 @@ RUN mkdir -p src/simple_speaker_recognition
COPY src/simple_speaker_recognition/__init__.py src/simple_speaker_recognition/

# Install dependencies and package
# Use build arg to control CPU vs GPU mode
ARG COMPUTE_MODE=cpu
RUN if [ "$COMPUTE_MODE" = "gpu" ]; then \
uv sync --no-default-groups --group gpu; \
else \
uv sync --no-default-groups --group cpu; \
fi
RUN uv sync --no-dev

# Create directories
RUN mkdir -p /app/audio_chunks /app/debug /app/data /models
Expand Down
4 changes: 0 additions & 4 deletions extras/speaker-recognition/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ services:
build:
context: .
dockerfile: Dockerfile
args:
COMPUTE_MODE: cpu
image: speaker-recognition:latest
env_file:
- .env
Expand Down Expand Up @@ -44,8 +42,6 @@ services:
build:
context: .
dockerfile: Dockerfile
args:
COMPUTE_MODE: gpu
deploy:
resources:
reservations:
Expand Down
26 changes: 4 additions & 22 deletions extras/speaker-recognition/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@ dependencies = [
"sqlalchemy>=2.0.0",
"alembic>=1.13.0",
# Data processing
"pandas>=2.2.0",
"pandas>=2.0.0",
"scikit-learn>=1.4.0", # For t-SNE/UMAP visualization
"umap-learn>=0.5.3",
"faiss-cpu>=1.9",
"torch>=2.0.0",
"torchaudio>=2.0.0",
]

[build-system]
Expand All @@ -49,17 +52,6 @@ simple-speaker-service = "simple_speaker_recognition.api.service:main"
simple-speaker-web = "simple_speaker_recognition.web.app:main"

[dependency-groups]
cpu = [
"faiss-cpu>=1.9",
"torch>=2.0.0",
"torchaudio>=2.0.0",
"numpy==1.23.5", # Ensure numpy compatibility with faiss-cpu
]
gpu = [
"faiss-cpu>=1.9", # Use CPU FAISS for compatibility, GPU PyTorch for performance
"torch>=2.0.0",
"torchaudio>=2.0.0",
]
dev = [
"black>=25.1.0",
"isort>=6.0.1",
Expand All @@ -70,15 +62,5 @@ test = [
"requests", # For integration tests
]

[tool.uv]
# dev & cpu are installed automatically
default-groups = ["dev", "cpu"]
# cpu and gpu can never coexist
conflicts = [
[
{ group = "cpu" },
{ group = "gpu" },
],
]
[tool.isort]
profile = "black"
11 changes: 6 additions & 5 deletions extras/speaker-recognition/scripts/download-pyannote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
This script is designed to be run during Docker build to pre-download models.
"""

import logging
import os
import sys
import logging

from pyannote.audio import Pipeline
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
Expand All @@ -28,14 +31,12 @@ def download_models():
return True # Don't fail the build, just skip download

# Import and download models
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

logger.info("Downloading speaker diarization model...")
Pipeline.from_pretrained('pyannote/speaker-diarization-3.1', use_auth_token=hf_token)
Pipeline.from_pretrained('pyannote/speaker-diarization-3.1', token=hf_token)

logger.info("Downloading speaker embedding model...")
PretrainedSpeakerEmbedding('pyannote/wespeaker-voxceleb-resnet34-LM', use_auth_token=hf_token)
PretrainedSpeakerEmbedding('pyannote/wespeaker-voxceleb-resnet34-LM', token=hf_token)

logger.info("Models downloaded successfully!")
return True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,25 @@
from collections import deque
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlencode, parse_qs
from urllib.parse import parse_qs, urlencode

import numpy as np
import torch
import websockets
from fastapi import APIRouter, Depends, HTTPException, Query, Request, WebSocket, WebSocketDisconnect
from fastapi import (
APIRouter,
Depends,
HTTPException,
Query,
Request,
WebSocket,
WebSocketDisconnect,
)
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection

from simple_speaker_recognition.api.core.utils import (
safe_format_confidence,
validate_confidence
validate_confidence,
)
from simple_speaker_recognition.core.models import SpeakerStatus
from simple_speaker_recognition.core.unified_speaker_db import UnifiedSpeakerDB
Expand Down Expand Up @@ -121,7 +128,7 @@ def initialize_vad(self):
log.info("Loading Pyannote VAD model...")
self.vad_model = Model.from_pretrained(
"pyannote/segmentation-3.0",
use_auth_token=self.hf_token
token=self.hf_token
)

# Create VAD pipeline
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class AudioBackend:
def __init__(self, hf_token: str, device: torch.device):
self.device = device
self.diar = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1", use_auth_token=hf_token
"pyannote/speaker-diarization-3.1", token=hf_token
).to(device)

# Configure pipeline with proper segmentation parameters to reduce over-segmentation
Expand Down
Loading