forked from TTS-AGI/tts-router-v2
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
123 lines (95 loc) · 3.54 KB
/
app.py
File metadata and controls
123 lines (95 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import asyncio
import base64
import io
import os
from typing import Optional, Tuple

import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from loguru import logger
from pydantic import BaseModel
from pydub import AudioSegment
from pydub import effects as audio_effects

# Import TTS providers
from tts_providers import (
    get_available_providers,
    get_provider_models,
    synthesize_speech,
)
# Load environment variables (python-dotenv) before the application object
# is constructed.
load_dotenv()

# The ASGI application; the route handlers below register themselves on it.
app = FastAPI(
    version="1.0.0",
    title="TTS Router API",
    description="API to route text-to-speech requests to different providers",
)
class TTSRequest(BaseModel):
    """Request body accepted by the ``POST /tts`` endpoint."""

    # Text to synthesize.
    text: str
    # Provider the request is routed to; forwarded to synthesize_speech().
    provider: str
    # Model to use with that provider. Optional: the field may be omitted or
    # sent as an explicit JSON null, in which case None is forwarded to
    # synthesize_speech(). NOTE(review): how the provider layer treats a None
    # model (presumably a per-provider default) is not visible here - confirm.
    model: Optional[str] = None
@app.get("/")
async def root():
    """Landing route: reply with a static message identifying the service."""
    payload = {"message": "TTS Router API"}
    return payload
@app.get("/providers")
async def providers():
    """Return the result of ``get_available_providers()`` under the "providers" key."""
    available = get_available_providers()
    return {"providers": available}
@app.get("/providers/{provider}/models")
async def models(provider: str):
    """List all available models for a specific provider.

    Args:
        provider: Provider name taken from the URL path.

    Returns:
        A dict ``{"models": ...}`` wrapping whatever
        ``get_provider_models(provider)`` returns.

    Raises:
        HTTPException: With status 400 when ``get_provider_models`` raises
            ``ValueError``; the error message is used as the detail.
    """
    try:
        return {"models": get_provider_models(provider)}
    except ValueError as e:
        logger.error(f"Error fetching models for provider {provider}: {str(e)}")
        # Chain the original error so the root cause stays attached to the
        # HTTP error in tracebacks and logs.
        raise HTTPException(status_code=400, detail=str(e)) from e
@app.post("/tts")
async def tts(request: TTSRequest):
    """Generate TTS audio from text using specified provider and model.

    The text is synthesized through ``synthesize_speech`` and the resulting
    base64 audio is then peak-normalized on a best-effort basis: if
    normalization fails, a warning is logged and the provider's original
    audio is returned unchanged.

    Args:
        request: Parsed request body carrying ``text``, ``provider`` and an
            optional ``model``.

    Returns:
        A dict with ``status``, ``provider``, ``model``, ``audio_data``
        (base64 string) and ``extension``.

    Raises:
        HTTPException: With status 500 and the error message as detail when
            synthesis (or anything else outside the normalization step)
            fails. An ``HTTPException`` raised further down is passed
            through unchanged.
    """
    try:
        provider = request.provider
        model = request.model
        text = request.text

        logger.info(
            f"TTS request received - Provider: {provider}, Model: {model}, Text length: {len(text)}"
        )

        audio_data, extension = await synthesize_speech(text, provider, model)

        # Normalize audio volume to reduce bias between providers
        try:
            # The helper is synchronous (decode, normalize, re-encode); run it
            # in the default executor so it does not stall the event loop
            # while other requests are waiting.
            loop = asyncio.get_running_loop()
            audio_data = await loop.run_in_executor(
                None, _normalize_base64_audio, audio_data, extension
            )
            logger.info("Applied peak normalization to output audio")
        except Exception as norm_err:
            # Deliberate best-effort: keep the un-normalized provider audio.
            logger.warning(f"Audio normalization failed, returning original audio: {norm_err}")

        logger.info(
            f"TTS request completed successfully - Provider: {provider}, Model: {model}"
        )

        # The audio_data is a base64 encoded string at this point
        return {
            "status": "success",
            "provider": provider,
            "model": model,
            "audio_data": audio_data,  # Already base64 encoded
            "extension": extension,
        }
    except HTTPException:
        # Do not mask an intentionally raised HTTP error as a generic 500.
        raise
    except Exception as e:
        logger.error(f"Error generating TTS: {str(e)}")
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 8000, with auto-reload enabled. The app is passed as the import
    # string "app:app" rather than as the object itself.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
def _normalize_base64_audio(
    b64_audio: str, extension: str, *, headroom: float = 1.0
) -> str:
    """Quick peak normalization on base64 audio while preserving format.

    - Decodes base64 -> AudioSegment
    - Applies fast peak normalization with small headroom
    - Re-encodes to the original format and returns base64

    Args:
        b64_audio: Base64-encoded audio file contents.
        extension: File extension naming the audio format (e.g. "mp3",
            "wav", "m4a"). Case, surrounding whitespace and a leading dot
            are ignored; an empty/None value is treated as "mp3".
        headroom: Headroom in dB handed to ``pydub.effects.normalize``.
            Defaults to 1.0, the value previously hard-coded here.

    Returns:
        The normalized audio, re-encoded and base64-encoded as an ASCII
        string.

    Raises:
        Whatever ``base64.b64decode`` or pydub raise on undecodable input;
        no exception is handled here.
    """
    # Accept ".mp3"-style values as well as "mp3"; fall back to mp3 if
    # nothing usable is left.
    fmt = (extension or "").strip().lstrip(".").lower() or "mp3"

    # Map extensions to ffmpeg format names where they differ. The mapped
    # name is used for BOTH decode and encode: "m4a" is not an ffmpeg output
    # format name, so exporting with the raw extension is expected to fail
    # and leave m4a audio un-normalized.
    fmt_map = {"m4a": "mp4"}
    ffmpeg_fmt = fmt_map.get(fmt, fmt)

    raw = base64.b64decode(b64_audio)
    audio = AudioSegment.from_file(io.BytesIO(raw), format=ffmpeg_fmt)

    # Fast peak normalization; the headroom keeps peaks just below full scale
    # to avoid clipping.
    normalized = audio_effects.normalize(audio, headroom=headroom)

    out = io.BytesIO()
    normalized.export(out, format=ffmpeg_fmt)
    return base64.b64encode(out.getvalue()).decode("ascii")