textSpeaker / app.py
anuj-exe's picture
Update app.py
9f2ddc0 verified
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests
import numpy as np
# ASGI application object; the route decorator further down registers on it.
app = FastAPI(title="SpeechT5 TTS API")

# --- Post-processing knobs read by the /speak handler ---
# Peak absolute amplitude the waveform is rescaled to.
NORMALIZATION_LEVEL = 0.1
# Base smoothing width; the /speak handler passes twice this value to the
# smoothing helper.
SMOOTHING_WINDOW = 7
# "16" selects 16-bit integer PCM output; any other value keeps float samples.
BIT_DEPTH = "32f"

# --- Models: instantiated once at import time and reused by every request ---
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
# Speaker ID -> URL of the binary file holding that speaker's embedding.
# IDs 1-7 all live in the same dataset folder, so only the file name varies.
_XVECTOR_BASE_URL = (
    "https://huggingface.co/datasets/Xenova/"
    "cmu-arctic-xvectors-extracted/resolve/main/"
)
_XVECTOR_FILES = (
    "cmu_us_slt_arctic-wav-arctic_a0001.bin",  # 1: US female 1
    "cmu_us_clb_arctic-wav-arctic_a0001.bin",  # 2: US female 2
    "cmu_us_bdl_arctic-wav-arctic_a0003.bin",  # 3: US male 1
    "cmu_us_rms_arctic-wav-arctic_a0003.bin",  # 4: US male 2
    "cmu_us_jmk_arctic-wav-arctic_a0002.bin",  # 5: Canadian male
    "cmu_us_awb_arctic-wav-arctic_b0002.bin",  # 6: Scottish male
    "cmu_us_ksp_arctic-wav-arctic_a0007.bin",  # 7: Indian male
)
SPEAKER_EMBEDDINGS = {
    # 0: Normal
    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",
    **{
        speaker_id: _XVECTOR_BASE_URL + file_name
        for speaker_id, file_name in enumerate(_XVECTOR_FILES, start=1)
    },
}
# Decoded embeddings keyed by URL, so each file is fetched at most once per
# process instead of on every call.
_EMBEDDING_CACHE = {}


def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Fetch a speaker embedding and return it as a ``(1, N)`` float32 tensor.

    The response body is interpreted as a flat array of 32-bit floats in the
    machine's native byte order (NOTE(review): the hosted files are presumably
    little-endian -- confirm if this ever runs on a big-endian host).

    Args:
        url: Location of the raw embedding file.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A fresh tensor with a leading dimension of size 1 added. The first
        successful download for a URL is cached; later calls return a copy of
        the cached tensor, so a file that changes remotely is not re-read
        until the process restarts.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If ``torch.frombuffer`` rejects the payload (e.g. it is
            empty or not a whole number of float32 values).
    """
    cached = _EMBEDDING_CACHE.get(url)
    if cached is None:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # ``response.content`` is immutable ``bytes``; torch.frombuffer warns
        # on read-only buffers, so hand it a writable copy instead.
        payload = bytearray(response.content)
        cached = torch.frombuffer(payload, dtype=torch.float32).unsqueeze(0)
        _EMBEDDING_CACHE[url] = cached
    # Return a copy so a caller that modifies the result cannot corrupt the
    # cached embedding.
    return cached.clone()
def smooth_audio_nodejs(audio: np.ndarray, window_size: int) -> np.ndarray:
    """Return a moving-average smoothed copy of a 1-D signal.

    Each output sample is the mean of the input samples from
    ``i - window_size // 2`` through ``i + window_size // 2`` inclusive.
    Near the edges the window is truncated to the samples that exist, so the
    mean is taken over fewer values rather than over padding.

    Args:
        audio: One-dimensional sample array. It is not modified.
        window_size: Nominal window width. Only ``window_size // 2`` is used,
            so the effective width is ``2 * (window_size // 2) + 1``. Values
            below 2 (including negatives) leave the signal unchanged.

    Returns:
        A new array with the same length and dtype as ``audio``.
    """
    half = max(window_size // 2, 0)
    n = len(audio)
    if n == 0 or half == 0:
        # Nothing to average: empty input or a window of a single sample.
        return np.copy(audio)

    # Windowed sums computed in one pass. In "full" mode, element ``i + half``
    # is the sum over the window centred on sample ``i``, already truncated
    # at both ends of the signal.
    kernel = np.ones(2 * half + 1)
    sums = np.convolve(audio, kernel, mode="full")[half:half + n]

    # Number of real samples inside each (possibly truncated) window.
    idx = np.arange(n)
    counts = np.minimum(idx + half + 1, n) - np.maximum(idx - half, 0)

    return (sums / counts).astype(audio.dtype)
def apply_fade_out(audio: np.ndarray, fade_samples: int = 256) -> np.ndarray:
    """Linearly fade the end of ``audio`` down to zero, in place.

    The last ``fade_samples`` samples are multiplied by a ramp running from
    1 to 0. The array is modified in place and also returned.

    Args:
        audio: Sample array to fade. Expected to hold floating-point samples,
            since the ramp is fractional.
        fade_samples: Requested fade length in samples. It is clamped to the
            range ``[0, len(audio)]``; a resulting length of 0 leaves the
            audio untouched.

    Returns:
        The same array object that was passed in.
    """
    fade_samples = max(0, min(fade_samples, len(audio)))
    if fade_samples == 0:
        # ``audio[-0:]`` would select the whole array, not an empty tail, and
        # fail to broadcast against a zero-length ramp.
        return audio
    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
    return audio
@app.get("/speak")
def speak(
    text: str = Query(..., description="Text to convert to speech"),
    speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)"),
):
    """Synthesize ``text`` with the selected speaker and return it as WAV.

    Pipeline: load the speaker embedding, generate a waveform with the
    SpeechT5 model and vocoder, normalize its peak to
    ``NORMALIZATION_LEVEL``, smooth it, fade out the tail, then encode it as
    a 16 kHz WAV file.

    Args:
        text: Text to convert to speech.
        speaker: Key into ``SPEAKER_EMBEDDINGS``; validated to 0-7 by the
            query declaration.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav``. Samples are
        16-bit PCM when ``BIT_DEPTH`` is ``"16"`` and 32-bit float otherwise.
    """
    speaker_embedding = load_speaker_embedding(SPEAKER_EMBEDDINGS[speaker])
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )
    audio = speech.numpy().astype(np.float32)

    # --- Normalize ---
    # ``np.max`` raises on an empty array, so treat empty output as silence.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = (audio / peak) * NORMALIZATION_LEVEL

    # --- Smooth audio ---
    audio = smooth_audio_nodejs(audio, SMOOTHING_WINDOW * 2)  # Slightly larger window

    # --- Fade out to remove clicks at the end ---
    fade_samples = min(512, len(audio) // 10)  # at most 512 samples (~32 ms at 16 kHz)
    if fade_samples > 0:
        # Outputs shorter than 10 samples yield a zero-length fade; skip it
        # rather than asking for a fade over an empty tail.
        audio = apply_fade_out(audio, fade_samples)

    # --- Bit depth ---
    use_pcm16 = BIT_DEPTH == "16"
    if use_pcm16:
        pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
        subtype = "PCM_16"
    else:
        pcm = audio
        subtype = "FLOAT"

    # --- Write WAV ---
    buf = io.BytesIO()
    sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")