"""SpeechT5 text-to-speech API.

Exposes a single GET /speak endpoint that synthesizes speech from text with
Microsoft's SpeechT5 model + HiFi-GAN vocoder and streams the result back
as a WAV file.
"""

import io

import numpy as np
import requests
import soundfile as sf
import torch
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

app = FastAPI(title="SpeechT5 TTS API")

# Peak amplitude after normalization (0.1 leaves generous headroom).
NORMALIZATION_LEVEL = 0.1
# Moving-average window (in samples) for light smoothing of the waveform.
SMOOTHING_WINDOW = 3
# Output sample format: "16" -> 16-bit PCM, anything else -> 32-bit float.
BIT_DEPTH = "32f"
# SpeechT5 always synthesizes at 16 kHz.
SAMPLE_RATE = 16000

# Loaded once at startup; requests reuse these module-level models.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def load_speaker_embedding(url: str) -> torch.Tensor:
    """Download a raw float32 x-vector speaker embedding as a (1, D) tensor.

    Raises:
        requests.HTTPError: if the download fails.
        requests.Timeout: if the server does not respond within 30 s.
    """
    # Timeout so a hung download cannot block module import indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Copy into a writable bytearray: torch.frombuffer on immutable `bytes`
    # emits a UserWarning and yields a tensor over read-only memory.
    embedding = torch.frombuffer(bytearray(response.content), dtype=torch.float32)
    return embedding.unsqueeze(0)


speaker_embeddings = load_speaker_embedding(
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)


def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
    """Apply a centered moving-average filter of `window_size` samples.

    Returns an array of the same length as `audio`; edges are padded by
    repeating boundary values. A window smaller than 2 is a no-op.
    """
    if window_size < 2:
        return audio
    # Cumulative-sum trick computes the moving average in O(n).
    cumsum = np.cumsum(np.insert(audio, 0, 0))
    smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    return np.pad(smoothed, (pad_left, pad_right), mode='edge')


@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """Synthesize `text` to speech and stream it back as audio/wav."""
    inputs = processor(text=text, return_tensors="pt")
    # Inference only — no_grad avoids building an autograd graph per request.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
    audio = speech.numpy().astype(np.float32)
    audio = smooth_audio(audio, SMOOTHING_WINDOW)

    # Peak-normalize; guard against division by zero on silent output.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = (audio / peak) * NORMALIZATION_LEVEL

    if BIT_DEPTH == "16":
        pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
    else:
        pcm = audio

    buf = io.BytesIO()
    subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
    sf.write(buf, pcm, samplerate=SAMPLE_RATE, format="WAV", subtype=subtype)
    buf.seek(0)
    # BUG FIX: the original had a bare `return` followed by the
    # StreamingResponse call as a separate unreachable statement, so the
    # endpoint returned None instead of the audio stream.
    return StreamingResponse(buf, media_type="audio/wav")