Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, Query | |
| from fastapi.responses import StreamingResponse | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| import torch | |
| import io | |
| import soundfile as sf | |
| import requests | |
| import numpy as np | |
# FastAPI application object for the text-to-speech service.
app = FastAPI(
    title="SpeechT5 TTS API",
)
# --- Adjustable audio post-processing parameters ---

# Peak absolute amplitude that the generated audio is rescaled to.
NORMALIZATION_LEVEL: float = 0.1

# Base width of the moving-average smoothing filter; the request handler
# passes twice this value as the window size.
SMOOTHING_WINDOW: int = 7

# Output sample format: "16" selects 16-bit PCM, any other value selects
# 32-bit float WAV output.
BIT_DEPTH: str = "32f"
# Checkpoints are loaded once, when this module is imported, so every request
# reuses the same processor, acoustic model and vocoder instances.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
# Speaker ID -> URL of the raw speaker-embedding file for that voice.
# IDs 1-7 all live in the same dataset repository, so only the file name varies.
_XVECTOR_BASE_URL = (
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/"
)

SPEAKER_EMBEDDINGS = {
    # Normal
    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",
    # US female 1
    1: _XVECTOR_BASE_URL + "cmu_us_slt_arctic-wav-arctic_a0001.bin",
    # US female 2
    2: _XVECTOR_BASE_URL + "cmu_us_clb_arctic-wav-arctic_a0001.bin",
    # US male 1
    3: _XVECTOR_BASE_URL + "cmu_us_bdl_arctic-wav-arctic_a0003.bin",
    # US male 2
    4: _XVECTOR_BASE_URL + "cmu_us_rms_arctic-wav-arctic_a0003.bin",
    # Canadian male
    5: _XVECTOR_BASE_URL + "cmu_us_jmk_arctic-wav-arctic_a0002.bin",
    # Scottish male
    6: _XVECTOR_BASE_URL + "cmu_us_awb_arctic-wav-arctic_b0002.bin",
    # Indian male
    7: _XVECTOR_BASE_URL + "cmu_us_ksp_arctic-wav-arctic_a0007.bin",
}
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it as a batched tensor.

    The response body is interpreted as a headerless, flat sequence of 32-bit
    floats.

    Args:
        url: HTTP(S) location of the embedding file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A float32 tensor of shape ``(1, N)``, where ``N`` is the number of
        floats in the downloaded payload.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an error status.
        ValueError: If the payload is empty or its size is not a multiple of
            four bytes.
    """
    # Without a timeout a stalled remote host would block the caller forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # ``response.content`` is immutable ``bytes``; torch.frombuffer expects a
    # writable buffer, so hand it a mutable copy instead.
    payload = bytearray(response.content)
    if not payload or len(payload) % 4 != 0:
        raise ValueError(
            f"Speaker embedding at {url} has invalid size {len(payload)} bytes"
        )

    embedding = torch.frombuffer(payload, dtype=torch.float32)
    # Add the leading batch dimension.
    return embedding.unsqueeze(0)
def smooth_audio_nodejs(audio: np.ndarray, window_size: int) -> np.ndarray:
    """Smooth a 1-D signal with a centered moving average.

    Each output sample is the mean of the input samples at most
    ``window_size // 2`` positions away on either side, so the effective
    window is ``2 * (window_size // 2) + 1`` samples wide. Near the edges the
    window is truncated to the samples that exist (no zero padding).

    Args:
        audio: One-dimensional array of samples. It is not modified.
        window_size: Nominal window width in samples. Values below 2 leave the
            signal unchanged.

    Returns:
        A new array with the same length and dtype as ``audio``.
    """
    samples = np.asarray(audio)
    count = len(samples)
    half = window_size // 2
    if count == 0 or half <= 0:
        return np.copy(samples)

    # Once the half-width reaches count - 1 every window already spans the
    # whole signal, so a larger kernel would only waste work.
    half = min(half, count - 1)

    # Windowed sums via one full convolution with a box kernel; entry
    # ``i + half`` of the full result is the sum over [i - half, i + half].
    kernel = np.ones(2 * half + 1, dtype=np.float64)
    window_sums = np.convolve(samples.astype(np.float64), kernel)[half:half + count]

    # Number of real samples inside each (possibly edge-truncated) window.
    idx = np.arange(count)
    window_lengths = np.minimum(idx + half + 1, count) - np.maximum(idx - half, 0)

    return (window_sums / window_lengths).astype(samples.dtype)
def apply_fade_out(audio: np.ndarray, fade_samples: int = 256) -> np.ndarray:
    """Apply a linear fade-out to the end of ``audio``, in place.

    The last ``fade_samples`` samples are multiplied by a ramp going linearly
    from 1 down to 0.

    Args:
        audio: Array of samples. It is modified in place.
        fade_samples: Length of the fade in samples. Values larger than the
            signal are clamped to its length; zero or negative values leave
            the signal untouched.

    Returns:
        The same ``audio`` array that was passed in.
    """
    # Clamp to the available samples. Note that ``audio[-0:]`` would select the
    # WHOLE array, so a zero-length fade must be handled before slicing.
    fade_len = min(max(int(fade_samples), 0), len(audio))
    if fade_len == 0:
        return audio

    audio[-fade_len:] *= np.linspace(1.0, 0.0, fade_len)
    return audio
def speak(
    text: str = Query(..., description="Text to convert to speech"),
    speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)"),
):
    """Synthesize ``text`` with the selected voice and return it as WAV audio.

    Steps: fetch the speaker embedding, run the TTS model with the vocoder,
    peak-normalize to ``NORMALIZATION_LEVEL``, smooth with a moving average,
    fade out the tail, then encode the samples as a WAV file.

    Args:
        text: Text to convert to speech (required query parameter).
        speaker: Key into ``SPEAKER_EMBEDDINGS``; validated to 0-7, default 1.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav``.
    """
    # NOTE(review): no route decorator (e.g. ``@app.get(...)``) is visible on
    # this handler in the reviewed source - confirm it is registered on ``app``.
    sample_rate = 16000  # sample rate written into the WAV output

    speaker_embedding = load_speaker_embedding(SPEAKER_EMBEDDINGS[speaker])
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    # detach()/cpu() change nothing for a plain CPU tensor, but keep the
    # conversion valid if the model is ever moved to another device.
    audio = speech.detach().cpu().numpy().astype(np.float32)

    # --- Normalize ---
    # np.max raises on an empty array, so treat "no samples" as silence.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = (audio / peak) * NORMALIZATION_LEVEL

    # --- Smooth audio ---
    audio = smooth_audio_nodejs(audio, SMOOTHING_WINDOW * 2)  # Slightly larger window

    # --- Fade out to remove clicks at the end ---
    fade_samples = min(512, len(audio) // 10)  # at most 512 samples (~32 ms at 16 kHz)
    # Clips shorter than 10 samples give a zero-length fade; skip it rather
    # than asking for a fade over an empty ramp.
    if fade_samples > 0:
        audio = apply_fade_out(audio, fade_samples)

    # --- Bit depth ---
    # Decide the sample format once so data and WAV subtype cannot disagree.
    if BIT_DEPTH == "16":
        pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
        subtype = "PCM_16"
    else:
        pcm = audio
        subtype = "FLOAT"

    # --- Write WAV ---
    buf = io.BytesIO()
    sf.write(buf, pcm, samplerate=sample_rate, format="WAV", subtype=subtype)
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")