# Source: Silero_TTS / app.py
# Last commit: "Upload app.py" (3a56178, verified) - farhananis005
# app.py
import gradio as gr
import torch
# --- Global Settings & Model Caching ---
# Models already loaded in this process, keyed by model ID.
MODEL_CACHE: dict = dict()
# Device every loaded model is moved to.
DEVICE = torch.device("cpu")
# Model that is loaded at startup and preselected in the model dropdown.
DEFAULT_MODEL_ID = "v3_en_indic"
# --- Model Loading Logic ---
def load_model(model_id, language):
    """Return the Silero TTS model for ``model_id``, loading it on first use.

    A model that is not yet in ``MODEL_CACHE`` is fetched through
    ``torch.hub`` from ``snakers4/silero-models``, moved to ``DEVICE`` and
    stored in the cache. Later calls with the same ID return the cached
    object.

    Args:
        model_id: Model identifier; passed to the hub entry point as
            ``speaker`` and used as the cache key.
        language: Language code passed to the hub entry point.

    Returns:
        The loaded model object.
    """
    if model_id not in MODEL_CACHE:
        print(f"Loading model '{model_id}' from torch.hub...")
        # The hub entry point returns a pair; only the model is needed here.
        tts_model, _unused = torch.hub.load(
            repo_or_dir='snakers4/silero-models',
            model='silero_tts',
            language=language,
            speaker=model_id,
        )
        tts_model.to(DEVICE)
        MODEL_CACHE[model_id] = tts_model
        return tts_model

    print(f"Loading model '{model_id}' from cache.")
    return MODEL_CACHE[model_id]
def get_model_details(model_id):
    """Map a model ID to the language code used when loading it.

    Only ``'v4_indic'`` maps to ``'indic'``; every other ID maps to ``'en'``.
    """
    return 'indic' if model_id == 'v4_indic' else 'en'
def change_model(model_id):
    """Load the newly selected model and refresh the speaker dropdown.

    Args:
        model_id: Model identifier chosen in the model dropdown.

    Returns:
        A ``(model, update)`` pair: the loaded model, and a ``gr.update``
        that replaces the speaker choices and resets the selection to
        ``'random'``.
    """
    language = get_model_details(model_id)
    model = load_model(model_id, language)
    # NOTE(review): Silero v3 speaker lists appear to contain 'random'
    # already - confirm per model. Filtering it out keeps the dropdown from
    # showing the entry twice and is a no-op when it is absent.
    speakers = ['random'] + [name for name in model.speakers if name != 'random']
    return model, gr.update(choices=speakers, value='random')
# --- Core TTS Function ---
def generate_audio(model, text, speaker, apply_accent):
    """Synthesize speech for ``text`` with the given model and speaker.

    Args:
        model: Loaded TTS model exposing ``apply_tts``, or ``None`` when no
            model is available.
        text: Text to synthesize. ``None``, empty and whitespace-only input
            is rejected. Input that starts with ``<speak`` is sent to the
            model as SSML.
        speaker: Speaker name passed to ``apply_tts``.
        apply_accent: Passed to ``apply_tts`` as ``put_accent``.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, samples)``
        on success and ``None`` otherwise; ``status`` is a message for the
        status box.
    """
    if model is None:
        return None, "Error: Model not loaded. Please select a model from the dropdown."
    # Guard against None as well as blank input so .strip() cannot raise.
    if not text or not text.strip():
        return None, "Please enter some text to generate audio."

    print(f"Generating audio for: '{text}' with speaker: '{speaker}'")
    sample_rate = 48000

    # The UI advertises SSML support, but Silero only parses SSML when it is
    # passed through the dedicated ``ssml_text`` argument instead of ``text``.
    stripped = text.strip()
    if stripped.lower().startswith('<speak'):
        text_kwargs = {'ssml_text': stripped}
    else:
        text_kwargs = {'text': text}

    try:
        audio_tensor = model.apply_tts(speaker=speaker,
                                       sample_rate=sample_rate,
                                       put_accent=apply_accent,
                                       **text_kwargs)
        return (sample_rate, audio_tensor.numpy()), f"Successfully generated for: '{text}'"
    except Exception as e:
        # UI boundary: report any synthesis failure in the status box rather
        # than letting the event handler crash.
        print(f"Error during TTS generation: {e}")
        return None, f"An error occurred: {e}"
# --- Load the initial model at startup ---
# Load the default model once at import time so the UI starts with a usable
# model and a populated speaker list.
initial_model = load_model(DEFAULT_MODEL_ID, get_model_details(DEFAULT_MODEL_ID))
# NOTE(review): Silero v3 speaker lists appear to contain 'random' already -
# confirm per model. Filtering it out avoids a duplicate dropdown entry.
initial_speakers = ['random'] + [
    name for name in initial_model.speakers if name != 'random'
]
# --- Gradio UI Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header. The title emoji was mis-encoded in the original text and
    # is restored here.
    gr.Markdown(
        """
        # 🎙️ Silero Multi-Model Text-to-Speech
        Select a model, choose a speaker, and enter text to generate speech.
        **Note:** `v3` models support [SSML tags](https://github.com/snakers4/silero-models?tab=readme-ov-file#ssml-support) for advanced control (e.g., `<speak><prosody rate='x-slow'>slow speech</prosody></speak>`).
        """
    )

    # Holds the currently selected model; seeded with the model loaded at
    # startup and replaced by change_model.
    model_state = gr.State(initial_model)

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=2):
            model_selector = gr.Dropdown(
                label="Select Model",
                choices=['v3_en_indic', 'v4_indic', 'v3_en'],
                value=DEFAULT_MODEL_ID
            )
            text_input = gr.Textbox(
                label="Text to Synthesize (Supports SSML for V3 models)",
                placeholder="Hello, welcome to my text to speech app.",
                lines=4
            )
            speaker_dropdown = gr.Dropdown(
                label="Select Speaker Voice",
                choices=initial_speakers,
                value='random'
            )
            accent_checkbox = gr.Checkbox(label="Apply Accent Stress", value=True)
            generate_btn = gr.Button("Generate Audio", variant="primary")
        # Right column: outputs.
        with gr.Column(scale=1):
            status_text = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(label="Generated Audio")

    # --- Event Handling ---
    # Switching the model swaps the stored model and rebuilds the speaker list.
    model_selector.change(
        fn=change_model,
        inputs=model_selector,
        outputs=[model_state, speaker_dropdown]
    )
    generate_btn.click(
        fn=generate_audio,
        inputs=[model_state, text_input, speaker_dropdown, accent_checkbox],
        outputs=[audio_output, status_text]
    )

    # NOTE(review): an example sets model_selector and speaker_dropdown
    # together, while the model change handler resets the speaker to
    # 'random' - confirm the example's speaker survives.
    gr.Examples(
        examples=[
            ["v3_en_indic", "Welcome to this demonstration of advanced speech synthesis technology.", "random", True],
            ["v4_indic", "Aapka shubh naam kya hai?", "hindi_female", True],
            ["v3_en", "I live in India", "en_10", True],
        ],
        inputs=[model_selector, text_input, speaker_dropdown, accent_checkbox],
    )
# Start the Gradio server only when this file is executed as a script.
if __name__ == '__main__':
    demo.launch()