# app.py
"""Gradio front end for Silero text-to-speech models loaded through torch.hub."""

import gradio as gr
import torch

# --- Global Settings & Model Caching ---
MODEL_CACHE = {}  # model_id -> loaded model, so each model is fetched only once
DEVICE = torch.device('cpu')
DEFAULT_MODEL_ID = 'v3_en_indic'
SAMPLE_RATE = 48000  # sample rate requested from apply_tts and reported to gr.Audio


# --- Model Loading Logic ---
def load_model(model_id, language):
    """Return the Silero model for ``model_id``, loading it on first use.

    Args:
        model_id: Value passed to torch.hub as ``speaker``; also the cache key.
        language: Value passed to torch.hub as ``language``.

    Returns:
        The model object, moved to ``DEVICE`` and stored in ``MODEL_CACHE``.
    """
    if model_id in MODEL_CACHE:
        print(f"Loading model '{model_id}' from cache.")
        return MODEL_CACHE[model_id]

    print(f"Loading model '{model_id}' from torch.hub...")
    # The second element returned by the hub entry point is not used here.
    model, _ = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=language,
        speaker=model_id,
    )
    model.to(DEVICE)
    MODEL_CACHE[model_id] = model
    return model


def get_model_details(model_id):
    """Return the language code for ``model_id``.

    ``'v4_indic'`` maps to ``'indic'``; every other ID maps to ``'en'``.
    """
    if model_id == 'v4_indic':
        return 'indic'
    return 'en'


def _speaker_choices(model):
    """Return the dropdown choices for ``model``: 'random' first, no duplicates."""
    # dict.fromkeys keeps first-seen order, so 'random' stays at the top and
    # is not repeated if the model's own speaker list already contains it.
    return list(dict.fromkeys(['random', *model.speakers]))


def change_model(model_id):
    """Load the newly selected model and refresh the speaker dropdown.

    Args:
        model_id: The model chosen in the model dropdown.

    Returns:
        A ``(model, dropdown_update)`` pair: the loaded model for the state
        component, and a ``gr.update`` that replaces the speaker choices and
        resets the selection to ``'random'``.
    """
    language = get_model_details(model_id)
    model = load_model(model_id, language)
    return model, gr.update(choices=_speaker_choices(model), value='random')


# --- Core TTS Function ---
def generate_audio(model, text, speaker, apply_accent):
    """Generate audio from text using the selected model and speaker.

    Args:
        model: Loaded model exposing ``apply_tts``; ``None`` if nothing is loaded.
        text: Text to synthesize.
        speaker: Speaker name passed to ``apply_tts``.
        apply_accent: Passed to ``apply_tts`` as ``put_accent``.

    Returns:
        ``((sample_rate, samples), status)`` on success, or ``(None, status)``
        when the input is unusable or synthesis raises.
    """
    if model is None:
        return None, "Error: Model not loaded. Please select a model from the dropdown."
    # Guard against None as well as empty/whitespace-only input.
    if not text or not text.strip():
        return None, "Please enter some text to generate audio."

    print(f"Generating audio for: '{text}' with speaker: '{speaker}'")
    try:
        audio_tensor = model.apply_tts(
            text=text,
            speaker=speaker,
            sample_rate=SAMPLE_RATE,
            put_accent=apply_accent,
        )
        return (SAMPLE_RATE, audio_tensor.numpy()), f"Successfully generated for: '{text}'"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of crashing.
        print(f"Error during TTS generation: {e}")
        return None, f"An error occurred: {e}"


# --- Load the initial model at startup ---
initial_model = load_model(DEFAULT_MODEL_ID, get_model_details(DEFAULT_MODEL_ID))
initial_speakers = _speaker_choices(initial_model)

# --- Gradio UI Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Silero Multi-Model Text-to-Speech
        Select a model, choose a speaker, and enter text to generate speech.

        **Note:** `v3` models support [SSML tags](https://github.com/snakers4/silero-models?tab=readme-ov-file#ssml-support) for advanced control (e.g., `slow speech`).
        """
    )

    model_state = gr.State(initial_model)

    with gr.Row():
        with gr.Column(scale=2):
            model_selector = gr.Dropdown(
                label="Select Model",
                choices=['v3_en_indic', 'v4_indic', 'v3_en'],
                value=DEFAULT_MODEL_ID
            )
            text_input = gr.Textbox(
                label="Text to Synthesize (Supports SSML for V3 models)",
                placeholder="Hello, welcome to my text to speech app.",
                lines=4
            )
            speaker_dropdown = gr.Dropdown(
                label="Select Speaker Voice",
                choices=initial_speakers,
                value='random'
            )
            accent_checkbox = gr.Checkbox(label="Apply Accent Stress", value=True)
            generate_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column(scale=1):
            status_text = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(label="Generated Audio")

    # --- Event Handling ---
    model_selector.change(
        fn=change_model,
        inputs=model_selector,
        outputs=[model_state, speaker_dropdown]
    )
    generate_btn.click(
        fn=generate_audio,
        inputs=[model_state, text_input, speaker_dropdown, accent_checkbox],
        outputs=[audio_output, status_text]
    )

    gr.Examples(
        examples=[
            ["v3_en_indic", "Welcome to this demonstration of advanced speech synthesis technology.", "random", True],
            ["v4_indic", "Aapka shubh naam kya hai?", "hindi_female", True],
            ["v3_en", "I live in India", "en_10", True],
        ],
        inputs=[model_selector, text_input, speaker_dropdown, accent_checkbox],
    )

if __name__ == "__main__":
    demo.launch()