coquiAPI

Sleeping

App Files Files Community

Samit-khedekar committed on Jun 11

Commit

c30d3ee

verified ·

1 Parent(s): 7f3065e

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -37

app.py CHANGED Viewed

@@ -1,57 +1,87 @@
 import gradio as gr
 from TTS.api import TTS
 import time
-import os
-# Available voice models
-models = {
-    "fast_pitch": "tts_models/en/ljspeech/fast_pitch",
-    "tacotron2": "tts_models/en/ljspeech/tacotron2-DDC",
-    "glow_tts": "tts_models/en/ljspeech/glow-tts"
 }
-# Current model loaded (default)
-current_model_key = "fast_pitch"
-tts = TTS(models[current_model_key], gpu=False)
-# Synthesize function
-def synthesize(text, selected_model=None):
     global tts, current_model_key
-    # Switch model if needed
-    if selected_model and selected_model != current_model_key:
-        current_model_key = selected_model
-        tts = TTS(models[current_model_key], gpu=False)
     output_path = "output.wav"
     start_time = time.time()
-    tts.tts_to_file(text=text, file_path=output_path)
-    total_time = time.time() - start_time
-    # Calculate RTF (approximate)
-    audio_duration = len(text.split()) / 2.5  # est. 2.5 words/sec
-    rtf = round(total_time / audio_duration, 3)
     return output_path, {
         "processing_time_sec": round(total_time, 3),
         "real_time_factor": rtf,
-        "model_used": current_model_key
     }
-# Gradio API
-api = gr.Interface(
-    fn=synthesize,
-    inputs=[
-        gr.Textbox(label="Input Text"),
-        gr.Dropdown(list(models.keys()), value="fast_pitch", label="Select Voice Model"),
-    ],
-    outputs=[
-        gr.Audio(type="filepath", label="Synthesized Audio"),
-        gr.JSON(label="Meta Info (Time, Model, RTF)")
-    ],
-    title="🗣️ TTS API with Model Selector",
-    description="Send text to convert it to speech and get metadata via Gradio API. Change the voice model dynamically."
-)
-api.launch()

 import gradio as gr
 from TTS.api import TTS
 import time
+# Available models
+default_models = {
+    "FastPitch (Female - LJSpeech)": "tts_models/en/ljspeech/fast_pitch",
+    "Glow-TTS (Female - LJSpeech)": "tts_models/en/ljspeech/glow-tts",
+     "Tactron2 (Female- LJSpeaker)": "tts_models/en/ljspeech/tacotron2-DDC",
+    "VCTK (Multi-speaker)": "tts_models/en/vctk/vits",
+    "YourTTS (Cloning + Multi-speaker)": "tts_models/multilingual/multi-dataset/your_tts",
 }
+# Example speaker IDs (VCTK)
+vctk_speakers = ["p225", "p227", "p229", "p230", "p233", "p234", "p236"]
+# Default state
+current_model_key = list(default_models.values())[0]
+tts = TTS(current_model_key, gpu=False)
+def synthesize(text, selected_model, speaker_id, custom_model_url, speaker_wav_path):
     global tts, current_model_key
+    # Decide model
+    if custom_model_url:
+        model_path = custom_model_url
+    else:
+        model_path = default_models[selected_model]
+    if model_path != current_model_key:
+        tts = TTS(model_path, gpu=False)
+        current_model_key = model_path
     output_path = "output.wav"
     start_time = time.time()
+    # Handle speaker cloning
+    if "your_tts" in model_path.lower() and speaker_wav_path:
+        tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, file_path=output_path)
+        speaker_info = f"WAV Upload: {speaker_wav_path.split('/')[-1]}"
+    elif "vctk" in model_path.lower() and speaker_id and speaker_id != "None":
+        tts.tts_to_file(text=text, speaker=speaker_id, file_path=output_path)
+        speaker_info = speaker_id
+    else:
+        tts.tts_to_file(text=text, file_path=output_path)
+        speaker_info = "Default"
+    total_time = time.time() - start_time
+    est_duration = len(text.split()) / 2.5
+    rtf = round(total_time / est_duration, 3)
     return output_path, {
         "processing_time_sec": round(total_time, 3),
         "real_time_factor": rtf,
+        "model_used": model_path,
+        "speaker_used": speaker_info
     }
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 🗣️ TTS App with Model + Speaker Selection + Cloning")
+    with gr.Row():
+        input_text = gr.Textbox(label="Text", placeholder="Type something...", lines=3)
+    with gr.Row():
+        model_dropdown = gr.Dropdown(choices=list(default_models.keys()), label="Select TTS Model")
+        speaker_dropdown = gr.Dropdown(choices=["None"] + vctk_speakers, label="Speaker ID (for VCTK)")
+    custom_model_box = gr.Textbox(label="Custom Model URL or Path (optional)")
+    speaker_wav = gr.Audio(label="Upload Speaker Voice (WAV, 5–10s)", type="filepath")
+    with gr.Row():
+        generate_btn = gr.Button("🔊 Generate Speech")
+    output_audio = gr.Audio(label="Output Audio", type="filepath")
+    metadata_json = gr.JSON(label="Meta Info (Time, Model, RTF)")
+    generate_btn.click(
+        fn=synthesize,
+        inputs=[input_text, model_dropdown, speaker_dropdown, custom_model_box, speaker_wav],
+        outputs=[output_audio, metadata_json]
+    )
+demo.launch()