coquiAPI

Sleeping

App Files Files Community

anuj-exe committed on Oct 7

Commit

fed065a

verified ·

1 Parent(s): 43990f7

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -66

app.py CHANGED Viewed

@@ -1,91 +1,134 @@
 from TTS.api import TTS
 import time
-import os
-import uuid
-import shutil
-import tempfile
-from fastapi import FastAPI
-from fastapi.responses import FileResponse
-from pydantic import BaseModel
-# Initialize FastAPI
-app = FastAPI(title="YourTTS API")
-# Paths to speaker WAVs in repo
-SPEAKERS = {
-    "male": "speakers/voice1.wav",
-    "female": "speakers/voice2.wav"
 }
-# Load YourTTS model once at startup
-TTS_MODEL_PATH = "tts_models/multilingual/multi-dataset/your_tts"
-tts = TTS(TTS_MODEL_PATH, gpu=False)
-# Pydantic model for request
-class TTSRequest(BaseModel):
-    text: str
-    speaker: str  # "male" or "female"
-@app.post("/synthesize")
-def synthesize_tts(request: TTSRequest):
-    text = request.text
-    speaker_choice = request.speaker.lower()
-    if speaker_choice not in SPEAKERS:
-        return {"error": f"Invalid speaker '{speaker_choice}'. Choose 'male' or 'female'."}
-    repo_speaker_path = SPEAKERS[speaker_choice]
-    # Create a temporary WAV copy to simulate an uploaded file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-        temp_speaker_path = tmp_file.name
-    shutil.copyfile(repo_speaker_path, temp_speaker_path)
-    # Generate unique output file for this request
-    output_path = f"output_{uuid.uuid4().hex}.wav"
     start_time = time.time()
     try:
-        # Generate TTS using the temporary speaker WAV
-        tts.tts_to_file(
-            text=text,
-            speaker_wav=temp_speaker_path,
-            file_path=output_path,
-            language="en"
-        )
-        # Verify the file exists and is non-empty
-        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
-            return {"error": "TTS generation failed, output file is empty."}
-    except Exception as e:
-        return {"error": str(e)}
-    finally:
-        # Clean up the temporary speaker WAV
-        if os.path.exists(temp_speaker_path):
-            os.remove(temp_speaker_path)
     total_time = time.time() - start_time
-    est_duration = len(text.split()) / 2.5  # rough estimate
     rtf = round(total_time / est_duration, 3)
-    metadata = {
-        "language": "English",
         "processing_time_sec": round(total_time, 3),
         "real_time_factor": rtf,
-        "model_used": TTS_MODEL_PATH,
-        "speaker_used": speaker_choice
     }
-    # Return the audio file
-    return FileResponse(
-        output_path,
-        media_type="audio/wav",
-        filename="output.wav",
-        headers={"X-Metadata": str(metadata)}
-    )
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)

+import gradio as gr
 from TTS.api import TTS
 import time
+# Available models
+default_models = {
+    "FastPitch (Female - LJSpeech)": "tts_models/en/ljspeech/fast_pitch",
+    "Glow-TTS (Female - LJSpeech)": "tts_models/en/ljspeech/glow-tts",
+    "Tactron2 (Female - LJSpeaker)": "tts_models/en/ljspeech/tacotron2-DDC",
+    "VCTK (Multi-speaker)": "tts_models/en/vctk/vits",
+    "YourTTS (Cloning + Multi-speaker)": "tts_models/multilingual/multi-dataset/your_tts",
 }
+# Supported speaker IDs for VCTK
+vctk_speakers = ["p225", "p227", "p229", "p230", "p233", "p234", "p236"]
+# Language display name -> model language code
+language_map = {
+    "English": "en",
+    "French": "fr-fr",
+    "Portuguese": "pt-br",
+    "Hindi": "hi",         # Not supported in YourTTS
+    "Japanese": "ja"       # Not supported in YourTTS
+}
+# Supported languages for YourTTS
+yourtts_supported_languages = ["en", "fr-fr", "pt-br"]
+# Initial model setup
+current_model_key = list(default_models.values())[0]
+tts = TTS(current_model_key, gpu=False)
+def synthesize(text, selected_model, speaker_id, custom_model_url, speaker_wav_path, selected_language):
+    global tts, current_model_key
+    model_path = custom_model_url if custom_model_url else default_models[selected_model]
+    # Load the model only if different from current
+    if model_path != current_model_key:
+        tts = TTS(model_path, gpu=False)
+        current_model_key = model_path
+    output_path = "output.wav"
     start_time = time.time()
+    lang_code = language_map.get(selected_language, "en")
+    speaker_info = "Default"
     try:
+        if "your_tts" in model_path.lower():
+            if lang_code not in yourtts_supported_languages:
+                raise ValueError(f"❌ '{selected_language}' is not supported by YourTTS. Please choose from English, French, or Portuguese.")
+            if not speaker_wav_path:
+                raise ValueError("❌ Speaker WAV file is required for cloning with YourTTS.")
+            tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, file_path=output_path, language=lang_code)
+            speaker_info = f"WAV Upload: {speaker_wav_path.split('/')[-1]}"
+        elif "vctk" in model_path.lower() and speaker_id and speaker_id != "None":
+            tts.tts_to_file(text=text, speaker=speaker_id, file_path=output_path)
+            speaker_info = speaker_id
+        else:
+            tts.tts_to_file(text=text, file_path=output_path)
+    except ValueError as e:
+        return None, {"error": str(e)}
     total_time = time.time() - start_time
+    est_duration = len(text.split()) / 2.5
     rtf = round(total_time / est_duration, 3)
+    return output_path, {
+        "language_selected": selected_language,
         "processing_time_sec": round(total_time, 3),
         "real_time_factor": rtf,
+        "model_used": model_path,
+        "speaker_used": speaker_info
     }
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 🗣️ TTS App (Model, Speaker, Language, Cloning, API-ready)")
+    with gr.Row():
+        input_text = gr.Textbox(label="Text", placeholder="Type something...", lines=3)
+    with gr.Row():
+        language_dropdown = gr.Dropdown(
+            choices=list(language_map.keys()),
+            value="English",
+            label="Select Language"
+        )
+    with gr.Row():
+        model_dropdown = gr.Dropdown(choices=list(default_models.keys()), label="Select TTS Model")
+        speaker_dropdown = gr.Dropdown(choices=["None"] + vctk_speakers, label="Speaker ID (for VCTK)")
+    custom_model_box = gr.Textbox(label="Custom Model URL or Path (optional)")
+    speaker_wav = gr.Audio(label="Upload Speaker Voice (WAV, 5–10s)", type="filepath")
+    with gr.Row():
+        generate_btn = gr.Button("🔊 Generate Speech")
+    output_audio = gr.Audio(label="Output Audio", type="filepath")
+    metadata_json = gr.JSON(label="Meta Info (Time, Model, RTF, Language / Error)")
+    generate_btn.click(
+        fn=synthesize,
+        inputs=[input_text, model_dropdown, speaker_dropdown, custom_model_box, speaker_wav, language_dropdown],
+        outputs=[output_audio, metadata_json]
+    )
+    gr.Markdown("### 🔌 API Access Available")
+# API Interface
+api = gr.Interface(
+    fn=synthesize,
+    inputs=[
+        gr.Text(),  # text
+        gr.Text(),  # model
+        gr.Text(),  # speaker id
+        gr.Text(),  # custom model url
+        gr.Audio(type="filepath"),  # speaker wav
+        gr.Text()   # language
+    ],
+    outputs=[gr.Audio(type="filepath"), gr.JSON()],
+)
+# Enable queuing on both interfaces; only `demo` is actually launched below
+demo.queue()
+api.queue()
+demo.launch()