anuj-exe committed on
Commit
fed065a
·
verified ·
1 Parent(s): 43990f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -66
app.py CHANGED
@@ -1,91 +1,134 @@
 
1
  from TTS.api import TTS
2
  import time
3
- import os
4
- import uuid
5
- import shutil
6
- import tempfile
7
- from fastapi import FastAPI
8
- from fastapi.responses import FileResponse
9
- from pydantic import BaseModel
10
-
11
- # Initialize FastAPI
12
- app = FastAPI(title="YourTTS API")
13
-
14
- # Paths to speaker WAVs in repo
15
- SPEAKERS = {
16
- "male": "speakers/voice1.wav",
17
- "female": "speakers/voice2.wav"
18
  }
19
 
20
- # Load YourTTS model once at startup
21
- TTS_MODEL_PATH = "tts_models/multilingual/multi-dataset/your_tts"
22
- tts = TTS(TTS_MODEL_PATH, gpu=False)
 
 
 
 
 
 
 
 
23
 
24
- # Pydantic model for request
25
- class TTSRequest(BaseModel):
26
- text: str
27
- speaker: str # "male" or "female"
28
 
29
- @app.post("/synthesize")
30
- def synthesize_tts(request: TTSRequest):
31
- text = request.text
32
- speaker_choice = request.speaker.lower()
33
 
34
- if speaker_choice not in SPEAKERS:
35
- return {"error": f"Invalid speaker '{speaker_choice}'. Choose 'male' or 'female'."}
36
 
37
- repo_speaker_path = SPEAKERS[speaker_choice]
38
 
39
- # Create a temporary WAV copy to simulate an uploaded file
40
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
41
- temp_speaker_path = tmp_file.name
42
- shutil.copyfile(repo_speaker_path, temp_speaker_path)
43
 
44
- # Generate unique output file for this request
45
- output_path = f"output_{uuid.uuid4().hex}.wav"
46
  start_time = time.time()
47
 
 
 
 
48
  try:
49
- # Generate TTS using the temporary speaker WAV
50
- tts.tts_to_file(
51
- text=text,
52
- speaker_wav=temp_speaker_path,
53
- file_path=output_path,
54
- language="en"
55
- )
56
 
57
- # Verify the file exists and is non-empty
58
- if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
59
- return {"error": "TTS generation failed, output file is empty."}
60
 
61
- except Exception as e:
62
- return {"error": str(e)}
63
- finally:
64
- # Clean up the temporary speaker WAV
65
- if os.path.exists(temp_speaker_path):
66
- os.remove(temp_speaker_path)
 
 
 
 
 
 
67
 
68
  total_time = time.time() - start_time
69
- est_duration = len(text.split()) / 2.5 # rough estimate
70
  rtf = round(total_time / est_duration, 3)
71
 
72
- metadata = {
73
- "language": "English",
74
  "processing_time_sec": round(total_time, 3),
75
  "real_time_factor": rtf,
76
- "model_used": TTS_MODEL_PATH,
77
- "speaker_used": speaker_choice
78
  }
79
 
80
- # Return the audio file
81
- return FileResponse(
82
- output_path,
83
- media_type="audio/wav",
84
- filename="output.wav",
85
- headers={"X-Metadata": str(metadata)}
86
- )
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- if __name__ == "__main__":
90
- import uvicorn
91
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from TTS.api import TTS
import time

# Display name -> Coqui TTS model identifier; keys double as the dropdown labels
# and as the model names API callers pass in.
# NOTE(review): "Tactron2 (Female - LJSpeaker)" looks like a typo for
# "Tacotron2 (Female - LJSpeech)" — left unchanged because the key is a
# user-facing string that external API callers may already depend on.
default_models = {
    "FastPitch (Female - LJSpeech)": "tts_models/en/ljspeech/fast_pitch",
    "Glow-TTS (Female - LJSpeech)": "tts_models/en/ljspeech/glow-tts",
    "Tactron2 (Female - LJSpeaker)": "tts_models/en/ljspeech/tacotron2-DDC",
    "VCTK (Multi-speaker)": "tts_models/en/vctk/vits",
    "YourTTS (Cloning + Multi-speaker)": "tts_models/multilingual/multi-dataset/your_tts",
}

# Supported speaker IDs for VCTK (offered in the speaker dropdown).
vctk_speakers = ["p225", "p227", "p229", "p230", "p233", "p234", "p236"]

# Language display name -> model language code.
language_map = {
    "English": "en",
    "French": "fr-fr",
    "Portuguese": "pt-br",
    "Hindi": "hi",  # Not supported in YourTTS
    "Japanese": "ja"  # Not supported in YourTTS
}

# Supported languages for YourTTS; synthesize() rejects any other selection
# when the YourTTS model is active.
yourtts_supported_languages = ["en", "fr-fr", "pt-br"]

# Initial model setup: load the first model in the table once at import time so
# the first request does not pay the model-load cost. current_model_key tracks
# which model the global `tts` instance currently holds.
current_model_key = list(default_models.values())[0]
tts = TTS(current_model_key, gpu=False)
 
32
 
33
def synthesize(text, selected_model, speaker_id, custom_model_url, speaker_wav_path, selected_language):
    """Generate speech for *text* and return ``(audio_path, metadata)``.

    Parameters (all supplied by the Gradio UI / API):
        text: the text to synthesize.
        selected_model: display-name key into ``default_models``.
        speaker_id: VCTK speaker ID, or "None"/empty when not applicable.
        custom_model_url: optional model path/URL that overrides the dropdown.
        speaker_wav_path: filepath of an uploaded reference WAV (required for
            YourTTS voice cloning), or None.
        selected_language: display-name key into ``language_map``.

    Returns:
        On success: ``("output.wav", metadata_dict)`` with timing info.
        On failure: ``(None, {"error": message})`` so the UI shows the error
        instead of crashing the request.
    """
    global tts, current_model_key

    # A custom model path, when given, overrides the dropdown selection.
    # Use .get() so an unknown dropdown value (e.g. via the raw API) yields a
    # clean error instead of a KeyError.
    model_path = custom_model_url if custom_model_url else default_models.get(selected_model)
    if not model_path:
        return None, {"error": f"Unknown model '{selected_model}'. Choose one of: {', '.join(default_models)}."}

    # (Re)load the model only when the selection changed — loading is expensive.
    # Wrapped in try/except so a bad custom model URL reports an error rather
    # than crashing the handler; keep loading outside the timed section below so
    # the reported processing time reflects synthesis only, as before.
    if model_path != current_model_key:
        try:
            tts = TTS(model_path, gpu=False)
            current_model_key = model_path
        except Exception as e:
            return None, {"error": f"Failed to load model '{model_path}': {e}"}

    output_path = "output.wav"

    start_time = time.time()

    lang_code = language_map.get(selected_language, "en")
    speaker_info = "Default"

    try:
        if "your_tts" in model_path.lower():
            # YourTTS needs a supported language and a reference WAV to clone.
            if lang_code not in yourtts_supported_languages:
                raise ValueError(f"❌ '{selected_language}' is not supported by YourTTS. Please choose from English, French, or Portuguese.")

            if not speaker_wav_path:
                raise ValueError("❌ Speaker WAV file is required for cloning with YourTTS.")

            tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, file_path=output_path, language=lang_code)
            speaker_info = f"WAV Upload: {speaker_wav_path.split('/')[-1]}"

        elif "vctk" in model_path.lower() and speaker_id and speaker_id != "None":
            # Multi-speaker VCTK: pass the chosen speaker ID through.
            tts.tts_to_file(text=text, speaker=speaker_id, file_path=output_path)
            speaker_info = speaker_id

        else:
            # Single-speaker models need no speaker/language arguments.
            tts.tts_to_file(text=text, file_path=output_path)

    except Exception as e:
        # Catch all synthesis failures (not just ValueError) so TTS runtime
        # errors are surfaced in the UI's JSON panel instead of propagating.
        return None, {"error": str(e)}

    total_time = time.time() - start_time
    # Rough duration estimate at ~2.5 words/second; clamp to avoid a
    # ZeroDivisionError when the text is empty or whitespace-only.
    est_duration = max(len(text.split()) / 2.5, 1e-6)
    rtf = round(total_time / est_duration, 3)

    return output_path, {
        "language_selected": selected_language,
        "processing_time_sec": round(total_time, 3),
        "real_time_factor": rtf,
        "model_used": model_path,
        "speaker_used": speaker_info
    }
81
 
82
# Gradio UI: interactive demo wiring the synthesize() handler to form inputs.
with gr.Blocks() as demo:
    gr.Markdown("## 🗣️ TTS App (Model, Speaker, Language, Cloning, API-ready)")

    with gr.Row():
        input_text = gr.Textbox(label="Text", placeholder="Type something...", lines=3)

    with gr.Row():
        language_dropdown = gr.Dropdown(
            choices=list(language_map.keys()),
            value="English",
            label="Select Language"
        )

    with gr.Row():
        model_dropdown = gr.Dropdown(choices=list(default_models.keys()), label="Select TTS Model")
        speaker_dropdown = gr.Dropdown(choices=["None"] + vctk_speakers, label="Speaker ID (for VCTK)")

    custom_model_box = gr.Textbox(label="Custom Model URL or Path (optional)")
    # type="filepath" so synthesize() receives a path string, not raw audio data.
    speaker_wav = gr.Audio(label="Upload Speaker Voice (WAV, 5–10s)", type="filepath")

    with gr.Row():
        generate_btn = gr.Button("🔊 Generate Speech")

    output_audio = gr.Audio(label="Output Audio", type="filepath")
    metadata_json = gr.JSON(label="Meta Info (Time, Model, RTF, Language / Error)")

    # Input order must match synthesize()'s parameter order; outputs match its
    # (audio_path, metadata_dict) return tuple.
    generate_btn.click(
        fn=synthesize,
        inputs=[input_text, model_dropdown, speaker_dropdown, custom_model_box, speaker_wav, language_dropdown],
        outputs=[output_audio, metadata_json]
    )

    gr.Markdown("### 🔌 API Access Available")

# API Interface: a second, form-free front end over the same handler.
# NOTE(review): `api` is queued below but never launched or mounted — only
# `demo.launch()` runs, so this Interface appears unreachable as written.
# Presumably the intent was to expose it via demo's built-in API; confirm.
api = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(), # text
        gr.Text(), # model
        gr.Text(), # speaker id
        gr.Text(), # custom model url
        gr.Audio(type="filepath"), # speaker wav
        gr.Text() # language
    ],
    outputs=[gr.Audio(type="filepath"), gr.JSON()],
)

# Launch both
demo.queue()
api.queue()
demo.launch()