anuj-exe committed on
Commit
0d474f0
·
verified ·
1 Parent(s): fed065a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -90
app.py CHANGED
@@ -2,69 +2,27 @@ import gradio as gr
2
  from TTS.api import TTS
3
  import time
4
 
5
- # Available models
6
- default_models = {
7
- "FastPitch (Female - LJSpeech)": "tts_models/en/ljspeech/fast_pitch",
8
- "Glow-TTS (Female - LJSpeech)": "tts_models/en/ljspeech/glow-tts",
9
- "Tactron2 (Female - LJSpeaker)": "tts_models/en/ljspeech/tacotron2-DDC",
10
- "VCTK (Multi-speaker)": "tts_models/en/vctk/vits",
11
- "YourTTS (Cloning + Multi-speaker)": "tts_models/multilingual/multi-dataset/your_tts",
12
- }
13
 
14
- # Supported speaker IDs for VCTK
15
- vctk_speakers = ["p225", "p227", "p229", "p230", "p233", "p234", "p236"]
16
-
17
- # Language display name -> model language code
18
- language_map = {
19
- "English": "en",
20
- "French": "fr-fr",
21
- "Portuguese": "pt-br",
22
- "Hindi": "hi", # Not supported in YourTTS
23
- "Japanese": "ja" # Not supported in YourTTS
24
- }
25
-
26
- # Supported languages for YourTTS
27
- yourtts_supported_languages = ["en", "fr-fr", "pt-br"]
28
-
29
- # Initial model setup
30
- current_model_key = list(default_models.values())[0]
31
- tts = TTS(current_model_key, gpu=False)
32
-
33
- def synthesize(text, selected_model, speaker_id, custom_model_url, speaker_wav_path, selected_language):
34
- global tts, current_model_key
35
-
36
- model_path = custom_model_url if custom_model_url else default_models[selected_model]
37
-
38
- # Load the model only if different from current
39
- if model_path != current_model_key:
40
- tts = TTS(model_path, gpu=False)
41
- current_model_key = model_path
42
 
 
43
  output_path = "output.wav"
44
  start_time = time.time()
45
 
46
- lang_code = language_map.get(selected_language, "en")
47
- speaker_info = "Default"
48
 
49
  try:
50
- if "your_tts" in model_path.lower():
51
- if lang_code not in yourtts_supported_languages:
52
- raise ValueError(f"❌ '{selected_language}' is not supported by YourTTS. Please choose from English, French, or Portuguese.")
53
-
54
- if not speaker_wav_path:
55
- raise ValueError("❌ Speaker WAV file is required for cloning with YourTTS.")
56
-
57
- tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, file_path=output_path, language=lang_code)
58
- speaker_info = f"WAV Upload: {speaker_wav_path.split('/')[-1]}"
59
-
60
- elif "vctk" in model_path.lower() and speaker_id and speaker_id != "None":
61
- tts.tts_to_file(text=text, speaker=speaker_id, file_path=output_path)
62
- speaker_info = speaker_id
63
-
64
- else:
65
- tts.tts_to_file(text=text, file_path=output_path)
66
-
67
- except ValueError as e:
68
  return None, {"error": str(e)}
69
 
70
  total_time = time.time() - start_time
@@ -72,63 +30,50 @@ def synthesize(text, selected_model, speaker_id, custom_model_url, speaker_wav_p
72
  rtf = round(total_time / est_duration, 3)
73
 
74
  return output_path, {
75
- "language_selected": selected_language,
76
  "processing_time_sec": round(total_time, 3),
77
  "real_time_factor": rtf,
78
- "model_used": model_path,
79
- "speaker_used": speaker_info
80
  }
81
 
82
  # Gradio UI
83
  with gr.Blocks() as demo:
84
- gr.Markdown("## πŸ—£οΈ TTS App (Model, Speaker, Language, Cloning, API-ready)")
85
 
86
- with gr.Row():
87
- input_text = gr.Textbox(label="Text", placeholder="Type something...", lines=3)
88
-
89
- with gr.Row():
90
- language_dropdown = gr.Dropdown(
91
- choices=list(language_map.keys()),
92
- value="English",
93
- label="Select Language"
94
- )
95
-
96
- with gr.Row():
97
- model_dropdown = gr.Dropdown(choices=list(default_models.keys()), label="Select TTS Model")
98
- speaker_dropdown = gr.Dropdown(choices=["None"] + vctk_speakers, label="Speaker ID (for VCTK)")
99
 
100
- custom_model_box = gr.Textbox(label="Custom Model URL or Path (optional)")
101
- speaker_wav = gr.Audio(label="Upload Speaker Voice (WAV, 5–10s)", type="filepath")
 
 
102
 
103
- with gr.Row():
104
- generate_btn = gr.Button("πŸ”Š Generate Speech")
105
 
106
  output_audio = gr.Audio(label="Output Audio", type="filepath")
107
- metadata_json = gr.JSON(label="Meta Info (Time, Model, RTF, Language / Error)")
108
 
109
  generate_btn.click(
110
  fn=synthesize,
111
- inputs=[input_text, model_dropdown, speaker_dropdown, custom_model_box, speaker_wav, language_dropdown],
112
  outputs=[output_audio, metadata_json]
113
  )
114
 
115
- gr.Markdown("### πŸ”Œ API Access Available")
116
-
117
- # API Interface
118
  api = gr.Interface(
119
  fn=synthesize,
120
  inputs=[
121
- gr.Text(), # text
122
- gr.Text(), # model
123
- gr.Text(), # speaker id
124
- gr.Text(), # custom model url
125
- gr.Audio(type="filepath"), # speaker wav
126
- gr.Text() # language
127
  ],
128
  outputs=[gr.Audio(type="filepath"), gr.JSON()],
129
  )
130
 
131
- # Launch both
132
  demo.queue()
133
  api.queue()
134
- demo.launch()
 
2
  from TTS.api import TTS
3
  import time
4
 
5
+ # Fixed model (YourTTS in English)
6
+ YOURTTS_MODEL = "tts_models/multilingual/multi-dataset/your_tts"
 
 
 
 
 
 
7
 
8
+ # Initialize model once
9
+ tts = TTS(YOURTTS_MODEL, gpu=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ def synthesize(text, speaker_wav_path):
12
  output_path = "output.wav"
13
  start_time = time.time()
14
 
15
+ if not speaker_wav_path:
16
+ return None, {"error": "❌ Please upload a speaker WAV file for cloning."}
17
 
18
  try:
19
+ tts.tts_to_file(
20
+ text=text,
21
+ speaker_wav=speaker_wav_path,
22
+ file_path=output_path,
23
+ language="en"
24
+ )
25
+ except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
26
  return None, {"error": str(e)}
27
 
28
  total_time = time.time() - start_time
 
30
  rtf = round(total_time / est_duration, 3)
31
 
32
  return output_path, {
33
+ "language": "English",
34
  "processing_time_sec": round(total_time, 3),
35
  "real_time_factor": rtf,
36
+ "model_used": YOURTTS_MODEL,
37
+ "speaker_used": speaker_wav_path.split("/")[-1]
38
  }
39
 
40
  # Gradio UI
41
  with gr.Blocks() as demo:
42
+ gr.Markdown("## πŸ—£οΈ YourTTS Voice Cloning (English Only)")
43
 
44
+ input_text = gr.Textbox(
45
+ label="Text",
46
+ placeholder="Type something to synthesize...",
47
+ lines=3
48
+ )
 
 
 
 
 
 
 
 
49
 
50
+ speaker_wav = gr.Audio(
51
+ label="Upload Speaker Voice (WAV, 5–10s)",
52
+ type="filepath"
53
+ )
54
 
55
+ generate_btn = gr.Button("πŸ”Š Generate Speech")
 
56
 
57
  output_audio = gr.Audio(label="Output Audio", type="filepath")
58
+ metadata_json = gr.JSON(label="Meta Info (Time, Model, RTF, etc.)")
59
 
60
  generate_btn.click(
61
  fn=synthesize,
62
+ inputs=[input_text, speaker_wav],
63
  outputs=[output_audio, metadata_json]
64
  )
65
 
66
+ # API interface (English only)
 
 
67
  api = gr.Interface(
68
  fn=synthesize,
69
  inputs=[
70
+ gr.Text(), # text
71
+ gr.Audio(type="filepath") # speaker wav
 
 
 
 
72
  ],
73
  outputs=[gr.Audio(type="filepath"), gr.JSON()],
74
  )
75
 
76
+ # Launch app
77
  demo.queue()
78
  api.queue()
79
+ demo.launch()