Samit-khedekar committed on
Commit
c30d3ee
·
verified ·
1 Parent(s): 7f3065e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -37
app.py CHANGED
@@ -1,57 +1,87 @@
1
  import gradio as gr
2
  from TTS.api import TTS
3
  import time
4
- import os
5
 
6
- # Available voice models
7
- models = {
8
- "fast_pitch": "tts_models/en/ljspeech/fast_pitch",
9
- "tacotron2": "tts_models/en/ljspeech/tacotron2-DDC",
10
- "glow_tts": "tts_models/en/ljspeech/glow-tts"
 
 
 
11
  }
12
 
13
- # Current model loaded (default)
14
- current_model_key = "fast_pitch"
15
- tts = TTS(models[current_model_key], gpu=False)
16
 
17
- # Synthesize function
18
- def synthesize(text, selected_model=None):
 
 
 
19
  global tts, current_model_key
20
 
21
- # Switch model if needed
22
- if selected_model and selected_model != current_model_key:
23
- current_model_key = selected_model
24
- tts = TTS(models[current_model_key], gpu=False)
 
 
 
 
 
25
 
26
  output_path = "output.wav"
27
  start_time = time.time()
28
- tts.tts_to_file(text=text, file_path=output_path)
29
- total_time = time.time() - start_time
30
 
31
- # Calculate RTF (approximate)
32
- audio_duration = len(text.split()) / 2.5 # est. 2.5 words/sec
33
- rtf = round(total_time / audio_duration, 3)
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  return output_path, {
36
  "processing_time_sec": round(total_time, 3),
37
  "real_time_factor": rtf,
38
- "model_used": current_model_key
 
39
  }
40
 
41
- # Gradio API
42
- api = gr.Interface(
43
- fn=synthesize,
44
- inputs=[
45
- gr.Textbox(label="Input Text"),
46
- gr.Dropdown(list(models.keys()), value="fast_pitch", label="Select Voice Model"),
47
- ],
48
- outputs=[
49
- gr.Audio(type="filepath", label="Synthesized Audio"),
50
- gr.JSON(label="Meta Info (Time, Model, RTF)")
51
- ],
52
- title="🗣️ TTS API with Model Selector",
53
- description="Send text to convert it to speech and get metadata via Gradio API. Change the voice model dynamically."
54
- )
55
-
56
- api.launch()
 
 
 
 
 
 
 
 
 
 
 
57
 
 
1
  import gradio as gr
2
  from TTS.api import TTS
3
  import time
 
4
 
5
+ # Available models
6
+ default_models = {
7
+ "FastPitch (Female - LJSpeech)": "tts_models/en/ljspeech/fast_pitch",
8
+ "Glow-TTS (Female - LJSpeech)": "tts_models/en/ljspeech/glow-tts",
9
+ "Tactron2 (Female- LJSpeaker)": "tts_models/en/ljspeech/tacotron2-DDC",
10
+ "VCTK (Multi-speaker)": "tts_models/en/vctk/vits",
11
+ "YourTTS (Cloning + Multi-speaker)": "tts_models/multilingual/multi-dataset/your_tts",
12
+
13
  }
14
 
15
+ # Example speaker IDs (VCTK)
16
+ vctk_speakers = ["p225", "p227", "p229", "p230", "p233", "p234", "p236"]
 
17
 
18
+ # Default state
19
+ current_model_key = list(default_models.values())[0]
20
+ tts = TTS(current_model_key, gpu=False)
21
+
22
+ def synthesize(text, selected_model, speaker_id, custom_model_url, speaker_wav_path):
23
  global tts, current_model_key
24
 
25
+ # Decide model
26
+ if custom_model_url:
27
+ model_path = custom_model_url
28
+ else:
29
+ model_path = default_models[selected_model]
30
+
31
+ if model_path != current_model_key:
32
+ tts = TTS(model_path, gpu=False)
33
+ current_model_key = model_path
34
 
35
  output_path = "output.wav"
36
  start_time = time.time()
 
 
37
 
38
+ # Handle speaker cloning
39
+ if "your_tts" in model_path.lower() and speaker_wav_path:
40
+ tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, file_path=output_path)
41
+ speaker_info = f"WAV Upload: {speaker_wav_path.split('/')[-1]}"
42
+ elif "vctk" in model_path.lower() and speaker_id and speaker_id != "None":
43
+ tts.tts_to_file(text=text, speaker=speaker_id, file_path=output_path)
44
+ speaker_info = speaker_id
45
+ else:
46
+ tts.tts_to_file(text=text, file_path=output_path)
47
+ speaker_info = "Default"
48
+
49
+ total_time = time.time() - start_time
50
+ est_duration = len(text.split()) / 2.5
51
+ rtf = round(total_time / est_duration, 3)
52
 
53
  return output_path, {
54
  "processing_time_sec": round(total_time, 3),
55
  "real_time_factor": rtf,
56
+ "model_used": model_path,
57
+ "speaker_used": speaker_info
58
  }
59
 
60
+ # Gradio UI
61
+ with gr.Blocks() as demo:
62
+ gr.Markdown("## 🗣️ TTS App with Model + Speaker Selection + Cloning")
63
+
64
+ with gr.Row():
65
+ input_text = gr.Textbox(label="Text", placeholder="Type something...", lines=3)
66
+
67
+ with gr.Row():
68
+ model_dropdown = gr.Dropdown(choices=list(default_models.keys()), label="Select TTS Model")
69
+ speaker_dropdown = gr.Dropdown(choices=["None"] + vctk_speakers, label="Speaker ID (for VCTK)")
70
+
71
+ custom_model_box = gr.Textbox(label="Custom Model URL or Path (optional)")
72
+ speaker_wav = gr.Audio(label="Upload Speaker Voice (WAV, 5–10s)", type="filepath")
73
+
74
+ with gr.Row():
75
+ generate_btn = gr.Button("🔊 Generate Speech")
76
+
77
+ output_audio = gr.Audio(label="Output Audio", type="filepath")
78
+ metadata_json = gr.JSON(label="Meta Info (Time, Model, RTF)")
79
+
80
+ generate_btn.click(
81
+ fn=synthesize,
82
+ inputs=[input_text, model_dropdown, speaker_dropdown, custom_model_box, speaker_wav],
83
+ outputs=[output_audio, metadata_json]
84
+ )
85
+
86
+ demo.launch()
87