John Meade committed on
Commit
f975abb
·
1 Parent(s): 3646fe5

add ref wav vad trimming option

Browse files
Files changed (2) hide show
  1. app.py +12 -8
  2. chatterbox/src/chatterbox/tts.py +5 -3
app.py CHANGED
@@ -49,13 +49,14 @@ def generate_tts_audio(
49
  exaggeration_input: float = 0.5,
50
  temperature_input: float = 0.8,
51
  seed_num_input: int = 0,
52
- cfgw_input: float = 0.5
 
53
  ) -> tuple[int, np.ndarray]:
54
  """
55
  Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
56
-
57
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
58
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
59
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
60
 
61
  Args:
@@ -78,17 +79,18 @@ def generate_tts_audio(
78
  set_seed(int(seed_num_input))
79
 
80
  print(f"Generating audio for text: '{text_input[:50]}...'")
81
-
82
  # Handle optional audio prompt
83
  generate_kwargs = {
84
  "exaggeration": exaggeration_input,
85
  "temperature": temperature_input,
86
  "cfg_weight": cfgw_input,
 
87
  }
88
-
89
  if audio_prompt_path_input:
90
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
91
-
92
  wav = current_model.generate(
93
  text_input[:300], # Truncate text to max chars
94
  **generate_kwargs
@@ -126,6 +128,7 @@ with gr.Blocks() as demo:
126
  with gr.Accordion("More options", open=False):
127
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
128
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
 
129
 
130
  run_btn = gr.Button("Generate", variant="primary")
131
 
@@ -141,8 +144,9 @@ with gr.Blocks() as demo:
141
  temp,
142
  seed_num,
143
  cfg_weight,
 
144
  ],
145
  outputs=[audio_output],
146
  )
147
 
148
- demo.launch(mcp_server=True)
 
49
  exaggeration_input: float = 0.5,
50
  temperature_input: float = 0.8,
51
  seed_num_input: int = 0,
52
+ cfgw_input: float = 0.5,
53
+ vad_trim_input: bool = False,
54
  ) -> tuple[int, np.ndarray]:
55
  """
56
  Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
57
+
58
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
59
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
60
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
61
 
62
  Args:
 
79
  set_seed(int(seed_num_input))
80
 
81
  print(f"Generating audio for text: '{text_input[:50]}...'")
82
+
83
  # Handle optional audio prompt
84
  generate_kwargs = {
85
  "exaggeration": exaggeration_input,
86
  "temperature": temperature_input,
87
  "cfg_weight": cfgw_input,
88
+ "vad_trim": vad_trim_input,
89
  }
90
+
91
  if audio_prompt_path_input:
92
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
93
+
94
  wav = current_model.generate(
95
  text_input[:300], # Truncate text to max chars
96
  **generate_kwargs
 
128
  with gr.Accordion("More options", open=False):
129
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
130
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
131
+ vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
132
 
133
  run_btn = gr.Button("Generate", variant="primary")
134
 
 
144
  temp,
145
  seed_num,
146
  cfg_weight,
147
+ vad_trim,
148
  ],
149
  outputs=[audio_output],
150
  )
151
 
152
+ demo.launch(mcp_server=True)
chatterbox/src/chatterbox/tts.py CHANGED
@@ -183,10 +183,11 @@ class ChatterboxTTS:
183
  # Trim out silence
184
  return wav[dilated_vad]
185
 
186
- def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
187
  # Load reference wav at high SR and trim silence
188
  ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
189
- ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
 
190
 
191
  # Resample down
192
  s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
@@ -219,9 +220,10 @@ class ChatterboxTTS:
219
  exaggeration=0.5,
220
  cfg_weight=0.5,
221
  temperature=0.8,
 
222
  ):
223
  if audio_prompt_path:
224
- self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
225
  else:
226
  assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
227
 
 
183
  # Trim out silence
184
  return wav[dilated_vad]
185
 
186
+ def prepare_conditionals(self, wav_fpath, exaggeration=0.5, vad_trim=False):
187
  # Load reference wav at high SR and trim silence
188
  ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
189
+ if vad_trim:
190
+ ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
191
 
192
  # Resample down
193
  s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
 
220
  exaggeration=0.5,
221
  cfg_weight=0.5,
222
  temperature=0.8,
223
+ vad_trim=False,
224
  ):
225
  if audio_prompt_path:
226
+ self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, vad_trim=vad_trim)
227
  else:
228
  assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
229