Spaces:

ResembleAI
/

Chatterbox

Running on Zero

App Files Files Community

John Meade committed 10 days ago

Commit

f975abb

1 Parent(s): 3646fe5

Add option to trim silence from the reference wav using VAD

Browse files

Files changed (2) hide show

app.py +12 -8
chatterbox/src/chatterbox/tts.py +5 -3

app.py CHANGED Viewed

@@ -49,13 +49,14 @@ def generate_tts_audio(
     exaggeration_input: float = 0.5,
     temperature_input: float = 0.8,
     seed_num_input: int = 0,
-    cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
     """
     Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
-    This tool synthesizes natural-sounding speech from input text. When a reference audio file
-    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
     maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
@@ -78,17 +79,18 @@ def generate_tts_audio(
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
     # Handle optional audio prompt
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
     }
     if audio_prompt_path_input:
         generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
         **generate_kwargs
@@ -126,6 +128,7 @@ with gr.Blocks() as demo:
             with gr.Accordion("More options", open=False):
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
             run_btn = gr.Button("Generate", variant="primary")
@@ -141,8 +144,9 @@ with gr.Blocks() as demo:
             temp,
             seed_num,
             cfg_weight,
         ],
         outputs=[audio_output],
     )
-demo.launch(mcp_server=True)

     exaggeration_input: float = 0.5,
     temperature_input: float = 0.8,
     seed_num_input: int = 0,
+    cfgw_input: float = 0.5,
+    vad_trim_input: bool = False,
 ) -> tuple[int, np.ndarray]:
     """
     Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
+    This tool synthesizes natural-sounding speech from input text. When a reference audio file
+    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
     maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
     # Handle optional audio prompt
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
+        "vad_trim": vad_trim_input,
     }
     if audio_prompt_path_input:
         generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
         **generate_kwargs
             with gr.Accordion("More options", open=False):
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
+                vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
             run_btn = gr.Button("Generate", variant="primary")
             temp,
             seed_num,
             cfg_weight,
+            vad_trim,
         ],
         outputs=[audio_output],
     )
+demo.launch(mcp_server=True)

chatterbox/src/chatterbox/tts.py CHANGED Viewed

@@ -183,10 +183,11 @@ class ChatterboxTTS:
         # Trim out silence
         return wav[dilated_vad]
-    def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
         # Load reference wav at high SR and trim silence
         ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
-        ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
         # Resample down
         s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
@@ -219,9 +220,10 @@ class ChatterboxTTS:
         exaggeration=0.5,
         cfg_weight=0.5,
         temperature=0.8,
     ):
         if audio_prompt_path:
-            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
         else:
             assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"

         # Trim out silence
         return wav[dilated_vad]
+    def prepare_conditionals(self, wav_fpath, exaggeration=0.5, vad_trim=False):
         # Load reference wav at high SR and trim silence
         ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
+        if vad_trim:
+            ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
         # Resample down
         s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
         exaggeration=0.5,
         cfg_weight=0.5,
         temperature=0.8,
+        vad_trim=False,
     ):
         if audio_prompt_path:
+            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, vad_trim=vad_trim)
         else:
             assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"