Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
- app.py +10 -4
- pyproject.toml +1 -1
- src/f5_tts/infer/SHARED.md +11 -0
- src/f5_tts/infer/utils_infer.py +31 -27
app.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 
 import gc
 import json
+import os
 import re
 import tempfile
 from collections import OrderedDict
@@ -41,6 +42,7 @@ from f5_tts.infer.utils_infer import (
     preprocess_ref_audio_text,
     remove_silence_for_generated_wav,
     save_spectrogram,
+    tempfile_kwargs,
 )
 from f5_tts.model import DiT, UNetT
 
@@ -189,16 +191,20 @@ def infer(
 
     # Remove silence
     if remove_silence:
-        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
-            sf.write(f.name, final_wave, final_sample_rate)
+        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+            temp_path = f.name
+        try:
+            sf.write(temp_path, final_wave, final_sample_rate)
             remove_silence_for_generated_wav(f.name)
             final_wave, _ = torchaudio.load(f.name)
+        finally:
+            os.unlink(temp_path)
         final_wave = final_wave.squeeze().cpu().numpy()
 
     # Save the spectrogram
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+    with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
         spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(combined_spectrogram, spectrogram_path)
+    save_spectrogram(combined_spectrogram, spectrogram_path)
 
     return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
 
```
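A note on the pattern these app.py changes adopt: `NamedTemporaryFile` keeps the file open, and on Windows a file held open this way generally cannot be reopened by name by other libraries (`soundfile`, `torchaudio`) until the handle is closed. The rewritten block therefore uses the `with` statement only to reserve a unique path, exits it to release the handle, and hands the bare path to the downstream calls, cleaning up in a `finally`. Below is a minimal self-contained sketch of that pattern, assuming the same `tempfile_kwargs` gate the diff imports from utils_infer.py; `with_temp_wav` and the no-op processor are hypothetical names for illustration only:

```python
import os
import sys
import tempfile

# Same version gate as utils_infer.tempfile_kwargs:
#  - Python >= 3.12: delete_on_close=False, so exiting the with-block closes
#    the handle and deletes the still-empty file, leaving a free path behind.
#  - Python < 3.12: delete=False, so the empty file persists past the block.
# Either way the handle is closed, so other code can reopen the path by name.
tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}


def with_temp_wav(process):
    """Reserve a temp .wav path, hand it to `process`, then clean up."""
    with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
        temp_path = f.name  # handle released when the with-block exits
    try:
        process(temp_path)  # must (re)create the file at temp_path
    finally:
        os.unlink(temp_path)  # auto-delete was opted out of, so remove explicitly


# Hypothetical processor standing in for sf.write + remove_silence_for_generated_wav.
with_temp_wav(lambda path: open(path, "wb").close())
```

The `try`/`finally` matters because the code opts out of automatic deletion: without it, the temp file would leak whenever `sf.write` or the silence removal raises.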
pyproject.toml
CHANGED
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.4"
+version = "1.1.5"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
```
src/f5_tts/infer/SHARED.md
CHANGED
```diff
@@ -33,6 +33,8 @@
 - [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
 - [Spanish](#spanish)
   - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
+- [German](#german)
+  - [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
 
 
 ## Multilingual
@@ -173,3 +175,12 @@ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
 |F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
 
 - @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
+
+## German
+
+#### F5-TTS Base @ de @ hvoss-techfak
+|Model|🤗Hugging Face|Data (Hours)|Model License|
+|:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours Crowdsourced|cc-by-nc-4.0|
+
+- Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)
```
src/f5_tts/infer/utils_infer.py
CHANGED
```diff
@@ -45,6 +45,8 @@ device = (
     else "cpu"
 )
 
+tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}
+
 # -----------------------------------------
 
 target_sample_rate = 24000
@@ -306,42 +308,44 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
         ref_audio = _ref_audio_cache[audio_hash]
 
     else:  # first pass, do preprocess
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            aseg = AudioSegment.from_file(ref_audio_orig)
+        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+            temp_path = f.name
+
+        aseg = AudioSegment.from_file(ref_audio_orig)
 
-            # 1. try to find long silence for clipping
+        # 1. try to find long silence for clipping
+        non_silent_segs = silence.split_on_silence(
+            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+        )
+        non_silent_wave = AudioSegment.silent(duration=0)
+        for non_silent_seg in non_silent_segs:
+            if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
+                show_info("Audio is over 12s, clipping short. (1)")
+                break
+            non_silent_wave += non_silent_seg
+
+        # 2. try to find short silence for clipping if 1. failed
+        if len(non_silent_wave) > 12000:
             non_silent_segs = silence.split_on_silence(
-                aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+                aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
             )
             non_silent_wave = AudioSegment.silent(duration=0)
             for non_silent_seg in non_silent_segs:
                 if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                    show_info("Audio is over 12s, clipping short. (1)")
+                    show_info("Audio is over 12s, clipping short. (2)")
                     break
                 non_silent_wave += non_silent_seg
 
-            # 2. try to find short silence for clipping if 1. failed
-            if len(non_silent_wave) > 12000:
-                non_silent_segs = silence.split_on_silence(
-                    aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
-                )
-                non_silent_wave = AudioSegment.silent(duration=0)
-                for non_silent_seg in non_silent_segs:
-                    if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                        show_info("Audio is over 12s, clipping short. (2)")
-                        break
-                    non_silent_wave += non_silent_seg
-
-            aseg = non_silent_wave
-
-            # 3. if no proper silence found for clipping
-            if len(aseg) > 12000:
-                aseg = aseg[:12000]
-                show_info("Audio is over 12s, clipping short. (3)")
-
-            aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
-            aseg.export(f.name, format="wav")
-            ref_audio = f.name
+        aseg = non_silent_wave
+
+        # 3. if no proper silence found for clipping
+        if len(aseg) > 12000:
+            aseg = aseg[:12000]
+            show_info("Audio is over 12s, clipping short. (3)")
+
+        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
+        aseg.export(temp_path, format="wav")
+        ref_audio = temp_path
 
     # Cache the processed reference audio
     _ref_audio_cache[audio_hash] = ref_audio
```
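For reviewers unfamiliar with the Python 3.12 tempfile change: the two sides of the `tempfile_kwargs` gate behave slightly differently, but both leave a closed, reusable path behind, which is what lets `aseg.export(temp_path, format="wav")` run after the `with` block without a Windows sharing violation. A short illustrative sketch of that difference (output varies by interpreter; the cleanup at the end is for the demo only):

```python
import os
import sys
import tempfile

tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}

with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
    temp_path = f.name

# >= 3.12: the file was deleted at context exit (delete=True, delete_on_close=False),
# so only the reserved path name remains. < 3.12: the empty file is still on disk
# (delete=False). In both cases the handle is closed, which is the property
# preprocess_ref_audio_text relies on before exporting to temp_path.
print(sys.version_info[:2], "exists after with-block:", os.path.exists(temp_path))

if os.path.exists(temp_path):
    os.unlink(temp_path)  # demo cleanup; utils_infer keeps its file for the ref-audio cache
```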