Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
- app.py +10 -4
- pyproject.toml +1 -1
- src/f5_tts/infer/SHARED.md +11 -0
- src/f5_tts/infer/utils_infer.py +31 -27
app.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 
 import gc
 import json
+import os
 import re
 import tempfile
 from collections import OrderedDict
@@ -41,6 +42,7 @@ from f5_tts.infer.utils_infer import (
     preprocess_ref_audio_text,
     remove_silence_for_generated_wav,
     save_spectrogram,
+    tempfile_kwargs,
 )
 from f5_tts.model import DiT, UNetT
 
@@ -189,16 +191,20 @@ def infer(
 
     # Remove silence
     if remove_silence:
-        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
-            sf.write(f.name, final_wave, final_sample_rate)
+        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+            temp_path = f.name
+        try:
+            sf.write(temp_path, final_wave, final_sample_rate)
             remove_silence_for_generated_wav(f.name)
             final_wave, _ = torchaudio.load(f.name)
+        finally:
+            os.unlink(temp_path)
         final_wave = final_wave.squeeze().cpu().numpy()
 
     # Save the spectrogram
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+    with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
         spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(combined_spectrogram, spectrogram_path)
+    save_spectrogram(combined_spectrogram, spectrogram_path)
 
     return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
 
```
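A note on the pattern these app.py changes adopt: `NamedTemporaryFile` keeps the file open, and on Windows a file held open this way generally cannot be reopened by name by other libraries (`soundfile`, `torchaudio`) until the handle is closed. The rewritten block therefore uses the `with` statement only to reserve a unique path, exits it to release the handle, and hands the bare path to the downstream calls, cleaning up in a `finally`. Below is a minimal self-contained sketch of that pattern, assuming the same `tempfile_kwargs` gate the diff imports from utils_infer.py; `with_temp_wav` and the no-op processor are hypothetical names for illustration only:

```python
import os
import sys
import tempfile

# Same version gate as utils_infer.tempfile_kwargs:
#  - Python >= 3.12: delete_on_close=False, so exiting the with-block closes
#    the handle and deletes the still-empty file, leaving a free path behind.
#  - Python < 3.12: delete=False, so the empty file persists past the block.
# Either way the handle is closed, so other code can reopen the path by name.
tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}


def with_temp_wav(process):
    """Reserve a temp .wav path, hand it to `process`, then clean up."""
    with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
        temp_path = f.name  # handle released when the with-block exits
    try:
        process(temp_path)  # must (re)create the file at temp_path
    finally:
        os.unlink(temp_path)  # auto-delete was opted out of, so remove explicitly


# Hypothetical processor standing in for sf.write + remove_silence_for_generated_wav.
with_temp_wav(lambda path: open(path, "wb").close())
```

The `try`/`finally` matters because the code opts out of automatic deletion: without it, the temp file would leak whenever `sf.write` or the silence removal raises.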
pyproject.toml
CHANGED
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.4"
+version = "1.1.5"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
```
src/f5_tts/infer/SHARED.md
CHANGED
```diff
@@ -33,6 +33,8 @@
 - [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
 - [Spanish](#spanish)
   - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
+- [German](#german)
+  - [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
 
 
 ## Multilingual
@@ -173,3 +175,12 @@ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
 |F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
 
 - @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
+
+## German
+
+#### F5-TTS Base @ de @ hvoss-techfak
+|Model|🤗Hugging Face|Data (Hours)|Model License|
+|:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours Crowdsourced|cc-by-nc-4.0|
+
+- Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)
```
src/f5_tts/infer/utils_infer.py
CHANGED
```diff
@@ -45,6 +45,8 @@ device = (
     else "cpu"
 )
 
+tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}
+
 # -----------------------------------------
 
 target_sample_rate = 24000
@@ -306,42 +308,44 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
         ref_audio = _ref_audio_cache[audio_hash]
 
     else:  # first pass, do preprocess
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            aseg = AudioSegment.from_file(ref_audio_orig)
+        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+            temp_path = f.name
+
+        aseg = AudioSegment.from_file(ref_audio_orig)
 
-            # 1. try to find long silence for clipping
+        # 1. try to find long silence for clipping
+        non_silent_segs = silence.split_on_silence(
+            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+        )
+        non_silent_wave = AudioSegment.silent(duration=0)
+        for non_silent_seg in non_silent_segs:
+            if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
+                show_info("Audio is over 12s, clipping short. (1)")
+                break
+            non_silent_wave += non_silent_seg
+
+        # 2. try to find short silence for clipping if 1. failed
+        if len(non_silent_wave) > 12000:
             non_silent_segs = silence.split_on_silence(
-                aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+                aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
             )
             non_silent_wave = AudioSegment.silent(duration=0)
             for non_silent_seg in non_silent_segs:
                 if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                    show_info("Audio is over 12s, clipping short. (1)")
+                    show_info("Audio is over 12s, clipping short. (2)")
                     break
                 non_silent_wave += non_silent_seg
 
-            # 2. try to find short silence for clipping if 1. failed
-            if len(non_silent_wave) > 12000:
-                non_silent_segs = silence.split_on_silence(
-                    aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
-                )
-                non_silent_wave = AudioSegment.silent(duration=0)
-                for non_silent_seg in non_silent_segs:
-                    if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                        show_info("Audio is over 12s, clipping short. (2)")
-                        break
-                    non_silent_wave += non_silent_seg
-
-            aseg = non_silent_wave
-
-            # 3. if no proper silence found for clipping
-            if len(aseg) > 12000:
-                aseg = aseg[:12000]
-                show_info("Audio is over 12s, clipping short. (3)")
-
-            aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
-            aseg.export(f.name, format="wav")
-            ref_audio = f.name
+        aseg = non_silent_wave
+
+        # 3. if no proper silence found for clipping
+        if len(aseg) > 12000:
+            aseg = aseg[:12000]
+            show_info("Audio is over 12s, clipping short. (3)")
+
+        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
+        aseg.export(temp_path, format="wav")
+        ref_audio = temp_path
 
     # Cache the processed reference audio
     _ref_audio_cache[audio_hash] = ref_audio
```
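For reviewers unfamiliar with the Python 3.12 tempfile change: the two sides of the `tempfile_kwargs` gate behave slightly differently, but both leave a closed, reusable path behind, which is what lets `aseg.export(temp_path, format="wav")` run after the `with` block without a Windows sharing violation. A short illustrative sketch of that difference (output varies by interpreter; the cleanup at the end is for the demo only):

```python
import os
import sys
import tempfile

tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}

with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
    temp_path = f.name

# >= 3.12: the file was deleted at context exit (delete=True, delete_on_close=False),
# so only the reserved path name remains. < 3.12: the empty file is still on disk
# (delete=False). In both cases the handle is closed, which is the property
# preprocess_ref_audio_text relies on before exporting to temp_path.
print(sys.version_info[:2], "exists after with-block:", os.path.exists(temp_path))

if os.path.exists(temp_path):
    os.unlink(temp_path)  # demo cleanup; utils_infer keeps its file for the ref-audio cache
```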