Spaces:
Running
on
Zero
Running
on
Zero
Create inference_cli.toml
Browse files- inference_cli.toml +76 -0
inference_cli.toml
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for inference-cli.py
|
| 2 |
+
|
| 3 |
+
# --- Input Files and Text ---
|
| 4 |
+
|
| 5 |
+
# Path or Hugging Face Hub ID (e.g., "hf://user/repo/model.safetensors") to the TTS model checkpoint.
|
| 6 |
+
# This is the primary required setting. The script infers model type (DiT/UNetT) from this path.
|
| 7 |
+
ckpt_path = "hf://Gregniuki/F5-tts_English_German_Polish/multi3/model_1100000.pt" # Default used in script
|
| 8 |
+
|
| 9 |
+
# Path to the reference audio file (WAV, MP3, etc.). Recommended < 10 seconds.
|
| 10 |
+
ref_audio = "tests/ref_audio/test_en_1_ref_short.wav"
|
| 11 |
+
|
| 12 |
+
# Text transcription of the reference audio.
|
| 13 |
+
# If set to "", the script will attempt to transcribe ref_audio using Whisper.
|
| 14 |
+
ref_text = "Some call me nature, others call me mother nature."
|
| 15 |
+
|
| 16 |
+
# Text to be synthesized by the TTS model.
|
| 17 |
+
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
|
| 18 |
+
|
| 19 |
+
# Optional: Path to a UTF-8 encoded text file containing the text to synthesize.
|
| 20 |
+
# If provided, this overrides the gen_text setting above.
|
| 21 |
+
gen_file = ""
|
| 22 |
+
|
| 23 |
+
# Path to the tokenizer.json file required by the model.
|
| 24 |
+
tokenizer_path = "data/Emilia_ZH_EN_pinyin/tokenizer.json" # Default used in script
|
| 25 |
+
|
| 26 |
+
# --- Output Settings ---
|
| 27 |
+
|
| 28 |
+
# Directory where the output audio (.wav) and spectrogram (.png) will be saved.
|
| 29 |
+
output_dir = "tests"
|
| 30 |
+
|
| 31 |
+
# Base name for the output files (e.g., "my_speech" -> my_speech.wav, my_speech.png).
|
| 32 |
+
output_name = "out" # Default: "out"
|
| 33 |
+
|
| 34 |
+
# --- Language Settings ---
|
| 35 |
+
|
| 36 |
+
# Language code for phonemizing the *reference* text (e.g., en-us, en-gb, de, pl, fr-fr).
|
| 37 |
+
# Needs to match the language spoken in ref_audio / ref_text. See phonemizer docs for codes.
|
| 38 |
+
ref_language = "en-us" # Default: "en-us"
|
| 39 |
+
|
| 40 |
+
# Language code for phonemizing the *generated* text (gen_text / gen_file).
|
| 41 |
+
# Needs to match the language you want the model to speak.
|
| 42 |
+
language = "en-us" # Default: "en-us"
|
| 43 |
+
|
| 44 |
+
# --- Inference Parameters ---
|
| 45 |
+
|
| 46 |
+
# Speech speed multiplier. > 1.0 is faster, < 1.0 is slower.
|
| 47 |
+
speed = 1.0 # Default: 1.0
|
| 48 |
+
|
| 49 |
+
# Number of Function Evaluations (sampling steps). Higher values may improve quality but increase time.
|
| 50 |
+
nfe = 32 # Default: 32
|
| 51 |
+
|
| 52 |
+
# Classifier-Free Guidance strength. Higher values increase adherence to reference timbre but can reduce naturalness.
|
| 53 |
+
cfg = 2.0 # Default: 2.0
|
| 54 |
+
|
| 55 |
+
# Sway sampling coefficient (experimental). Often -1.0 or disabled.
|
| 56 |
+
sway = -1.0 # Default: -1.0
|
| 57 |
+
|
| 58 |
+
# --- Postprocessing ---
|
| 59 |
+
|
| 60 |
+
# Duration (in seconds) for cross-fading between generated audio batches. 0 disables cross-fading.
|
| 61 |
+
cross_fade = 0.15 # Default: 0.15
|
| 62 |
+
|
| 63 |
+
# Apply silence removal to the final generated audio using pydub.
|
| 64 |
+
remove_silence = false # Default: false
|
| 65 |
+
|
| 66 |
+
# --- System Settings ---
|
| 67 |
+
|
| 68 |
+
# Optional: Hugging Face API token for downloading private models or avoiding rate limits.
|
| 69 |
+
# Can also be set via environment variable HUGGING_FACE_HUB_TOKEN.
|
| 70 |
+
hf_token = "" # Default: "" (uses cached credentials or public access)
|
| 71 |
+
|
| 72 |
+
# Optional: Specify the device ('cuda', 'cpu', 'mps'). If commented out or empty, defaults to auto-detection.
|
| 73 |
+
# device = "cuda"
|
| 74 |
+
|
| 75 |
+
# Optional: Specify the data type ('float16', 'bfloat16', 'float32'). If commented out or empty, a default is chosen based on the device.
|
| 76 |
+
# dtype = "float16"
|