Gregniuki commited on
Commit
731923a
·
verified ·
1 Parent(s): c91fbe6

Create inference_cli.toml

Browse files
Files changed (1) hide show
  1. inference_cli.toml +76 -0
inference_cli.toml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for inference-cli.py
2
+
3
+ # --- Input Files and Text ---
4
+
5
+ # Path or Hugging Face Hub ID (e.g., "hf://user/repo/model.safetensors") to the TTS model checkpoint.
6
+ # This is the primary required setting. The script infers model type (DiT/UNetT) from this path.
7
+ ckpt_path = "hf://Gregniuki/F5-tts_English_German_Polish/multi3/model_1100000.pt" # Default used in script
8
+
9
+ # Path to the reference audio file (WAV, MP3, etc.). Recommended < 10 seconds.
10
+ ref_audio = "tests/ref_audio/test_en_1_ref_short.wav"
11
+
12
+ # Text transcription of the reference audio.
13
+ # If set to "", the script will attempt to transcribe ref_audio using Whisper.
14
+ ref_text = "Some call me nature, others call me mother nature."
15
+
16
+ # Text to be synthesized by the TTS model.
17
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
18
+
19
+ # Optional: Path to a UTF-8 encoded text file containing the text to synthesize.
20
+ # If provided, this overrides the gen_text setting above.
21
+ gen_file = ""
22
+
23
+ # Path to the tokenizer.json file required by the model.
24
+ tokenizer_path = "data/Emilia_ZH_EN_pinyin/tokenizer.json" # Default used in script
25
+
26
+ # --- Output Settings ---
27
+
28
+ # Directory where the output audio (.wav) and spectrogram (.png) will be saved.
29
+ output_dir = "tests"
30
+
31
+ # Base name for the output files (e.g., "my_speech" -> my_speech.wav, my_speech.png).
32
+ output_name = "out" # Default: "out"
33
+
34
+ # --- Language Settings ---
35
+
36
+ # Language code for phonemizing the *reference* text (e.g., en-us, en-gb, de, pl, fr-fr).
37
+ # Needs to match the language spoken in ref_audio / ref_text. See phonemizer docs for codes.
38
+ ref_language = "en-us" # Default: "en-us"
39
+
40
+ # Language code for phonemizing the *generated* text (gen_text / gen_file).
41
+ # Needs to match the language you want the model to speak.
42
+ language = "en-us" # Default: "en-us"
43
+
44
+ # --- Inference Parameters ---
45
+
46
+ # Speech speed multiplier. > 1.0 is faster, < 1.0 is slower.
47
+ speed = 1.0 # Default: 1.0
48
+
49
+ # Number of Function Evaluations (sampling steps). Higher values may improve quality but increase time.
50
+ nfe = 32 # Default: 32
51
+
52
+ # Classifier-Free Guidance strength. Higher values increase adherence to reference timbre but can reduce naturalness.
53
+ cfg = 2.0 # Default: 2.0
54
+
55
+ # Sway sampling coefficient (experimental). Often -1.0 or disabled.
56
+ sway = -1.0 # Default: -1.0
57
+
58
+ # --- Postprocessing ---
59
+
60
+ # Duration (in seconds) for cross-fading between generated audio batches. 0 disables cross-fading.
61
+ cross_fade = 0.15 # Default: 0.15
62
+
63
+ # Apply silence removal to the final generated audio using pydub.
64
+ remove_silence = false # Default: false
65
+
66
+ # --- System Settings ---
67
+
68
+ # Optional: Hugging Face API token for downloading private models or avoiding rate limits.
69
+ # Can also be set via environment variable HUGGING_FACE_HUB_TOKEN.
70
+ hf_token = "" # Default: "" (uses cached credentials or public access)
71
+
72
+ # Optional: Specify the device ('cuda', 'cpu', 'mps'). If commented out or empty, defaults to auto-detection.
73
+ # device = "cuda"
74
+
75
+ # Optional: Specify the data type ('float16', 'bfloat16', 'float32'). If commented out or empty, defaults based on device.
76
+ # dtype = "float16"