File size: 3,260 Bytes
731923a
 
 
 
 
 
e1feaa3
731923a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Configuration for inference-cli.py

# --- Input Files and Text ---

# Local path or Hugging Face Hub URI (e.g., "hf://user/repo/model.safetensors") of the TTS model checkpoint.
# This is the primary required setting. The script infers model type (DiT/UNetT) from this path.
ckpt_path = "hf://Gregniuki/F5-tts_English_German_Polish/multi3/model_900000.pt" # Default used in script

# Path to the reference audio file (WAV, MP3, etc.). A clip shorter than 10 seconds is recommended.
ref_audio = "tests/ref_audio/test_en_1_ref_short.wav"

# Text transcription of the reference audio.
# If set to "", the script will attempt to transcribe ref_audio using Whisper.
ref_text = "Some call me nature, others call me mother nature."

# Text to be synthesized by the TTS model.
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."

# Optional: Path to a UTF-8 encoded text file containing the text to synthesize.
# If provided, this overrides the gen_text setting above.
gen_file = ""

# Path to the tokenizer.json file required by the model.
tokenizer_path = "data/Emilia_ZH_EN_pinyin/tokenizer.json" # Default used in script

# --- Output Settings ---

# Directory where the output audio (.wav) and spectrogram (.png) will be saved.
output_dir = "tests"

# Base name for the output files (e.g., "my_speech" -> my_speech.wav, my_speech.png).
output_name = "out" # Default: "out"

# --- Language Settings ---

# Language code for phonemizing the *reference* text (e.g., en-us, en-gb, de, pl, fr-fr).
# Needs to match the language spoken in ref_audio / ref_text. See phonemizer docs for codes.
ref_language = "en-us" # Default: "en-us"

# Language code for phonemizing the *generated* text (gen_text / gen_file).
# Needs to match the language you want the model to speak.
language = "en-us" # Default: "en-us"

# --- Inference Parameters ---

# Speech speed multiplier. Values above 1.0 speed speech up; values below 1.0 slow it down.
speed = 1.0 # Default: 1.0

# Number of Function Evaluations (sampling steps). Higher values may improve quality but increase time.
nfe = 32 # Default: 32

# Classifier-Free Guidance strength. Higher values increase adherence to reference timbre but can reduce naturalness.
cfg = 2.0 # Default: 2.0

# Sway sampling coefficient (experimental). Often -1.0 or disabled.
sway = -1.0 # Default: -1.0

# --- Postprocessing ---

# Duration (in seconds) for cross-fading between generated audio batches. 0 disables cross-fading.
cross_fade = 0.15 # Default: 0.15

# If true, remove silence from the final generated audio using pydub.
remove_silence = false # Default: false

# --- System Settings ---

# Optional: Hugging Face API token, used to download private models or to obtain higher rate limits.
# Can also be set via environment variable HUGGING_FACE_HUB_TOKEN.
hf_token = "" # Default: "" (uses cached credentials or public access)

# Optional: Specify the device ('cuda', 'cpu', 'mps'). If commented out or empty, defaults to auto-detection.
# device = "cuda"

# Optional: Specify the data type ('float16', 'bfloat16', 'float32'). If commented out or empty, a default is chosen based on the device.
# dtype = "float16"