Spaces:
Sleeping
Sleeping
| { | |
| "base_config": "config/tts.json", | |
| "model_type": "FastSpeech2", | |
| "task_type": "tts", | |
| "dataset": ["LJSpeech"], | |
| "preprocess": { | |
| // acoustic features | |
| "extract_audio": true, | |
| "extract_mel": true, | |
| "mel_extract_mode": "taco", | |
| "mel_min_max_norm": false, | |
| "extract_pitch": true, | |
| "extract_uv": false, | |
| "pitch_extractor": "dio", | |
| "extract_energy": true, | |
| "energy_extract_mode": "from_tacotron_stft", | |
| "extract_duration": true, | |
| "use_phone": true, | |
| "pitch_norm": true, | |
| "energy_norm": true, | |
| "pitch_remove_outlier": true, | |
| "energy_remove_outlier": true, | |
| // Default config | |
| "n_mel": 80, | |
| "win_size": 1024, // todo | |
| "hop_size": 256, | |
| "sample_rate": 22050, | |
| "n_fft": 1024, // todo | |
| "fmin": 0, | |
| "fmax": 8000, // todo | |
| "raw_data": "raw_data", | |
| "text_cleaners": ["english_cleaners"], | |
| "f0_min": 71, // ~C2 | |
| "f0_max": 800, //1100, // ~C6(1100), ~G5(800) | |
| "pitch_bin": 256, | |
| "pitch_max": 1100.0, | |
| "pitch_min": 50.0, | |
| "is_label": true, | |
| "is_mu_law": true, | |
| "bits": 8, | |
| "mel_min_max_stats_dir": "mel_min_max_stats", | |
| "whisper_dir": "whisper", | |
| "content_vector_dir": "content_vector", | |
| "wenet_dir": "wenet", | |
| "mert_dir": "mert", | |
| "spk2id":"spk2id.json", | |
| "utt2spk":"utt2spk", | |
| // Features used for model training | |
| "use_mel": true, | |
| "use_min_max_norm_mel": false, | |
| "use_frame_pitch": false, | |
| "use_frame_energy": false, | |
| "use_phone_pitch": true, | |
| "use_phone_energy": true, | |
| "use_log_scale_pitch": false, | |
| "use_log_scale_energy": false, | |
| "use_spkid": false, | |
| "align_mel_duration": true, | |
| "text_cleaners": ["english_cleaners"], | |
| "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" | |
| }, | |
| "model": { | |
| // Settings for transformer | |
| "transformer": { | |
| "encoder_layer": 4, | |
| "encoder_head": 2, | |
| "encoder_hidden": 256, | |
| "decoder_layer": 6, | |
| "decoder_head": 2, | |
| "decoder_hidden": 256, | |
| "conv_filter_size": 1024, | |
| "conv_kernel_size": [9, 1], | |
| "encoder_dropout": 0.2, | |
| "decoder_dropout": 0.2 | |
| }, | |
| // Settings for variance_predictor | |
| "variance_predictor":{ | |
| "filter_size": 256, | |
| "kernel_size": 3, | |
| "dropout": 0.5 | |
| }, | |
| "variance_embedding":{ | |
| "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing | |
| "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing | |
| "n_bins": 256 | |
| }, | |
| "max_seq_len": 1000 | |
| }, | |
| "train":{ | |
| "batch_size": 16, | |
| "sort_sample": true, | |
| "drop_last": true, | |
| "group_size": 4, | |
| "grad_clip_thresh": 1.0, | |
| "dataloader": { | |
| "num_worker": 8, | |
| "pin_memory": true | |
| }, | |
| "lr_scheduler":{ | |
| "num_warmup": 4000 | |
| }, | |
| // LR Scheduler | |
| "scheduler": "NoamLR", | |
| // Optimizer | |
| "optimizer": "Adam", | |
| "adam": { | |
| "lr": 0.0625, | |
| "betas": [0.9, 0.98], | |
| "eps": 0.000000001, | |
| "weight_decay": 0.0 | |
| }, | |
| } | |
| } | |