# Training / model configuration.
#
# NOTE(review): this file had lost its indentation and was wrapped in table
# pipes; the nesting below was reconstructed from the section headers and from
# keys that would otherwise be duplicated (type, in_channels,
# content_codebook_size, f0_condition, n_f0_bins). Confirm the hierarchy
# against the code that loads this config.

# --- Run / checkpoint settings ---
log_dir: "runs/run_mel_seed_uvit_xlsr_tiny"
save_freq: 1
log_interval: 10
save_interval: 500
device: "cuda"
epochs: 1000  # number of epochs for first stage training (pre-training)
batch_size: 2
batch_length: 100  # maximum duration of audio in a batch (in seconds)
max_len: 80  # maximum number of frames
pretrained_model: "DiT_uvit_tat_xlsr_ema.pth"
pretrained_encoder: ""
load_only_params: false  # set to true to skip loading epoch numbers and optimizer parameters

# --- Audio preprocessing ---
preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
    fmin: 0
    fmax: 8000

# --- Model definition ---
model_params:
  dit_type: "DiT"  # uDiT or DiT
  reg_loss_type: "l1"  # l1 or l2
  diffusion_type: "flow"

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  vocoder:
    type: "hifigan"

  speech_tokenizer:
    type: 'xlsr'
    output_layer: 12
    name: 'facebook/wav2vec2-xls-r-300m'

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  length_regulator:
    channels: 384
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 1024
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 2
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 384
    num_heads: 6
    depth: 9
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'mlp'
    target: 'mel'  # mel or betavae
    content_dim: 384
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: false
    zero_prompt_speech_token: false  # for the prompt component, do not input the corresponding speech token
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: true
    add_resblock_in_transformer: false

# --- Optimisation ---
loss_params:
  base_lr: 0.0001