--- tags: - espnet - audio - universa language: multilingual datasets: - universa_unite license: cc-by-4.0 --- ## ESPnet2 universa model ### `espnet/arecho_base_v0` This model was trained by ftshijt using the universa_unite recipe in [espnet](https://github.com/espnet/espnet/). ### Demo: How to use in ESPnet2 Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) if you haven't done so already. ```bash cd espnet git checkout 0b68ffd26362f4b50e7c73942c5bbdbc0a220bd4 pip install -e . cd egs2/universa_unite/uni_versa1 ./run.sh --skip_data_prep false --skip_train true --download_model espnet/arecho_base_v0 ``` ## universa config
expand ``` accum_grad: 2 adapter: lora adapter_conf: {} allow_multi_rates: false allow_variable_data_keys: false batch_bins: 1000000 batch_size: 16 batch_type: sorted best_model_criterion: - - train - loss - min - - valid - loss - min - - train - acc - max - - valid - acc - max bpemodel: null category_sample_size: 10 chunk_default_fs: null chunk_discard_short_samples: true chunk_excluded_key_prefixes: [] chunk_length: 500 chunk_max_abs_length: null chunk_shift_ratio: 0.5 cleaner: null collect_stats: false config: conf/train_aruniversa_wavlm.yaml create_graph_in_tensorboard: false cudnn_benchmark: false cudnn_deterministic: false cudnn_enabled: true ddp_comm_hook: null deepspeed_config: null detect_anomaly: false dist_backend: nccl dist_init_method: env:// dist_launcher: null dist_master_addr: null dist_master_port: null dist_rank: null dist_world_size: null distributed: false drop_last_iter: false dry_run: false early_stopping_criterion: - valid - loss - min exclude_weight_decay: false exclude_weight_decay_conf: {} fold_length: - 256000 freeze_param: - frontend.upstream frontend: s3prl frontend_conf: download_dir: ./hub frontend_conf: upstream: wavlm_large multilayer_feature: true g2p: null grad_clip: -1 grad_clip_type: 2.0 grad_noise: false gradient_as_bucket_view: true ignore_init_mismatch: false init: null init_param: [] iterator_type: sequence keep_nbest_models: 1 local_rank: 0 log_interval: 50 log_level: INFO max_cache_fd: 32 max_cache_size: 0.0 max_epoch: 100 metric2id: dump/raw/overall_base/metric2id metric2type: dump/raw/overall_base/metric2type metric_pad_value: -100 metric_token_info: data/token_list/metric_500_percentile_overall_base_w-numerical/tokens.json metric_token_pad_value: 0 model_conf: {} multi_task_dataset: false multiple_iterator: false multiprocessing_distributed: false nbest_averaging_interval: 0 ngpu: 1 no_forward_run: false non_linguistic_symbols: null num_att_plot: 0 num_cache_chunks: 1024 num_iters_per_epoch: null num_workers: 1 optim: 
adamw optim_conf: lr: 0.001 output_dir: exp/universa_universa_ar_overall_base_token_wavlm patience: null pretrain_path: null print_config: false randomize_sequential_metric: true required: - output_dir - metric2id resume: true save_strategy: all scheduler: warmuplr scheduler_conf: warmup_steps: 25000 seed: 777 sequential_metric: true sharded_ddp: false shuffle_within_batch: false sort_batch: descending sort_in_batch: descending token_list: null token_type: bpe tokenize_numerical_metric: true train_data_path_and_name_and_type: - - dump/raw/overall_base/wav.scp - audio - kaldi_ark - - dump/raw/overall_base/metric.scp - metrics - metric - - dump/raw/overall_base/ref_wav.scp - ref_audio - kaldi_ark train_dtype: float32 train_shape_file: - exp/universa_stats_overall_base/train/audio_shape - exp/universa_stats_overall_base/train/ref_audio_shape universa: ar_universa universa_conf: audio_encoder_params: attention_dropout_rate: 0.1 attention_heads: 4 concat_after: false dropout_rate: 0.1 input_layer: conv2d layer_drop_rate: 0.1 linear_units: 1024 normalize_before: true num_blocks: 4 positional_dropout_rate: 0.1 positionwise_conv_kernel_size: 1 positionwise_layer_type: linear qk_norm: false use_flash_attn: false audio_encoder_type: transformer cross_attention_params: dropout_rate: 0.1 n_head: 2 cross_attention_type: multihead embedding_dim: 256 lsm_weight: 0.1 metric_decoder_params: attention_heads: 4 concat_after: false dropout_rate: 0.1 input_layer: embed layer_drop_rate: 0.1 linear_units: 1024 normalize_before: true num_blocks: 4 positional_dropout_rate: 0.1 qk_norm: false self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 use_flash_attn: false sym_eos: sym_sos: use_rope: true unused_parameters: false use_adapter: false use_amp: false use_deepspeed: false use_matplotlib: true use_preprocessor: true use_ref_audio: true use_ref_text: false use_tensorboard: true use_tf32: false use_wandb: false val_scheduler_criterion: - valid - loss valid_batch_bins: null 
valid_batch_size: null valid_batch_type: null valid_data_path_and_name_and_type: - - dump/raw/overall_dev/wav.scp - audio - kaldi_ark - - dump/raw/overall_dev/metric.scp - metrics - metric - - dump/raw/overall_dev/ref_wav.scp - ref_audio - kaldi_ark valid_iterator_type: null valid_max_cache_size: null valid_shape_file: - exp/universa_stats_overall_base/valid/audio_shape - exp/universa_stats_overall_base/valid/ref_audio_shape version: '202503' wandb_entity: null wandb_id: null wandb_model_log_interval: -1 wandb_name: null wandb_project: null write_collected_feats: false ```
### Citing ESPnet ```bibtex @inproceedings{watanabe2018espnet, author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, title={{ESPnet}: End-to-End Speech Processing Toolkit}, year={2018}, booktitle={Proceedings of Interspeech}, pages={2207--2211}, doi={10.21437/Interspeech.2018-1456}, url={http://dx.doi.org/10.21437/Interspeech.2018-1456} } ``` or arXiv: ```bibtex @misc{watanabe2018espnet, title={ESPnet: End-to-End Speech Processing Toolkit}, author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, year={2018}, eprint={1804.00015}, archivePrefix={arXiv}, primaryClass={cs.CL} } ```