# Inference Configuration
#
# Settings for running inference: which model to load, how audio is
# preprocessed, where input comes from, where predictions are written, and
# how class indices map to human-readable labels.

# Model
model:
  # Location of the fine-tuned checkpoint (looks like a relative path;
  # confirm what it is resolved against).
  checkpoint: "model/vulehuubinh"
  # Identifier of the base model (presumably a Hugging Face Hub ID; confirm).
  name: "nguyenvulebinh/wav2vec2-base-vi-vlsp2020"
  head_hidden_dim: 256

# Audio Processing
audio:
  sampling_rate: 16000  # presumably Hz; confirm against the audio loader
  max_duration: 5  # presumably seconds; confirm against the audio loader

# Inference
inference:
  batch_size: 1
  device: "cuda"

# Input
# Both entries are null here; presumably one of them is supplied at run time.
# TODO confirm how the consuming script overrides them.
input:
  audio_path: null
  audio_dir: null

# Output
output:
  dir: "output/predictions"
  save_results: true
  format: "json"

# Label Mappings
# NOTE: The model was trained with Female=0, Male=1, which is the opposite of
# the order listed in finetune.yaml. The cause is unconfirmed; one suspicion is
# that the pandas .map() label-encoding step applied the labels in a different
# order. Keep this mapping in sync with the trained checkpoint, not with
# finetune.yaml.
labels:
  gender:
    0: "Female"
    1: "Male"
  dialect:
    0: "North"
    1: "Central"
    2: "South"