# Evaluation Configuration
# Architecture: Encoder + Attentive Pooling + LayerNorm
# Model
# Settings for the model under evaluation.
model:
  # Location of the trained weights to evaluate (written as a relative path).
  checkpoint: "output/speaker-profiling/best_model"
  # Identifier of the pretrained encoder backbone
  # (presumably a Hugging Face Hub model id - confirm against the loader).
  name: "microsoft/wavlm-base-plus"
  # Hidden width of the prediction head.
  head_hidden_dim: 256
# Audio Processing
# Settings applied to audio before it reaches the model.
audio:
  # Sample rate of the audio (presumably in Hz - confirm).
  sampling_rate: 16000
  # Upper bound on clip length (presumably in seconds - confirm).
  max_duration: 5
# Evaluation
# Data-loading settings for the evaluation run.
evaluation:
  # Number of samples per evaluation batch.
  batch_size: 32
  # Number of data-loader worker processes.
  dataloader_num_workers: 2
# Data Paths
# NOTE(review): every value below is an absolute, machine-specific path
# (a /home/ubuntu checkout and a /kaggle/input mount), not a path relative
# to the repo root. Adjust them for the environment the evaluation runs in.
data:
  # === ViSpeech (CSV format) ===
  # Each test split pairs a metadata CSV with a directory of audio files.
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/noisy_testset"
  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"
# Output
# Where evaluation artifacts go and which ones are written.
output:
  # Directory for evaluation results (written as a relative path).
  dir: "output/evaluation"
  # Toggle for saving the predictions.
  save_predictions: true
  # Toggle for saving the confusion matrix.
  save_confusion_matrix: true
# Label Mappings
# Maps raw label values to integer class ids.
labels:
  gender:
    Male: 0
    Female: 1
    # The bare keys 0 and 1 load as integers, not strings. Presumably they
    # let labels that are already encoded pass through unchanged - confirm
    # against the dataset loader (string "0"/"1" labels would not match).
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  # Same region names and ids as `dialect` above.
  # NOTE(review): nesting under `labels` is inferred from the section
  # layout - confirm against the code that reads this key.
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
# Baseline Comparison (PACLIC 2024 - ResNet34)
# Reference scores per task on the clean and noisy test sets
# (presumably accuracy in percent - confirm against the cited paper).
baseline:
  gender:
    clean: 98.73
    noisy: 98.14
  dialect:
    clean: 81.47
    noisy: 74.80