# Config for ECAPA-TDNN (SpeechBrain)
# Model: speechbrain/spkrec-ecapa-voxceleb
# Model: pretrained encoder plus classification-head settings.
model:
  name: 'speechbrain/spkrec-ecapa-voxceleb'
  # Class counts; these match the number of named gender (Male/Female) and
  # dialect (North/Central/South) entries listed under `labels` in this file.
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  # Smaller head for 192-dim embeddings.
  head_hidden_dim: 128
# Audio processing: input waveform settings.
audio:
  sampling_rate: 16000  # presumably Hz -- confirm against the data loader
  max_duration: 5  # seconds
# Training: optimizer, schedule and dataloader hyper-parameters.
training:
  batch_size: 32
  # Higher LR since only training heads.
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `1e-4` as the *string* "1e-4", not a float.
  learning_rate: 1.0e-4
  num_epochs: 15
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 1.0
  lr_scheduler: 'linear'
  fp16: false  # ECAPA-TDNN does not support fp16
  dataloader_num_workers: 4
# Data augmentation switch and application probability.
augmentation:
  enabled: true
  prob: 0.8  # presumably the per-sample chance of augmenting -- confirm
# Loss weighting.
loss:
  # NOTE(review): presumably scales the dialect term relative to the other
  # loss term(s); confirm against the loss implementation.
  dialect_weight: 3.0
# WandB (Weights & Biases) experiment-tracking configuration.
wandb:
  enabled: true
  # SECURITY: never commit credentials to this file. Keep this value null and
  # provide the key through the WANDB_API_KEY environment variable (or run
  # `wandb login` once on the machine) instead.
  # NOTE(review): a real key was previously committed here -- revoke/rotate it.
  # NOTE(review): confirm the training code accepts a null api_key and falls
  # back to the environment.
  api_key: null
  project: 'vispeech-speaker-profiling'
  run_name: 'ecapa-tdnn'
# Dataset locations.
# NOTE(review): all paths below are absolute and machine-specific; adjust them
# for the environment the job runs in.
data:
  source: 'vispeech'  # Options: vispeech, vimd

  # === ViSpeech (CSV format) ===
  vispeech_root: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech'
  # Each split pairs a metadata CSV (`*_meta`) with an audio location (`*_audio`).
  train_meta: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/trainset.csv'
  train_audio: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/trainset'
  clean_test_meta: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/clean_testset.csv'
  clean_test_audio: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/clean_testset'
  noisy_test_meta: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/noisy_testset.csv'
  noisy_test_audio: '/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/noisy_testset'
  val_split: 0.15  # presumably the fraction held out for validation -- confirm

  # === ViMD (HuggingFace format) ===
  vimd_path: '/kaggle/input/vimd-dataset'
# Output: where run artifacts go and how the best model is chosen.
output:
  dir: 'output/ecapa-tdnn'
  save_total_limit: 3  # presumably the max number of checkpoints kept -- confirm
  metric_for_best_model: 'dialect_acc'
# Early stopping criteria.
# NOTE(review): units are not stated here; presumably `patience` counts
# evaluations without improvement and `threshold` is the minimum improvement
# that counts -- confirm against the trainer.
early_stopping:
  patience: 3
  threshold: 0.0025
# Label mappings: raw label value -> integer class id.
labels:
  gender:
    Male: 0
    Female: 1
    # The two keys below are integers (unquoted), not strings; presumably they
    # let already-encoded labels map to themselves -- confirm with the loader.
    0: 0
    1: 1

  dialect:
    North: 0
    Central: 1
    South: 2

  # Same name -> id assignment as `dialect` above, kept as a separate mapping.
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
# Reproducibility: seed value for the run.
seed: 42