# Thanh-Lam's picture
# Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
# c3418e9
# Config for ECAPA-TDNN (SpeechBrain)
# Model: speechbrain/spkrec-ecapa-voxceleb
# Embedding model and classification-head settings
model:
  name: "speechbrain/spkrec-ecapa-voxceleb"  # model identifier
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 128  # kept small because the embeddings are 192-dim
# Audio preprocessing
audio:
  sampling_rate: 16000
  max_duration: 5  # in seconds
# Optimisation hyperparameters
training:
  batch_size: 32
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-4` to the string "1e-4" instead of a float.
  learning_rate: 1.0e-4  # higher LR since only the heads are trained
  num_epochs: 15
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: false  # ECAPA-TDNN does not support fp16
  dataloader_num_workers: 4
# Data augmentation switches
augmentation:
  enabled: true
  prob: 0.8  # presumably the probability of augmenting a sample; confirm in the data pipeline
# Loss weighting
loss:
  dialect_weight: 3.0  # presumably scales the dialect loss term; confirm in the training code
# Weights & Biases experiment tracking
wandb:
  enabled: true
  # SECURITY: never commit credentials. A literal API key was previously stored
  # in this field; treat it as compromised and revoke it in the W&B account.
  # Supply the key out-of-band instead, e.g. through the WANDB_API_KEY
  # environment variable or a prior `wandb login` on the machine.
  api_key: null
  project: "vispeech-speaker-profiling"
  run_name: "ecapa-tdnn"
# Dataset locations
data:
  source: "vispeech"  # one of: vispeech, vimd

  # --- ViSpeech (CSV metadata + audio directories) ---
  vispeech_root: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech"
  train_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/trainset.csv"
  train_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/trainset"
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/noisy_testset"
  val_split: 0.15  # presumably the fraction of the training set held out for validation; confirm

  # --- ViMD (HuggingFace format) ---
  vimd_path: "/kaggle/input/vimd-dataset"
# Output location and checkpoint-selection settings
output:
  dir: "output/ecapa-tdnn"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"
# Early-stopping criteria
early_stopping:
  patience: 3
  threshold: 0.0025
# Mappings from raw label values to class indices
labels:
  gender:
    Male: 0
    Female: 1
    # Integer keys map to themselves (presumably for metadata that is already
    # numeric; confirm against the dataset loader).
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
# Seed used for reproducible runs
seed: 42