# Source: Thanh-Lam — Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
# Commit: c3418e9
# Evaluation Configuration
# Architecture: Encoder + Attentive Pooling + LayerNorm

# Model
# NOTE(review): the page title above mentions wav2vec2-base-vi-vlsp2020, but the
# encoder configured here is wavlm-base-plus — confirm which backbone the
# checkpoint was trained with.
model:
  # Fine-tuned checkpoint to evaluate.
  checkpoint: "output/speaker-profiling/best_model"
  # Backbone encoder identifier (HuggingFace Hub).
  name: "microsoft/wavlm-base-plus"
  # Hidden size of the classification head on top of the pooled encoder output.
  head_hidden_dim: 256
# Audio Processing
audio:
  sampling_rate: 16000  # Hz
  max_duration: 5  # presumably seconds (clips longer than this are cut/padded) — confirm against the dataloader
# Evaluation
evaluation:
  # Samples per forward pass during evaluation.
  batch_size: 32
  # Worker processes for the evaluation DataLoader.
  dataloader_num_workers: 2
# Data Paths
# NOTE(review): the original header said "relative to repo root", but every
# path below is absolute. ViSpeech points at /home/ubuntu/... while ViMD points
# at /kaggle/input/... — these target different environments; confirm which
# machine this config is meant to run on.
data:
  # === ViSpeech (CSV format) ===
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/noisy_testset"
  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"
# Output
output:
  # Directory where evaluation artifacts are written.
  dir: "output/evaluation"
  # Whether to dump per-sample predictions.
  save_predictions: true
  # Whether to save a confusion matrix for each task.
  save_confusion_matrix: true
# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    # NOTE(review): the two entries below have bare 0/1 keys, which YAML parses
    # as integers. If the metadata supplies gender as the strings "0"/"1", a
    # string lookup will miss these — confirm whether they should be quoted
    # ("0": 0, "1": 1) to match the consumer's key type.
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  # Maps dataset region names to the same dialect ids as `dialect` above.
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
# Baseline Comparison (PACLIC 2024 - ResNet34)
# Presumably accuracy percentages reported by the baseline system, used for
# side-by-side comparison in the evaluation output — confirm the metric.
baseline:
  gender:
    clean: 98.73
    noisy: 98.14
  dialect:
    clean: 81.47
    noisy: 74.80