Thanh-Lam's picture
Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
c3418e9
raw
history blame contribute delete
695 Bytes
# Inference Configuration
# Model
# NOTE(review): `checkpoint` looks like a local path to fine-tuned weights
# and `name` like a Hugging Face Hub ID for the base encoder — confirm
# against the loader code.
model:
  checkpoint: "model/vulehuubinh"
  name: "nguyenvulebinh/wav2vec2-base-vi-vlsp2020"
  # Presumably the hidden width of the classification head; it would need
  # to match the value used at training time — TODO confirm.
  head_hidden_dim: 256
# Audio Processing
audio:
  # Presumably in Hz (16 kHz) — TODO confirm against the feature extractor.
  sampling_rate: 16000
  # Presumably in seconds; whether longer clips are truncated or rejected
  # is decided by the consumer — TODO confirm.
  max_duration: 5
# Inference
inference:
  batch_size: 1
  # NOTE(review): hard-coded to a GPU device string; confirm whether the
  # consumer falls back to "cpu" when CUDA is unavailable.
  device: "cuda"
# Input
# Both entries are explicitly null in this file; presumably the caller
# supplies one of them at run time — TODO confirm.
input:
  audio_path: null
  audio_dir: null
# Output
output:
  # Relative path; presumably resolved against the working directory —
  # TODO confirm.
  dir: "output/predictions"
  save_results: true
  format: "json"
# Label Mappings
# Each sub-mapping translates an integer class index into its display name.
# `gender` and `dialect` must stay nested under `labels`: both reuse the
# indices 0 and 1, so a flat layout would produce duplicate keys.
#
# NOTE: the model was trained with Female=0, Male=1, which is the opposite
# of the order in finetune.yaml. This is possibly because pandas .map()
# processed the labels in a different order.
labels:
  gender:
    0: "Female"
    1: "Male"
  dialect:
    0: "North"
    1: "Central"
    2: "South"