Spaces:

Thanh-Lam
/

vietnamese-speaker-profiling-v2

Sleeping

Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020

c3418e9 9 days ago

1.38 kB

	# Evaluation Configuration
	# Architecture: Encoder + Attentive Pooling + LayerNorm

	# Model
	# Encoder backbone and the fine-tuned checkpoint loaded for evaluation.
	# NOTE(review): the Space description mentions wav2vec2-base-vi-vlsp2020,
	# but `name` points at WavLM — confirm which encoder the checkpoint in
	# `checkpoint` was actually trained with.
	model:
	  checkpoint: "output/speaker-profiling/best_model"
	  name: "microsoft/wavlm-base-plus"
	  # Presumably the hidden width of the classification head — confirm.
	  head_hidden_dim: 256

	# Audio Processing
	# Settings applied to audio before it is fed to the model.
	audio:
	  # Presumably in Hz — confirm against the audio loader.
	  sampling_rate: 16000
	  # Presumably in seconds — confirm against the audio loader.
	  max_duration: 5

	# Evaluation
	# Batching settings for the evaluation run.
	evaluation:
	  batch_size: 32
	  dataloader_num_workers: 2

	# Data Paths
	# NOTE(review): every path below is absolute and machine-specific (a local
	# /home/ubuntu checkout and a Kaggle input mount), not relative to the
	# repo root — adjust them for the environment the evaluation runs in.
	data:
	  # === ViSpeech (CSV format) ===
	  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/clean_testset.csv"
	  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/clean_testset"
	  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
	  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/noisy_testset"

	  # === ViMD (HuggingFace format) ===
	  vimd_path: "/kaggle/input/vimd-dataset"

	# Output
	# Where evaluation artifacts are written and which ones are produced.
	output:
	  dir: "output/evaluation"
	  save_predictions: true
	  save_confusion_matrix: true

	# Label Mappings
	# Each sub-mapping translates a raw label value to an integer class id.
	# The sub-mappings must stay nested: flattened, `North`/`Central`/`South`
	# would be duplicate keys within a single mapping.
	labels:
	  gender:
	    Male: 0
	    Female: 1
	    # Identity entries, presumably for metadata that already stores numeric
	    # labels. NOTE(review): these keys parse as YAML integers, not strings —
	    # confirm the consumer looks them up with int keys.
	    0: 0
	    1: 1
	  dialect:
	    North: 0
	    Central: 1
	    South: 2
	  # Same ids as `dialect`; presumably keyed by a region column — confirm.
	  region_to_dialect:
	    North: 0
	    Central: 1
	    South: 2

	# Baseline Comparison (PACLIC 2024 - ResNet34)
	# Reference scores per task and test set; presumably accuracy in percent —
	# confirm against the cited paper. The per-task mappings must stay nested:
	# flattened, `clean`/`noisy` would be duplicate keys and the gender values
	# would be overwritten by the dialect ones in last-wins parsers.
	baseline:
	  gender:
	    clean: 98.73
	    noisy: 98.14
	  dialect:
	    clean: 81.47
	    noisy: 74.80