# Thanh-Lam's picture
# Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
# c3418e9
---
# Evaluation Configuration
# Evaluate model on test sets from raw audio
# Architecture: WavLM + Attentive Pooling + LayerNorm + Deeper Heads
# Copy this file to eval.yaml and update paths

# Model
model:
  checkpoint: "path/to/best_model"
  name: "microsoft/wavlm-base-plus"
  head_hidden_dim: 256

# Audio Processing
audio:
  sampling_rate: 16000  # presumably Hz; confirm against the audio loader
  max_duration: 5  # presumably seconds; confirm against the audio loader

# Evaluation
evaluation:
  batch_size: 32
  dataloader_num_workers: 2

# Data Paths (UPDATE THESE PATHS)
data:
  # === ViSpeech (CSV format) ===
  clean_test_meta: "path/to/metadata/clean_testset.csv"
  clean_test_audio: "path/to/clean_testset"
  noisy_test_meta: "path/to/metadata/noisy_testset.csv"
  noisy_test_audio: "path/to/noisy_testset"
  # === ViMD (HuggingFace format) ===
  vimd_path: "/path/to/vimd-dataset"

# Output
output:
  dir: "output/evaluation"
  save_predictions: true
  save_confusion_matrix: true

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    # Integer keys map to themselves, next to the string names above.
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  # Same name-to-index assignment as `dialect`, under a separate key.
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Baseline Comparison (PACLIC 2024 - ResNet34)
# NOTE(review): values look like accuracy percentages; confirm the metric.
baseline:
  gender:
    clean: 98.73
    noisy: 98.14
  dialect:
    clean: 81.47
    noisy: 74.80