# Thanh-Lam's picture
# Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
# c3418e9
# Model (for classification heads only - features are pre-extracted)
# The keys below are nested under `model`; without indentation they would be
# parsed as unrelated top-level keys and `model` itself would be null.
model:
  name: "microsoft/wavlm-base-plus"  # Used for hidden_size reference
  # Number of output classes per head; these match the class ids listed in
  # the gender (0, 1) and dialect (0, 1, 2) label mappings of this file.
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  # NOTE(review): presumably the hidden width of each classification head — confirm.
  head_hidden_dim: 256
# Audio processing
# Keys are nested under `audio` so the loader sees one mapping per section.
audio:
  sampling_rate: 16000  # NOTE(review): presumably Hz — confirm
  max_duration: 5  # seconds
# Training
# Optimisation hyper-parameters, nested under `training`.
training:
  batch_size: 32
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # only recognise exponent floats that contain a ".", so a bare `5e-5`
  # would be loaded as the string "5e-5" instead of a float.
  learning_rate: 5.0e-5
  num_epochs: 15
  # NOTE(review): presumably the fraction of training steps used for LR warmup — confirm.
  warmup_ratio: 0.125
  weight_decay: 0.0125
  # NOTE(review): presumably the max gradient norm for clipping — confirm.
  gradient_clip: 0.5
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 4
# Data Augmentation
# Nested under `augmentation`; a flat `enabled` key would collide with the
# `enabled` key of the wandb section.
augmentation:
  enabled: true
  # NOTE(review): presumably the probability of augmenting a given sample — confirm.
  prob: 0.8
# Loss
loss:
  # NOTE(review): presumably the weight of the dialect loss term relative to
  # the gender loss term — confirm against the training code.
  dialect_weight: 3.0
# WandB Configuration
# Nested under `wandb`; a flat `enabled` key would collide with the
# `enabled` key of the augmentation section.
wandb:
  enabled: true
  # SECURITY: a literal API key was previously committed on this line. It has
  # been removed from the file and the exposed key should be revoked.
  # Supply the key through the WANDB_API_KEY environment variable (or
  # `wandb login`) instead of storing it in version control.
  # NOTE(review): confirm the config loader accepts null here and falls back
  # to the environment.
  api_key: null
  project: "speaker-profiling"
  run_name: null
# Dataset paths
# `source` selects the dataset layout: "vispeech" (CSV format) or
# "vimd" (HuggingFace format).
# NOTE(review): the paths below are absolute and machine-specific; adjust
# them for the environment the code runs in.
data:
  source: "vispeech"  # Options: vispeech, vimd

  # === ViSpeech (CSV format) ===
  vispeech_root: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech"
  train_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/trainset.csv"
  train_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/trainset"
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/noisy_testset"
  # NOTE(review): presumably the fraction of the training set held out for
  # validation — confirm.
  val_split: 0.15

  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"
# Output
# Checkpoint/output settings, nested under `output`.
output:
  dir: "output/speaker-profiling"  # relative path
  # NOTE(review): presumably the max number of checkpoints kept on disk — confirm.
  save_total_limit: 3
  # Name of the metric used to pick the best model.
  metric_for_best_model: "dialect_acc"
# Early Stopping
early_stopping:
  # NOTE(review): presumably the number of evaluations without improvement
  # tolerated before stopping — confirm.
  patience: 3
  # NOTE(review): presumably the minimum metric improvement that counts as
  # progress — confirm.
  threshold: 0.0025
# Label Mappings
# Each sub-mapping translates a raw label value to an integer class id.
# The sub-mappings must be nested: written flat, North/Central/South would
# appear twice at the same level (duplicate keys, last one silently wins).
labels:
  gender:
    Male: 0
    Female: 1
    # Integer keys (not strings), mapped to themselves.
    0: 0  # Support int labels (ViMD)
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  # ViMD uses 'region' column
  # NOTE(review): placed under `labels` based on the section layout — confirm
  # the loader does not expect `region_to_dialect` at the top level.
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
# Reproducibility
# Top-level integer seed.
# NOTE(review): presumably applied to every RNG used by the training code — confirm.
seed: 42