# Thanh-Lam's picture
# Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
# c3418e9
# Inference Configuration
# Predict gender and dialect from audio
# Architecture: WavLM + Attentive Pooling + LayerNorm + Deeper Heads
# Copy this file to infer.yaml and update paths

# Model
model:
  # Placeholder path: replace with the location of the trained checkpoint.
  checkpoint: "path/to/best_model"
  # Presumably the pretrained backbone identifier -- confirm it matches the
  # architecture the checkpoint was trained with.
  name: "microsoft/wavlm-base-plus"
  # Presumably the hidden width of the prediction heads -- TODO confirm.
  head_hidden_dim: 256

# Audio Processing
audio:
  # Presumably in Hz (16000 = 16 kHz) -- TODO confirm against the loader.
  sampling_rate: 16000
  # Presumably the maximum clip length in seconds -- TODO confirm.
  max_duration: 5

# Inference
inference:
  batch_size: 1
  # Presumably the compute device string passed to the framework; confirm
  # which values the consumer accepts on machines without a GPU.
  device: "cuda"

# Input
input:
  # Both entries are explicitly null in this template. Presumably one of them
  # is set before running (a single file vs. a directory) -- TODO confirm
  # how the consumer treats the case where both or neither are set.
  audio_path: null
  audio_dir: null

# Output
output:
  # Relative path; presumably resolved against the working directory --
  # TODO confirm.
  dir: "output/predictions"
  save_results: true
  format: "json"

# Label Mappings
# The numeric keys are unquoted, so YAML loaders read them as integers rather
# than strings. Presumably they are the class indices produced by the model
# -- verify the order against the training label encoding.
labels:
  gender:
    0: "Male"
    1: "Female"
  dialect:
    0: "North"
    1: "Central"
    2: "South"