model:
  name: "microsoft/wavlm-base-plus"   # Used for hidden_size reference
  hidden_size: 768                    # WavLM base hidden dimension
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 256
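# Note (assumption, not verified against the training code): num_genders and
# num_dialects set the output sizes of the two classification heads, while
# head_hidden_dim and dropout presumably configure the small MLP heads placed
# on top of the pooled WavLM hidden states (hidden_size 768).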

audio:
  sampling_rate: 16000
  max_duration: 5        # seconds
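# Note (assumption): at 16 kHz, the 5-second cap corresponds to 80,000 samples;
# longer clips are presumably truncated and shorter ones padded during feature
# extraction.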

# Training
training:
  batch_size: 32
  learning_rate: 5e-5
  num_epochs: 15
  warmup_ratio: 0.125
  weight_decay: 0.0125
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 4
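# Note (assuming the usual Hugging Face Trainer semantics): warmup_ratio 0.125
# warms the learning rate up over the first 12.5% of training steps before the
# linear decay, and gradient_clip caps the global gradient norm at 1.0.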

# Augmentation
augmentation:
  enabled: true
  prob: 0.8
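# Note (assumption): prob is presumably the per-utterance probability of
# applying augmentation during training.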

# Loss
loss:
  dialect_weight: 3.0
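# Note (assumption): dialect_weight presumably scales the dialect term in the
# joint loss, e.g. total = gender_loss + 3.0 * dialect_loss, up-weighting the
# harder three-way dialect task.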

# MLflow
mlflow:
  enabled: true
  tracking_uri: "mlruns"
  experiment_name: "speaker-profiling"
  run_name: null
  registered_model_name: null
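# Note: a plain "mlruns" tracking_uri logs to a local ./mlruns directory; with
# registered_model_name left null, runs are presumably tracked without
# registering a model in the MLflow Model Registry.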

# ============================================================
# STEP 1: Update RAW DATASET PATHS to your local ViSpeech location
# STEP 2: Run prepare_data.py to extract features
# STEP 3: Features will be saved to train_dir/val_dir folders
# ============================================================
data:
  source: "vispeech"        # Options: vispeech, vimd

  # === RAW DATASET PATHS (for prepare_data.py) ===
  vispeech_root: "/path/to/ViSpeech"
  # Training data
  train_meta: "/path/to/ViSpeech/metadata/trainset.csv"   # <-- UPDATE
  train_audio: "/path/to/ViSpeech/trainset"
  # Test data
  clean_test_meta: "/path/to/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/path/to/ViSpeech/clean_testset"
  noisy_test_meta: "/path/to/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/path/to/ViSpeech/noisy_testset"
  val_split: 0.15           # Validation split ratio (extracted from trainset)
  vimd_path: "/path/to/vimd-dataset"

  # === EXTRACTED FEATURES PATHS (for finetune.py) ===
  train_dir: "datasets/ViSpeech/train"
  val_dir: "datasets/ViSpeech/val"

# Output
output:
  dir: "output/speaker-profiling"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"
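# Note (assuming Hugging Face Trainer-style semantics): checkpoints are ranked
# by validation dialect accuracy, and save_total_limit keeps at most three
# checkpoints on disk.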

# Early stopping
early_stopping:
  patience: 3
  threshold: 0.0025
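# Note (assumption): training presumably stops once the tracked metric fails
# to improve by at least threshold (0.0025) for patience (3) consecutive
# evaluations.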

# Label Mappings (must match prepare_data.py)
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
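# Note (assumption): the numeric 0/1 entries under gender appear to pass
# through metadata rows that already store numeric gender codes, and
# region_to_dialect maps region names onto the same dialect ids as above.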

# Reproducibility
seed: 42