# Model (for classification heads only - features are pre-extracted)
model:
  name: "microsoft/wavlm-base-plus"  # Used for hidden_size reference
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 256

# Audio processing
audio:
  sampling_rate: 16000
  max_duration: 5  # seconds

# Training
training:
  batch_size: 32
  learning_rate: 5e-5
  num_epochs: 15
  warmup_ratio: 0.125
  weight_decay: 0.0125
  gradient_clip: 0.5
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 4

# Data Augmentation
augmentation:
  enabled: true
  prob: 0.8

# Loss
loss:
  dialect_weight: 3.0

# WandB Configuration
wandb:
  enabled: true
  api_key: "YOUR_WANDB_API_KEY"  # replace with your own key; do not commit real credentials
  project: "speaker-profiling"
  run_name: null

# Dataset paths
# source: "vispeech" (CSV format) or "vimd" (HuggingFace format)
data:
  source: "vispeech"  # Options: vispeech, vimd

  # === ViSpeech (CSV format) ===
  vispeech_root: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech"
  train_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/trainset.csv"
  train_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/trainset"
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/noisy_testset"
  val_split: 0.15

  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"

# Output
output:
  dir: "output/speaker-profiling"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"

# Early Stopping
early_stopping:
  patience: 3
  threshold: 0.0025

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    0: 0  # Support int labels (ViMD)
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  # ViMD uses 'region' column
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Reproducibility
seed: 42
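
# --- Usage sketch (not part of the training code) ---
# A minimal example of how this config might be loaded and its label maps applied;
# the file name "config.yaml", the use of PyYAML, and the variable names below are
# assumptions, kept as comments so this file remains valid YAML:
#
#   import yaml
#
#   with open("config.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   gender_id = cfg["labels"]["gender"]["Male"]          # -> 0
#   dialect_key = "region_to_dialect" if cfg["data"]["source"] == "vimd" else "dialect"
#   dialect_id = cfg["labels"][dialect_key]["Central"]   # -> 1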