model:
  name: "microsoft/wavlm-base-plus"   # Used for hidden_size reference
  hidden_size: 768                    # WavLM base hidden dimension
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 256
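# Note (assumption, not verified against the training code): num_genders and
# num_dialects set the output sizes of the two classification heads, while
# head_hidden_dim and dropout presumably configure the small MLP heads placed
# on top of the pooled WavLM hidden states (hidden_size 768).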

audio:
  sampling_rate: 16000
  max_duration: 5        # seconds
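# Note (assumption): at 16 kHz, the 5-second cap corresponds to 80,000 samples;
# longer clips are presumably truncated and shorter ones padded during feature
# extraction.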

# Training
training:
  batch_size: 32
  learning_rate: 5e-5
  num_epochs: 15
  warmup_ratio: 0.125
  weight_decay: 0.0125
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 4
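# Note (assuming the usual Hugging Face Trainer semantics): warmup_ratio 0.125
# warms the learning rate up over the first 12.5% of training steps before the
# linear decay, and gradient_clip caps the global gradient norm at 1.0.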

# Augmentation
augmentation:
  enabled: true
  prob: 0.8
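# Note (assumption): prob is presumably the per-utterance probability of
# applying augmentation during training.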

# Loss
loss:
  dialect_weight: 3.0
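# Note (assumption): dialect_weight presumably scales the dialect term in the
# joint loss, e.g. total = gender_loss + 3.0 * dialect_loss, up-weighting the
# harder three-way dialect task.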

# MLflow
mlflow:
  enabled: true
  tracking_uri: "mlruns"
  experiment_name: "speaker-profiling"
  run_name: null
  registered_model_name: null
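# Note: a plain "mlruns" tracking_uri logs to a local ./mlruns directory; with
# registered_model_name left null, runs are presumably tracked without
# registering a model in the MLflow Model Registry.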

# ============================================================
# STEP 1: Update RAW DATASET PATHS to your local ViSpeech location
# STEP 2: Run prepare_data.py to extract features
# STEP 3: Features will be saved to train_dir/val_dir folders
# ============================================================
data:
  source: "vispeech"        # Options: vispeech, vimd

  # === RAW DATASET PATHS (for prepare_data.py) ===
  vispeech_root: "/path/to/ViSpeech"
  # Training data
  train_meta: "/path/to/ViSpeech/metadata/trainset.csv"   # <-- UPDATE
  train_audio: "/path/to/ViSpeech/trainset"
  # Test data
  clean_test_meta: "/path/to/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/path/to/ViSpeech/clean_testset"
  noisy_test_meta: "/path/to/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/path/to/ViSpeech/noisy_testset"
  val_split: 0.15           # Validation split ratio (extracted from trainset)
  vimd_path: "/path/to/vimd-dataset"

  # === EXTRACTED FEATURES PATHS (for finetune.py) ===
  train_dir: "datasets/ViSpeech/train"
  val_dir: "datasets/ViSpeech/val"

# Output
output:
  dir: "output/speaker-profiling"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"
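# Note (assuming Hugging Face Trainer-style semantics): checkpoints are ranked
# by validation dialect accuracy, and save_total_limit keeps at most three
# checkpoints on disk.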

# Early stopping
early_stopping:
  patience: 3
  threshold: 0.0025
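# Note (assumption): training presumably stops once the tracked metric fails
# to improve by at least threshold (0.0025) for patience (3) consecutive
# evaluations.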

# Label Mappings (must match prepare_data.py)
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
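# Note (assumption): the numeric 0/1 entries under gender appear to pass
# through metadata rows that already store numeric gender codes, and
# region_to_dialect maps region names onto the same dialect ids as above.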

# Reproducibility
seed: 42