# Evaluation Configuration
# Evaluate model on test sets from raw audio
# Architecture: WavLM + Attentive Pooling + LayerNorm + Deeper Heads
# Copy this file to eval.yaml and update paths

# Model
model:
  checkpoint: "path/to/best_model"
  name: "microsoft/wavlm-base-plus"
  head_hidden_dim: 256
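  # NOTE: checkpoint should point at the fine-tuned weights saved during
  # training; name must match the backbone the checkpoint was trained on.
  # (Assumed: head_hidden_dim is the width of the deeper classification
  # heads named in the architecture comment above.)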

# Audio Processing
audio:
  sampling_rate: 16000
  max_duration: 5
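  # WavLM is pretrained on 16 kHz audio, so keep sampling_rate at 16000.
  # (Assumed: clips longer than max_duration seconds are truncated before
  # feature extraction.)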

# Evaluation
evaluation:
  batch_size: 32
  dataloader_num_workers: 2
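  # Lower batch_size if GPU memory is tight; dataloader_num_workers only
  # affects loading speed, not the reported metrics.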

# Data Paths (UPDATE THESE PATHS)
data:
  # === ViSpeech (CSV format) ===
  clean_test_meta: "path/to/metadata/clean_testset.csv"
  clean_test_audio: "path/to/clean_testset"
  noisy_test_meta: "path/to/metadata/noisy_testset.csv"
  noisy_test_audio: "path/to/noisy_testset"
  # === ViMD (HuggingFace format) ===
  vimd_path: "/path/to/vimd-dataset"
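  # Each ViSpeech entry pairs a metadata CSV with a directory of audio
  # files; vimd_path is a dataset in HuggingFace format (assumed: loadable
  # with datasets.load_from_disk).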

# Output
output:
  dir: "output/evaluation"
  save_predictions: true
  save_confusion_matrix: true
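  # (Assumed: per-test-set predictions and confusion matrices are written
  # under dir when the corresponding flags are true.)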

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2
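  # The 0/1 entries under gender pass through metadata that is already
  # numeric (assumed); region_to_dialect maps ViMD region labels onto the
  # same three dialect classes used above.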

# Baseline Comparison (PACLIC 2024 - ResNet34)
baseline:
  gender:
    clean: 98.73
    noisy: 98.14
  dialect:
    clean: 81.47
    noisy: 74.80
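  # Reference scores (assumed: accuracy, %) of the ResNet34 baseline from
  # PACLIC 2024, reported on the same clean/noisy test sets for comparison.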