{
  "architectures": [
    "VoiceFilter"
  ],
  "audio_max_lengh": 20,
  "enh_args": {
    "batch_size": 8,
    "batch_type": "folded",
    "best_model_criterion": [
      [
        "valid",
        "si_snr",
        "max"
      ],
      [
        "valid",
        "loss",
        "min"
      ]
    ],
    "criterions": [
      {
        "conf": {
          "eps": 1e-07
        },
        "name": "si_snr",
        "wrapper": "pit",
        "wrapper_conf": {
          "independent_perm": false,
          "weight": 1.0
        }
      }
    ],
    "decoder": "stft",
    "decoder_conf": {
      "hop_length": 128,
      "n_fft": 512
    },
    "encoder": "stft",
    "encoder_conf": {
      "hop_length": 128,
      "n_fft": 512
    },
    "init": "xavier_uniform",
    "keep_nbest_models": 1,
    "max_epoch": 5,
    "model_conf": {
      "loss_type": "mask_mse",
      "mask_type": "psm"
    },
    "num_workers": 4,
    "optim": "adam",
    "optim_conf": {
      "eps": 1e-08,
      "lr": 0.001,
      "weight_decay": 1e-07
    },
    "patience": 10,
    "scheduler": "reducelronplateau",
    "scheduler_conf": {
      "factor": 0.7,
      "mode": "min",
      "patience": 1
    },
    "separator": "conformer_voice_filter",
    "separator_conf": {
      "adim": 1024,
      "aheads": 8,
      "attention_dropout_rate": 0.1,
      "concat_after": false,
      "conformer_activation_type": "swish",
      "conformer_enc_kernel_size": 5,
      "conformer_pos_enc_layer_type": "rel_pos",
      "conformer_self_attn_layer_type": "rel_selfattn",
      "dropout_rate": 0.1,
      "input_layer": "linear",
      "layers": 4,
      "linear_units": 896,
      "nonlinear": "relu",
      "normalize_before": false,
      "num_spk": 1,
      "positional_dropout_rate": 0.1,
      "positionwise_conv_kernel_size": 1,
      "positionwise_layer_type": "conv1d",
      "use_cnn_in_conformer": true,
      "use_macaron_style_in_conformer": true
    },
    "val_scheduler_criterion": [
      "valid",
      "loss"
    ],
    "xvector_emb_dim": 512
  },
  "enh_chunk_size": 5,
  "model_type": "voicefilter",
  "sample_rate": 16000,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1"
}