# ################################
# Model: wav2vec2
# Authors: Rudolf A. Braun 2022, Titouan Parcollet 2022
# ################################

sample_rate: 16000

# Standard parameters for the BASE model
latent_extractor: !new:speechbrain.lobes.models.wav2vec.W2VLatentExtractor
    out_channels: [512, 512, 512, 512, 512, 512, 512]

# Standard parameters for the BASE model
latent_encoder: !new:speechbrain.lobes.models.transformer.Transformer.TransformerEncoder
    d_model: 768
    num_layers: 12
    nhead: 8
    d_ffn: 3072
    dropout: 0.1
    layerdrop_prob: 0.0
    normalize_before: true
    activation: !name:torch.nn.GELU

# Standard parameters for the BASE model
encoder_wrapper: !new:speechbrain.lobes.models.wav2vec.EncoderWrapper
    in_dim: 512
    embedding_dim: 768
    latent_encoder: !ref <latent_encoder>
    dropout_encoder_input: 0.1

# Full encoder: feature extractor followed by the wrapped transformer encoder
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    latent_extractor: !ref <latent_extractor>
    encoder_wrapper: !ref <encoder_wrapper>

modules:
    encoder: !ref <encoder>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        # NOTE(review): the "latent_encoder" loadable points at <encoder_wrapper>
        # (not <latent_encoder>) — presumably this matches the key the checkpoint
        # was saved under; confirm against the saved checkpoint before changing.
        latent_encoder: !ref <encoder_wrapper>
        latent_extractor: !ref <latent_extractor>