encoder: chunkformer
is_json_cmvn: true
cmvn_file: chunkformer-large-vie/global_cmvn
input_dim: 80
output_dim: 6992
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048    # number of units in the position-wise feed-forward layer
    num_blocks: 17    # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: 'dw_striding'    # encoder input type; you can choose conv2d, conv2d6, or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'chunk_rel_pos'
    selfattention_layer_type: 'chunk_rel_seflattn'
    causal: false
    use_dynamic_chunk: false
    dynamic_chunk_sizes: [64, 128, 256]
    dynamic_right_context_sizes: [128, 128, 128]
    dynamic_left_context_sizes: [128, 256, 128]
    cnn_module_norm: 'layer_norm'    # using nn.LayerNorm makes the model converge faster
    use_dynamic_left_chunk: false
    dynamic_conv: true
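
The dynamic chunk and context sizes describe how much signal each chunk can attend to. Below is a minimal sketch (not part of the ChunkFormer code) that loads the config and converts those sizes into approximate audio spans, under the assumptions that the file is saved as `config.yaml`, that the sizes are counted in encoder output frames, and that the `dw_striding` frontend subsamples 10 ms frames by a factor of 8.

```python
# Minimal sketch (not part of the ChunkFormer repo): load the config above and
# convert each dynamic chunk/context setting into an approximate audio span.
# Assumptions: the YAML is saved as config.yaml, the sizes are counted in
# encoder output frames, and 'dw_striding' subsamples 10 ms frames by 8x.
import yaml

with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

enc = cfg["encoder_conf"]
subsampling = 8        # assumed subsampling factor of the dw_striding frontend
frame_shift_ms = 10    # assumed feature frame shift

for chunk, left, right in zip(enc["dynamic_chunk_sizes"],
                              enc["dynamic_left_context_sizes"],
                              enc["dynamic_right_context_sizes"]):
    span_frames = left + chunk + right          # encoder frames visible per chunk
    span_sec = span_frames * subsampling * frame_shift_ms / 1000
    print(f"chunk={chunk:4d}  left={left:4d}  right={right:4d}  "
          f"-> attention span ≈ {span_sec:.2f} s of audio")
```

Note that `use_dynamic_chunk` is false in this config, so whether and where these sizes are applied depends on how the training or decoding code consumes them; the sketch only illustrates the arithmetic.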