{
  "encoder_name": "WavLM",
  "encoder_config": {
    "hidden_dims": [512, 512, 512, 512, 512, 512, 512],
    "kernel_sizes": [10, 3, 3, 3, 3, 2, 2],
    "strides": [5, 2, 2, 2, 2, 2, 2],
    "num_layers": 6,
    "dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_buckets": 320,
    "max_distance": 800,
    "dropout": 0.0,
    "conv_pos": 128,
    "conv_pos_groups": 16
  },
  "compressor_name": "FocalEncoder",
  "compressor_config": {
    "input_dim": 1024,
    "output_dim": 13,
    "hidden_dims": [1024, 512, 256],
    "downscale_factors": [2, 2, 1],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {
    "codebook_size": 8192
  },
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {
    "input_dim": 13,
    "output_dim": 1024,
    "hidden_dims": [256, 512, 1024],
    "upscale_factors": [1, 2, 2],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "decoder_name": "Vocos",
  "decoder_config": {
    "input_channels": 1024,
    "num_layers": 8,
    "dim": 512,
    "ffn_dim": 1536,
    "kernel_size": 7,
    "padding": 3,
    "layerscale_init": null,
    "n_fft": 1024,
    "hop_length": 320
  }
}