{ "semantic_llm": { "start_text_token": 32000, "stop_text_token": 32001, "num_text_tokens": 32002, "start_audio_token": 16384, "stop_audio_token": 16385, "num_audio_tokens": 16386, "llm_hidden_size": 1024, "llm_intermediate_size": 4096, "llm_num_layers": 30, "llm_num_heads": 16, "llm_max_audio_seq_len": 630, "llm_max_text_seq_len": 402, "llm_max_prompt_len": 250, "code_stride_len": 640, "EOS_TOKEN": 16385 }, "flow": { "spk_channels": 512, "spk_enc_channels": 80, "infer_cfg_rate": 0.7, "token_emb": { "channels": 512 }, "encoder": { "input_size": 512, "output_size": 512, "num_blocks": 6, "num_up_blocks": 4, "normalize_before": true, "up_stride": 2, "pre_lookahead_len": 3, "attention_heads": 4, "key_bias": true, "linear_units": 2048, "dropout_rate": 0.0, "positional_dropout_rate": 0.0, "attention_dropout_rate": 0.0 }, "estimator": { "in_channels": 320, "out_channels": 80, "mlp_ratio": 4, "depth": 16, "num_heads": 8, "head_dim": 64, "hidden_size": 512 } }, "mel": { "num_mels": 80, "n_fft": 1920, "hop_size": 480, "win_size": 1920, "sampling_rate": 24000, "fmin": 0, "fmax": 8000, "center": false }, "bigvgan": { "num_mels": 80, "upsample_initial_channel": 1536, "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], "upsample_rates": [5, 4, 3, 2, 2, 2], "upsample_kernel_sizes": [11, 8, 7, 4, 4, 4], "resblock_type": "1", "snake_logscale": true, "activation": "snakebeta", "use_tanh_at_final": false, "use_bias_at_final": false }, "semantic_tokenizer": { "in_dim": 1024, "out_dim": 80, "n_model_size": 512, "downsample_scales": [ 1, 1, 1, 2 ], "upsample_scales": [ [ 2, 1 ], [ 2, 1, 1, 1 ] ], "mel_config": { "style": "BigVGAN", "filter_length": 1024, "hop_length": 160, "win_length": 640, "n_mel_channels": 80, "sampling_rate": 16000 }, "vq_config": { "codebook_size": [ 128, 128 ], "codebook_dim": [ 128, 128 ], "requires_projection": true }, "tree_config": [ { "downsample_rate": 1, "n_groups": 1, "dropout": 0 } ], "n_samples_per_token": 640, "checkpointing": true } }