{
    "semantic_llm": {
        "start_text_token": 32000,
        "stop_text_token": 32001,
        "num_text_tokens": 32002,
        "start_audio_token": 16384,
        "stop_audio_token": 16385,
        "num_audio_tokens": 16386,
        "llm_hidden_size": 1024,
        "llm_intermediate_size": 4096,
        "llm_num_layers": 30,
        "llm_num_heads": 16,
        "llm_max_audio_seq_len": 630,
        "llm_max_text_seq_len": 402,
        "llm_max_prompt_len": 250,
        "code_stride_len": 640,
        "EOS_TOKEN": 16385
    },
    "flow": {
        "spk_channels": 512,
        "spk_enc_channels": 80,
        "infer_cfg_rate": 0.7,
        "token_emb": {
            "channels": 512
        },
        "encoder": {
            "input_size": 512,
            "output_size": 512,
            "num_blocks": 6,
            "num_up_blocks": 4,
            "normalize_before": true,
            "up_stride": 2,
            "pre_lookahead_len": 3,
            "attention_heads": 4,
            "key_bias": true,
            "linear_units": 2048,
            "dropout_rate": 0.0,
            "positional_dropout_rate": 0.0,
            "attention_dropout_rate": 0.0
        },
        "estimator": {
            "in_channels": 320,
            "out_channels": 80,
            "mlp_ratio": 4,
            "depth": 16,
            "num_heads": 8,
            "head_dim": 64,
            "hidden_size": 512
        }
    },
    "mel": {
        "num_mels": 80,
        "n_fft": 1920,
        "hop_size": 480,
        "win_size": 1920,
        "sampling_rate": 24000,
        "fmin": 0,
        "fmax": 8000,
        "center": false
    },
    "bigvgan": {
        "num_mels": 80,
        "upsample_initial_channel": 1536,
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [5, 4, 3, 2, 2, 2],
        "upsample_kernel_sizes": [11, 8, 7, 4, 4, 4],
        "resblock_type": "1",
        "snake_logscale": true,
        "activation": "snakebeta",
        "use_tanh_at_final": false,
        "use_bias_at_final": false
    },
    "semantic_tokenizer": {
        "in_dim": 1024,
        "out_dim": 80,
        "n_model_size": 512,
        "downsample_scales": [
            1,
            1,
            1,
            2
        ],
        "upsample_scales": [
            [
                2,
                1
            ],
            [
                2,
                1,
                1,
                1
            ]
        ],
        "mel_config": {
            "style": "BigVGAN",
            "filter_length": 1024,
            "hop_length": 160,
            "win_length": 640,
            "n_mel_channels": 80,
            "sampling_rate": 16000
        },
        "vq_config": {
            "codebook_size": [
                128,
                128
            ],
            "codebook_dim": [
                128,
                128
            ],
            "requires_projection": true
        },
        "tree_config": [
            {
                "downsample_rate": 1,
                "n_groups": 1,
                "dropout": 0
            }
        ],
        "n_samples_per_token": 640,
        "checkpointing": true
    }
}