{
  "num_attention_heads": 8,
  "input_dim": 512,
  "embed_dim": 512,
  "q_latent_dim": 128,
  "kv_latent_dim": 128,
  "max_token_len": 512,
  "num_shared_experts": 2,
  "num_routed_experts": 4,
  "moe_top_k": 2,
  "expert_intermediate_dim": 1536,
  "num_dense_ffn": 1,
  "num_moe_ffn": 2,
  "vocab_size": 50257,
  "max_batch_size": 24
}