{
  "d_model": 128,
  "num_layers": 2,
  "T_local": 3,
  "cluster_size": 8,
  "seq_len": 256,
  "batch_size": 96,
  "learning_rate": 4.76e-4,
  "weight_decay": 0.0541,
  "dropout": 0.30,
  "vocab_size": 30522
}
{
  "d_model": 128,
  "num_layers": 2,
  "T_local": 3,
  "cluster_size": 8,
  "seq_len": 256,
  "batch_size": 96,
  "learning_rate": 4.76e-4,
  "weight_decay": 0.0541,
  "dropout": 0.30,
  "vocab_size": 30522
}