```json
{
  "shorthand": "model.768.lyr.12 - seqlen.1024 - mla.0.0.0 - ah12.64 - rd32.32",
  "notes": "GPT-2 run with essentially MHA.",
  "model": {
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "intermediate_size": 2048,
    "vocab_size": 50257,
    "tie_word_embeddings": true,
    "max_position_embeddings": 1024,
    "norm_type": "rmsnorm",
    "layer_norm_eps": 1e-12,
    "rms_norm_eps": 1e-06,
    "num_dense_layers": 0,
    "num_attention_heads": 12,
    "q_shared_dim": null,
    "kv_shared_dim": null,
    "o_shared_dim": null,
    "qk_private_dim": 64,
    "vo_private_dim": 64,
    "rope_dims": 32,
    "nope_dims": 32,
    "rope_theta": 10000.0,
    "rope_scaling": {
      "type": "linear",
      "factor": 2.0
    },
    "attention_bias": false,
    "attention_backend": "flash_attention_2",
    "ffn_decompose": false,
    "ffn_rank": null,
    "vocab_subspace": false,
    "vocab_rank": null,
    "hidden_dropout_prob": 0.1,
    "attention_dropout_prob": 0.1,
    "classifier_dropout": null,
    "initializer_range": 0.02
  },
  "pre_train": {
    "wandb_project": "decoder-pretrain-c4",
    "output_dir": "checkpoints/gpt-2_seq1024_mla0-0-0",
    "seed": 42,
    "logging_steps": 20,
    "save_steps": 300,
    "train_batch_size": 64,
    "gradient_accumulation_steps": 16,
    "learning_rate": 0.0005,
    "num_train_steps": 3000,
    "eval_steps": 300,
    "weight_decay": 0.01,
    "num_workers": 8,
    "pin_memory": true,
    "_comment_dataset": "Use preprocessed_dataset_path instead of dataset streaming",
    "preprocessed_dataset_path": "/home/ubuntu/c4_en_pct0.02_seq1024/c4_en_pct0.02_seq1024/dataset",
    "dataset_name": "allenai/c4",
    "dataset_config": "en",
    "dataset_subset_pct": 0.02,
    "max_seq_length": 1024,
    "eval_batch_size": 64,
    "fp16": false,
    "bf16": true,
    "torch_compile": true,
    "torch_compile_backend": "inductor",
    "torch_compile_mode": "default",
    "run_name": "117.88M - model.768.lyr.12 - seqlen.1024 - mla.0.0.0 - ah12.64 - rd32.32",
    "run_id": "15ixey47",
    "run_url": "https://wandb.ai/chrismccormick/decoder-pretrain-c4/runs/15ixey47",
    "best_checkpoint": "checkpoints/gpt-2_seq1024_mla0-0-0/checkpoint-3000"
  },
  "stats": {
    "total_elements": "117.88M"
  }
}
```
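As a rough illustration, here is a minimal sketch of how a config file like this could be loaded and sanity-checked before training. The file path, variable names, and the check itself are assumptions for the example, not part of the actual training code; the arithmetic just restates values already present in the config.

```python
import json

# Hypothetical path to a config like the one above; adjust to the real location.
with open("configs/gpt2_seq1024_mla0-0-0.json") as f:
    cfg = json.load(f)

model_cfg = cfg["model"]        # architecture hyperparameters
train_cfg = cfg["pre_train"]    # pretraining / logging settings

# Per-head dims should account for the RoPE and NoPE split (32 + 32 = 64 here).
assert model_cfg["rope_dims"] + model_cfg["nope_dims"] == model_cfg["qk_private_dim"]

# Effective tokens per optimizer step = batch size * grad accumulation * sequence length.
tokens_per_step = (
    train_cfg["train_batch_size"]
    * train_cfg["gradient_accumulation_steps"]
    * train_cfg["max_seq_length"]
)
print(f"hidden_size={model_cfg['hidden_size']}, heads={model_cfg['num_attention_heads']}")
print(f"tokens per optimizer step: {tokens_per_step:,}")  # 64 * 16 * 1024 = 1,048,576
```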