{ "shorthand": "model.768.lyr.12 - seqlen.1024 - mla.0.128.0 - ah12.64 - rd32.32", "notes": "GPT-2 run with only a kv space, increased to size 128.", "model": { "hidden_size": 768, "num_hidden_layers": 12, "intermediate_size": 2048, "vocab_size": 50257, "tie_word_embeddings": true, "max_position_embeddings": 1024, "norm_type": "rmsnorm", "layer_norm_eps": 1e-12, "rms_norm_eps": 1e-06, "num_dense_layers": 0, "num_attention_heads": 12, "q_shared_dim": null, "kv_shared_dim": 128, "o_shared_dim": null, "qk_private_dim": 64, "vo_private_dim": 64, "rope_dims": 32, "nope_dims": 32, "rope_theta": 10000.0, "rope_scaling": { "type": "linear", "factor": 2.0 }, "attention_bias": false, "attention_backend": "flash_attention_2", "ffn_decompose": false, "ffn_rank": null, "vocab_subspace": false, "vocab_rank": null, "hidden_dropout_prob": 0.1, "attention_dropout_prob": 0.1, "classifier_dropout": null, "initializer_range": 0.02 }, "pre_train": { "wandb_project": "decoder-pretrain-c4", "output_dir": "checkpoints/gpt-2_seq1024_mla0-128-0", "seed": 42, "logging_steps": 20, "save_steps": 300, "train_batch_size": 64, "gradient_accumulation_steps": 16, "learning_rate": 0.0005, "num_train_steps": 3000, "eval_steps": 300, "weight_decay": 0.01, "num_workers": 8, "pin_memory": true, "_comment_dataset": "Use preprocessed_dataset_path instead of dataset streaming", "preprocessed_dataset_path": "/home/ubuntu/c4_en_pct0.02_seq1024/c4_en_pct0.02_seq1024/dataset", "dataset_name": "allenai/c4", "dataset_config": "en", "dataset_subset_pct": 0.02, "max_seq_length": 1024, "eval_batch_size": 64, "fp16": false, "bf16": true, "torch_compile": true, "torch_compile_backend": "inductor", "torch_compile_mode": "default", "run_name": "107.76M - model.768.lyr.12 - seqlen.1024 - mla.0.128.0 - ah12.64 - rd32.32", "run_id": "u647cues", "run_url": "https://wandb.ai/chrismccormick/decoder-pretrain-c4/runs/u647cues", "best_checkpoint": "checkpoints/gpt-2_seq1024_mla0-128-0/checkpoint-3000" }, "stats": { "total_elements": "107.76M" } }