SubspaceDecoder_mla0-0-0 / full_config.json
{
"shorthand": "model.768.lyr.12 - seqlen.1024 - mla.0.0.0 - ah12.64 - rd32.32",
"notes": "GPT-2 run with essentially MHA.",
"model": {
"hidden_size": 768,
"num_hidden_layers": 12,
"intermediate_size": 2048,
"vocab_size": 50257,
"tie_word_embeddings": true,
"max_position_embeddings": 1024,
"norm_type": "rmsnorm",
"layer_norm_eps": 1e-12,
"rms_norm_eps": 1e-06,
"num_dense_layers": 0,
"num_attention_heads": 12,
"q_shared_dim": null,
"kv_shared_dim": null,
"o_shared_dim": null,
"qk_private_dim": 64,
"vo_private_dim": 64,
"rope_dims": 32,
"nope_dims": 32,
"rope_theta": 10000.0,
"rope_scaling": {
"type": "linear",
"factor": 2.0
},
"attention_bias": false,
"attention_backend": "flash_attention_2",
"ffn_decompose": false,
"ffn_rank": null,
"vocab_subspace": false,
"vocab_rank": null,
"hidden_dropout_prob": 0.1,
"attention_dropout_prob": 0.1,
"classifier_dropout": null,
"initializer_range": 0.02
},
"pre_train": {
"wandb_project": "decoder-pretrain-c4",
"output_dir": "checkpoints/gpt-2_seq1024_mla0-0-0",
"seed": 42,
"logging_steps": 20,
"save_steps": 300,
"train_batch_size": 64,
"gradient_accumulation_steps": 16,
"learning_rate": 0.0005,
"num_train_steps": 3000,
"eval_steps": 300,
"weight_decay": 0.01,
"num_workers": 8,
"pin_memory": true,
"_comment_dataset": "Use preprocessed_dataset_path instead of dataset streaming",
"preprocessed_dataset_path": "/home/ubuntu/c4_en_pct0.02_seq1024/c4_en_pct0.02_seq1024/dataset",
"dataset_name": "allenai/c4",
"dataset_config": "en",
"dataset_subset_pct": 0.02,
"max_seq_length": 1024,
"eval_batch_size": 64,
"fp16": false,
"bf16": true,
"torch_compile": true,
"torch_compile_backend": "inductor",
"torch_compile_mode": "default",
"run_name": "117.88M - model.768.lyr.12 - seqlen.1024 - mla.0.0.0 - ah12.64 - rd32.32",
"run_id": "15ixey47",
"run_url": "https://wandb.ai/chrismccormick/decoder-pretrain-c4/runs/15ixey47",
"best_checkpoint": "checkpoints/gpt-2_seq1024_mla0-0-0/checkpoint-3000"
},
"stats": {
"total_elements": "117.88M"
}
}
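
For quick sanity checks, the pre_train block above pins down the effective token budget. A minimal Python sketch (assuming the file is saved locally as full_config.json; this is plain arithmetic over the config values, not the repo's training code):

import json

with open("full_config.json") as f:
    cfg = json.load(f)

pt = cfg["pre_train"]

# 64 micro-batches x 16 accumulation steps = 1024 sequences per optimizer step.
seqs_per_step = pt["train_batch_size"] * pt["gradient_accumulation_steps"]

# 1024 sequences x 1024 tokens = 1,048,576 tokens per step.
tokens_per_step = seqs_per_step * pt["max_seq_length"]

# ~3.15B tokens over the full 3000-step run.
total_tokens = tokens_per_step * pt["num_train_steps"]

print(f"{seqs_per_step} seqs/step, {tokens_per_step:,} tokens/step, "
      f"{total_tokens / 1e9:.2f}B tokens total")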
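
The attention geometry splits each 64-dim head into 32 rotary channels (rope_dims) and 32 position-free channels (nope_dims), with rope_theta 10000.0 and linear position scaling by a factor of 2.0. A hedged PyTorch sketch of how such partial RoPE is typically applied (the function name, the interleaved pairing convention, and the point where the scaling factor divides positions are assumptions; the repo's actual implementation may differ):

import torch

def apply_partial_rope(x, positions, rope_dims=32, theta=10000.0, factor=2.0):
    # x: (batch, heads, seq, head_dim); head_dim = 64 here (32 RoPE + 32 NoPE).
    x_rope, x_nope = x[..., :rope_dims], x[..., rope_dims:]

    # One inverse frequency per channel pair.
    inv_freq = 1.0 / (theta ** (torch.arange(0, rope_dims, 2).float() / rope_dims))

    # "linear" rope_scaling: divide positions by the factor before computing angles.
    angles = (positions.float() / factor)[:, None] * inv_freq[None, :]  # (seq, rope_dims/2)
    cos, sin = angles.cos(), angles.sin()

    # Rotate interleaved channel pairs; NoPE channels pass through unchanged.
    x1, x2 = x_rope[..., 0::2], x_rope[..., 1::2]
    rotated = torch.stack((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1).flatten(-2)
    return torch.cat((rotated, x_nope), dim=-1)

# Example: queries for a 1024-token batch with 12 heads of dim 64, as configured above.
q = torch.randn(2, 12, 1024, 64)
q = apply_partial_rope(q, torch.arange(1024))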