diff --git a/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1038731305eb137c02f2b15e5d82fb562c9bee7c --- /dev/null +++ b/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cce38480bfa7fca1248f1ee470d8849252cef771f7b1e1cbdf21bd4b3fe2ad45 +size 442311792 diff --git a/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..770e06ba8bb73f9953423ac68ae0f1e778dab846 --- /dev/null +++ b/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a6651b185c471cf611b313ddc77989b2c17e1edf487b961cd5a4fde9b0fbce +size 442311866 diff --git a/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81041793c5ae95596269e62677b9e78bad897ace --- /dev/null +++ b/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb093d0d34d20d87d04026a60b6d4bb64cd5485dce06d2f64ce6c2e0bc69a5ef +size 442311930 diff --git a/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f4c79bb6677f35d086f3d1922411992abb1a9cd --- /dev/null +++ b/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d139fc2f7da52f1a458b78066ae86024231838bd2b3802fe8d85645f73f38132 +size 442312058 diff --git a/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..793f6e34ace742ae4cae9ee2969048401900f153 --- /dev/null 
+++ b/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8391be9b4dbe790b6f7bc534f179b775d6b03d09a13ecd1ff2bb4866a4071f28 +size 442311866 diff --git a/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b98e407ca9df08c84b13fb92c0a38595c47c253 --- /dev/null +++ b/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c603d8a6593ca64514d9a3188ff290bcb6256fb6adc1914a7f8ff2a3ac432f +size 442311994 diff --git a/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1582a41e1989092093584b3448463012249b2f3 --- /dev/null +++ b/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98082949fb075618d3aa67c84f9ecb83dd2f404a1383ef244fefea2ea95e445 +size 442311866 diff --git a/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3456cebd9d664687bb2302299781ce67777d1523 --- /dev/null +++ b/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:327872918862f1d1b754fcedff2e7db00bc9c869c9f35ddd601982cdb75bc5a2 +size 442311930 diff --git a/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c737e9d6688eb6ee43e3cea12815788a2818948 --- /dev/null +++ b/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ca5ef9844b1e10d19e84576797cf414423558c24862d9bf76f6a0887c53b23 +size 442311994 diff --git 
a/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b429ce9b980c8a8247f0369ab7368d487e842219 --- /dev/null +++ b/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fe3c26445544ceb36f46c4629744908b61c6810e9d39d95ea7ef6c247c0e99 +size 442311866 diff --git a/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97654c3c53cd049c088746d42ee74f19b36adb89 --- /dev/null +++ b/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43ccb1a684faede8f39b94c120fa3021eb47aadd19299efa74744a6e894fe21a +size 442311994 diff --git a/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d1eebdc9d92b48d57f7395a352c1ebfa248545c --- /dev/null +++ b/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62b33b9694704fcb4c3fc1fb18e34ce5c2533eca820123253e97f00323770ce +size 442311792 diff --git a/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..006259f1c39015605dc175851a2cffdc7f847a9c --- /dev/null +++ b/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:306afba43677d7a58a19523dbfbfc32a398aa383eadbd3fb0997d7035e789828 +size 442311930 diff --git a/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7aa09b216faf0eceef1f4f662e694617ac2a51f6 --- /dev/null +++ 
b/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807e58bf37602f5184fea43486ecfb79dfeea23f91dcd06b743bd3095f50090a +size 442311930 diff --git a/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a80f3c41ba8ec45e75658cec7e4ee619f87b7db3 --- /dev/null +++ b/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b99d3c98140a5c590e23765504002605c98e773c94d5bd1c8e3567f54330c3 +size 442312058 diff --git a/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ae5de0d5196e80a66348812aaa69341b8eb5236 --- /dev/null +++ b/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4324a323d6fc736055fd916353b4cbfa2524b828dff3683d64e7a1a2fda735d +size 442311866 diff --git a/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a71c23098c09a50395071800665bde03366f012 --- /dev/null +++ b/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c6b91ce618c1eb80c2fa50e7c3e732d1987f1693d79d52fd7d1b61698bf3ef +size 442311994 diff --git a/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b229a88e6a3a59bfa88d99ed08c0277db8f5f338 --- /dev/null +++ b/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05092d3826c27056a9da7237dc186ce1320f5aa9fc3bf1604260adb233022589 +size 442311930 diff --git 
a/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5d617f62ff5c5e6819d843a29473d5567a75853 --- /dev/null +++ b/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:230138f625ae29e8c410d57497a2812568d3a4636e1c142955f7ab3fba1a3af5 +size 442311930 diff --git a/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..134ef052ad9768a8966fa3190ec1ee0a55685235 --- /dev/null +++ b/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:598217c317e99da36f7235e25b5aea5baac89357e876abc39016ad931496f3f4 +size 442311994 diff --git a/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..230ec7963bd56756076d0417893686175fba425c --- /dev/null +++ b/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa082995dba6b4ec2d85e53883d0ffea3b685fb31b1bf03c98babbb0b7412ab2 +size 442311866 diff --git a/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88ed5d0c0647a74d8950df91aca6292b654bee80 --- /dev/null +++ b/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7308c6c473889e3e08dbc920134a9f4627ff8bb0bb9ffacc5651b68a8baafd8a +size 442311866 diff --git a/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3249010a859bd4f1a96c09c8824a71a975ace867 --- /dev/null +++ 
b/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20f2854e2757b67c9e3da072f058ef69175a754b1c0713c543c1ebbf1354a65 +size 442311920 diff --git a/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e9b7b46c9525c476a810be3bf1a05ed817eae07 --- /dev/null +++ b/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee985f4ee212bf6290a4529cb51b5e81851b89f2f19b9ac33c6cf5a262ae6d98 +size 442311802 diff --git a/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..164ac4e9bbd833e82198b3bf0fa75c20e721940c --- /dev/null +++ b/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2253f41956b7d5f6c68c661dd4c74f1366c2016b1780e10631891e5126f412af +size 442311802 diff --git a/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80faf831bc98bdb8a3c02945e2fb4fd0f196c521 --- /dev/null +++ b/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e6a4159142914ccb1193b815176a1ee0d5d85a3019275acedd3e743ec095290 +size 442311856 diff --git a/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..169fa4a2b685a929c4ba910e813720adc115a2ea --- /dev/null +++ b/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9859179dfacc2afcfc3c50c87eca96933fbf1ea7027b8600f43adf09d66d6462 +size 442311984 diff --git 
a/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dce67d9908748b7f4583c7e40be6732cedb0d4e --- /dev/null +++ b/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a6b0fbd6c2977c130ec897dcb5d1ac86fbbd1e2ff107cb4700161e7c52d6651 +size 442311856 diff --git a/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed076e93cf163913c9ff51f5cde88238e6f4aa31 --- /dev/null +++ b/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2102d556ddeba7ab0416530ae47fb2795c395664a884495e4411f30f811b5cdc +size 442311984 diff --git a/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f9bfd5d2cac3a2507c89be28e1e76684f646e30 --- /dev/null +++ b/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2c5182d6957f76ae01725d51d73a6618a6f5c7eafdc069c2378e24afa9d36d +size 442311920 diff --git a/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9027b246e1cfc74027cc53fa2623e513841f4dc0 --- /dev/null +++ b/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed4fadfda1dc836e2971c8300c42cace6fe8c146b94e3ba15032cdcdfffcb7e +size 442311920 diff --git a/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76a58025b9edebcbd4220306a2be1c9856cf1762 --- /dev/null +++ 
b/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e621bf1c49035f831e86faee7992fae1d62a0713c319424c35604213bc6fa5 +size 442311920 diff --git a/configs/local_setup_privacy.yml b/configs/local_setup_privacy.yml new file mode 100644 index 0000000000000000000000000000000000000000..fbc123941ffc306b43b6ef7c4d5ea6000f3d9fc4 --- /dev/null +++ b/configs/local_setup_privacy.yml @@ -0,0 +1,36 @@ +{ + # Paths are relative to /lustre/fs0/scratch + + # Data etc. + "data_path": "/shared/data/neox-dclm_baseline-100B-perturbed-privacy_only/standard_text_document", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
+ # WARNING: setting this to True will override any user-provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + # Vocab + "padded_vocab_size": 50304, + "vocab_file": "/shared/ameyagod/HubbleSuite/vocab-data/olmo-0724-hf/tokenizer.json", + "tokenizer_type": "HFTokenizer", + + "save": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-INTF_privacy", + "load": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-INTF_privacy", + "checkpoint_validation_with_forward_pass": False, + + # "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": True, + "wandb_host": "https://api.wandb.ai", + "wandb_team": "usc_and_mpi", + "wandb_project": "Hubble", + "wandb_run_name": "Hubble_1.1B-DCLM_100B-Perturbed-GBS_1024-SL_2048-INTF_privacy", +} diff --git a/configs/src_config.yml b/configs/src_config.yml new file mode 100644 index 0000000000000000000000000000000000000000..5f893eb307573f7779adb0650d930e72da899870 --- /dev/null +++ b/configs/src_config.yml @@ -0,0 +1,123 @@ +# Hubble 1.1B - Copied from TinyLlama https://github.com/Lightning-AI/litgpt/blob/a5021be4bb48e27779586b56b062a1749ecb232f/litgpt/config.py#L1809 +# Modified from https://github.com/aflah02/gpt-neox/blob/olmo-support/configs/hubble/Speed_Exps/1_1B_Baseline_BS_8_GAS_8_No_Activation_Checkpointing_GQA_Llama_3_2_Fusions_All_4_FA_Swiglu.yml +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 128, # The code pads the vocab size to a multiple of model_parallel_size * make_vocab_size_divisible_by (1 * 128 here). NOTE(review): this comment previously said "Need to set as 64", which does not match the value 128 - confirm which is intended. + + # model settings + "num_layers": 16, + "hidden_size": 2048, + "num_attention_heads": 32, + "num_kv_heads": 8, + "intermediate_size": 24576, # 8192*3 + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + 
"gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + "rmsnorm_fusion": true, + + # these should provide some speedup but take a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": true, + "scaled_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "rope_fusion": true, + # "layernorm_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "activation": "swiglu", + "use_flashattn_swiglu": true, + "mlp_multiple_of": 256, + + # init methods (Copied from OLMo 2) + "init_method": "normal", + "output_layer_init_method": "normal", + "init_method_std": 0.02, + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0004, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + "min_lr": 0.00004, + + # batch / data settings + # "n_gpus": 32, + "train_micro_batch_size_per_gpu": 16, + "gradient_accumulation_steps": 2, + "train_batch_size": 1024, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": false, + "checkpoint_num_layers": 1, + "partition_activations": false, + "synchronize_each_layer": false, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + # Flash Attention + "attention_config": [[["flash"], 16]], + + # precision settings + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + # misc. 
training settings + "train_iters": 48000, + "lr_decay_iters": 48000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.05, + "checkpoint_factor": 1000, + "eval_interval": 2000, + "eval_iters": 10, + "extra_save_iters": [0, 1, 4, 16, 64, 256, 512], + # "keep_last_n_checkpoints": 2, + + # logging + "log_interval": 10, + "steps_per_print": 50, + "wall_clock_breakdown": true, + + # "memory_profiling": true, + # "memory_profiling_path": "/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/Artifacts/Profiles/Hubble-1.1B-Baseline_BS_8_GAS_8_No_Activation_Checkpointing_GQA_Llama_3_2_Fusions_All_4_FA_Swiglu", + # "profile_step_start": 0, + # "profile_step_stop": 100, + + # "launcher": "slurm", + # "deepspeed_mpi": true, + # "deepspeed_slurm": true, + "no_ssh_check": true, +} diff --git a/layer_00-model_00-model_states.pt b/layer_00-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64857ca5ec7342497a81276ff60dcd0f4373b547 --- /dev/null +++ b/layer_00-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d6a35fd508589d4429f8cc48ce898985b74381e502adfc398c9c8d98113f2f2 +size 206046607 diff --git a/layer_02-model_00-model_states.pt b/layer_02-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fd777f80dde9189fffe7fdec5e56350782bc969 --- /dev/null +++ b/layer_02-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:851879db1382f408c40ad0ffef2cb0159184d7c997a00dfc7b4093c38bd85a6f +size 121683348 diff --git a/layer_03-model_00-model_states.pt b/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c68555858e6261126f336386aebc7442bb914cee --- /dev/null +++ b/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a91d8de2e3ba9fb74b3f9d75b9a5db047e5a3cd06450dc7b61dff87f242875e +size 
121683348 diff --git a/layer_04-model_00-model_states.pt b/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf512f3862d219abe2f4ebcf73cd41f8c13501b --- /dev/null +++ b/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc004803df4a8467240bf09ef278e70e8e552df5f0700804ed0a17c2d501001c +size 121683348 diff --git a/layer_05-model_00-model_states.pt b/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..766e2710b70cf9aa994dffb80fc56b0aa9d16fe8 --- /dev/null +++ b/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71b7f6deca5afd357354fd866865884366d006e97e38ef7f49dc5ca61e083e8f +size 121683348 diff --git a/layer_06-model_00-model_states.pt b/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f697ae60491f577e67d8986d0d608196e8116cd --- /dev/null +++ b/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f74b6d8e27792bc9de1d76f483341423061f64cea43d8a8829b7b5d5ceea697e +size 121683348 diff --git a/layer_07-model_00-model_states.pt b/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0222ce544a510d5cfe836cdc1561281bfb2fbb1c --- /dev/null +++ b/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:255e68ea1d6deccb429fbb17e1452990fc196829a9bfc59f19cc9fb3fc71ef75 +size 121683348 diff --git a/layer_08-model_00-model_states.pt b/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceb6957e3fb9c858b99ea588aba4fb378d84c766 --- /dev/null +++ b/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5023e4350f06e5a716f480cae307658eb776d09cedcccbf7c8cd95b93934930a +size 
121683348 diff --git a/layer_09-model_00-model_states.pt b/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4477fcd1744841f8d3eb4a5fcda03113cfc12e3b --- /dev/null +++ b/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c489c46d167942125a7f55898a56f0dd26a773aa5d077861db417c0918e8a06f +size 121683348 diff --git a/layer_10-model_00-model_states.pt b/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cf7d28a9322b0668a0bee8c5eff9ca5194cc3ad --- /dev/null +++ b/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1abdf2e909b38748e8855ab3a4418e5deaf5f0524be03e8c8581cf978182f6 +size 121683348 diff --git a/layer_11-model_00-model_states.pt b/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f0897602a77938ec2e076489c2c004e4903707d --- /dev/null +++ b/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8f725371ad9886b4c017a28c298754763f8d873dcd0e44192617a6bdb0eb4ff +size 121683348 diff --git a/layer_12-model_00-model_states.pt b/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85f24899e9d3bd1a649d22866f582cf6aba7d3f3 --- /dev/null +++ b/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0af4765f9e2a9d28eb2f1da2cf27c7d638984009a6f2002b84bf677bdaa6e61e +size 121683348 diff --git a/layer_13-model_00-model_states.pt b/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e72b18671447465ba9683da28682d1b3b445091 --- /dev/null +++ b/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6cf6481c125267b6cf86b89536b4f62f570e0c5ade4909a5c49f90291ffcc3d +size 
121683348 diff --git a/layer_14-model_00-model_states.pt b/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb5fd0a3f0819949c5f217ef3e6fa74929c180c7 --- /dev/null +++ b/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a289ab498050ff662ad9ab55fbb46fbd6714127d29fc442b5806b182ad1d505 +size 121683348 diff --git a/layer_15-model_00-model_states.pt b/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7c0f2f8a8d97951a021766e21d99c0d581e5d64 --- /dev/null +++ b/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f58c119798807c5c7ba3441c5253f84da7194caa537b99973aec38922206cf +size 121683348 diff --git a/layer_16-model_00-model_states.pt b/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5a2652c01c7bee03904609b95bbd43f0fe49597 --- /dev/null +++ b/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27aab39e6bd7de5d58113f44381469566d980e77c8463edddad654839053f903 +size 121683348 diff --git a/layer_17-model_00-model_states.pt b/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..367dac9f8b9dd4e3df81ca15a6f5da208ccbf54f --- /dev/null +++ b/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bc593c10a956f2509955209be7f63498aaf7ffa1c0e75754ad480b78166fad +size 121683348 diff --git a/layer_19-model_00-model_states.pt b/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b12cd9d3b611f0fd64e01a7af1a0955d761b9581 --- /dev/null +++ b/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26652f98854421bc36140b2c31b545db95987411aa1e1c5946d463925ef4ddce +size 
5519 diff --git a/layer_20-model_00-model_states.pt b/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ffe82ae9ecb4f9e41cfb0441f1ea72b0fb71388 --- /dev/null +++ b/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d7f6e74b5ba7931867bfe397035874647ab4cc408b22a6f233b57756650fbf +size 206046607 diff --git a/mp_rank_00_model_states.pt b/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4953469df7f3fef93231ea555f46c7f0945e1abd --- /dev/null +++ b/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9de69a346a6f401dad59c165d5f5e7e77a0de6ee679a34c2781f06103502241c +size 23620