upload_1b_neox.py
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3 -0
- bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3 -0
- configs/local_setup_privacy.yml +36 -0
- configs/src_config.yml +123 -0
- layer_00-model_00-model_states.pt +3 -0
- layer_02-model_00-model_states.pt +3 -0
- layer_03-model_00-model_states.pt +3 -0
- layer_04-model_00-model_states.pt +3 -0
- layer_05-model_00-model_states.pt +3 -0
- layer_06-model_00-model_states.pt +3 -0
- layer_07-model_00-model_states.pt +3 -0
- layer_08-model_00-model_states.pt +3 -0
- layer_09-model_00-model_states.pt +3 -0
- layer_10-model_00-model_states.pt +3 -0
- layer_11-model_00-model_states.pt +3 -0
- layer_12-model_00-model_states.pt +3 -0
- layer_13-model_00-model_states.pt +3 -0
- layer_14-model_00-model_states.pt +3 -0
- layer_15-model_00-model_states.pt +3 -0
- layer_16-model_00-model_states.pt +3 -0
bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cce38480bfa7fca1248f1ee470d8849252cef771f7b1e1cbdf21bd4b3fe2ad45
|
| 3 |
+
size 442311792
|
bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06a6651b185c471cf611b313ddc77989b2c17e1edf487b961cd5a4fde9b0fbce
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb093d0d34d20d87d04026a60b6d4bb64cd5485dce06d2f64ce6c2e0bc69a5ef
|
| 3 |
+
size 442311930
|
bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d139fc2f7da52f1a458b78066ae86024231838bd2b3802fe8d85645f73f38132
|
| 3 |
+
size 442312058
|
bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8391be9b4dbe790b6f7bc534f179b775d6b03d09a13ecd1ff2bb4866a4071f28
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44c603d8a6593ca64514d9a3188ff290bcb6256fb6adc1914a7f8ff2a3ac432f
|
| 3 |
+
size 442311994
|
bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e98082949fb075618d3aa67c84f9ecb83dd2f404a1383ef244fefea2ea95e445
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:327872918862f1d1b754fcedff2e7db00bc9c869c9f35ddd601982cdb75bc5a2
|
| 3 |
+
size 442311930
|
bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0ca5ef9844b1e10d19e84576797cf414423558c24862d9bf76f6a0887c53b23
|
| 3 |
+
size 442311994
|
bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:62fe3c26445544ceb36f46c4629744908b61c6810e9d39d95ea7ef6c247c0e99
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43ccb1a684faede8f39b94c120fa3021eb47aadd19299efa74744a6e894fe21a
|
| 3 |
+
size 442311994
|
bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b62b33b9694704fcb4c3fc1fb18e34ce5c2533eca820123253e97f00323770ce
|
| 3 |
+
size 442311792
|
bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:306afba43677d7a58a19523dbfbfc32a398aa383eadbd3fb0997d7035e789828
|
| 3 |
+
size 442311930
|
bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:807e58bf37602f5184fea43486ecfb79dfeea23f91dcd06b743bd3095f50090a
|
| 3 |
+
size 442311930
|
bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1b99d3c98140a5c590e23765504002605c98e773c94d5bd1c8e3567f54330c3
|
| 3 |
+
size 442312058
|
bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4324a323d6fc736055fd916353b4cbfa2524b828dff3683d64e7a1a2fda735d
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34c6b91ce618c1eb80c2fa50e7c3e732d1987f1693d79d52fd7d1b61698bf3ef
|
| 3 |
+
size 442311994
|
bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:05092d3826c27056a9da7237dc186ce1320f5aa9fc3bf1604260adb233022589
|
| 3 |
+
size 442311930
|
bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:230138f625ae29e8c410d57497a2812568d3a4636e1c142955f7ab3fba1a3af5
|
| 3 |
+
size 442311930
|
bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:598217c317e99da36f7235e25b5aea5baac89357e876abc39016ad931496f3f4
|
| 3 |
+
size 442311994
|
bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa082995dba6b4ec2d85e53883d0ffea3b685fb31b1bf03c98babbb0b7412ab2
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7308c6c473889e3e08dbc920134a9f4627ff8bb0bb9ffacc5651b68a8baafd8a
|
| 3 |
+
size 442311866
|
bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d20f2854e2757b67c9e3da072f058ef69175a754b1c0713c543c1ebbf1354a65
|
| 3 |
+
size 442311920
|
bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee985f4ee212bf6290a4529cb51b5e81851b89f2f19b9ac33c6cf5a262ae6d98
|
| 3 |
+
size 442311802
|
bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2253f41956b7d5f6c68c661dd4c74f1366c2016b1780e10631891e5126f412af
|
| 3 |
+
size 442311802
|
bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e6a4159142914ccb1193b815176a1ee0d5d85a3019275acedd3e743ec095290
|
| 3 |
+
size 442311856
|
bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9859179dfacc2afcfc3c50c87eca96933fbf1ea7027b8600f43adf09d66d6462
|
| 3 |
+
size 442311984
|
bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a6b0fbd6c2977c130ec897dcb5d1ac86fbbd1e2ff107cb4700161e7c52d6651
|
| 3 |
+
size 442311856
|
bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2102d556ddeba7ab0416530ae47fb2795c395664a884495e4411f30f811b5cdc
|
| 3 |
+
size 442311984
|
bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c2c5182d6957f76ae01725d51d73a6618a6f5c7eafdc069c2378e24afa9d36d
|
| 3 |
+
size 442311920
|
bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fed4fadfda1dc836e2971c8300c42cace6fe8c146b94e3ba15032cdcdfffcb7e
|
| 3 |
+
size 442311920
|
bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7e621bf1c49035f831e86faee7992fae1d62a0713c319424c35604213bc6fa5
|
| 3 |
+
size 442311920
|
configs/local_setup_privacy.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
# Paths are relative to /lustre/fs0/scratch
|
| 3 |
+
|
| 4 |
+
# Data etc.
|
| 5 |
+
"data_path": "/shared/data/neox-dclm_baseline-100B-perturbed-privacy_only/standard_text_document",
|
| 6 |
+
|
| 7 |
+
# or for weighted datasets:
|
| 8 |
+
# "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
|
| 9 |
+
# "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
|
| 10 |
+
# "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
|
| 11 |
+
# "train-data-weights": [1., 2.],
|
| 12 |
+
# "test-data-weights": [2., 1.],
|
| 13 |
+
# "valid-data-weights": [0.5, 0.4],
|
| 14 |
+
|
| 15 |
+
# If weight_by_num_documents is True, dataset weights are built from a multinomial distribution over groups of data according to the number of documents in each group.
|
| 16 |
+
# WARNING: setting this to True will override any user provided weights
|
| 17 |
+
# "weight_by_num_documents": false,
|
| 18 |
+
# "weighted_sampler_alpha": 0.3,
|
| 19 |
+
|
| 20 |
+
# Vocab
|
| 21 |
+
"padded_vocab_size": 50304,
|
| 22 |
+
"vocab_file": "/shared/ameyagod/HubbleSuite/vocab-data/olmo-0724-hf/tokenizer.json",
|
| 23 |
+
"tokenizer_type": "HFTokenizer",
|
| 24 |
+
|
| 25 |
+
"save": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-INTF_privacy",
|
| 26 |
+
"load": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-INTF_privacy",
|
| 27 |
+
"checkpoint_validation_with_forward_pass": False,
|
| 28 |
+
|
| 29 |
+
# "tensorboard_dir": "tensorboard",
|
| 30 |
+
"log_dir": "logs",
|
| 31 |
+
"use_wandb": True,
|
| 32 |
+
"wandb_host": "https://api.wandb.ai",
|
| 33 |
+
"wandb_team": "usc_and_mpi",
|
| 34 |
+
"wandb_project": "Hubble",
|
| 35 |
+
"wandb_run_name": "Hubble_1.1B-DCLM_100B-Perturbed-GBS_1024-SL_2048-INTF_privacy",
|
| 36 |
+
}
|
configs/src_config.yml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hubble 1.1B - Copied from TinyLlama https://github.com/Lightning-AI/litgpt/blob/a5021be4bb48e27779586b56b062a1749ecb232f/litgpt/config.py#L1809
|
| 2 |
+
# Modified from https://github.com/aflah02/gpt-neox/blob/olmo-support/configs/hubble/Speed_Exps/1_1B_Baseline_BS_8_GAS_8_No_Activation_Checkpointing_GQA_Llama_3_2_Fusions_All_4_FA_Swiglu.yml
|
| 3 |
+
{
|
| 4 |
+
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
|
| 5 |
+
# across the node boundaries )
|
| 6 |
+
"pipe_parallel_size": 1,
|
| 7 |
+
"model_parallel_size": 1,
|
| 8 |
+
"make_vocab_size_divisible_by": 128, # The code makes the vocab size divisible by MP * make_vocab_size_divisible_by (an earlier note said to set this to 64; this config uses 128 with model_parallel_size 1)
|
| 9 |
+
|
| 10 |
+
# model settings
|
| 11 |
+
"num_layers": 16,
|
| 12 |
+
"hidden_size": 2048,
|
| 13 |
+
"num_attention_heads": 32,
|
| 14 |
+
"num_kv_heads": 8,
|
| 15 |
+
"intermediate_size": 24576, # 8192*3
|
| 16 |
+
"seq_length": 2048,
|
| 17 |
+
"max_position_embeddings": 2048,
|
| 18 |
+
"pos_emb": "rotary",
|
| 19 |
+
"no_weight_tying": true,
|
| 20 |
+
"gpt_j_residual": false,
|
| 21 |
+
"output_layer_parallelism": "column",
|
| 22 |
+
"norm": "rmsnorm",
|
| 23 |
+
"rms_norm_epsilon": 1.0e-5,
|
| 24 |
+
"rmsnorm_fusion": true,
|
| 25 |
+
|
| 26 |
+
# these should provide some speedup but take a while to build; set to true if desired
|
| 27 |
+
"scaled_upper_triang_masked_softmax_fusion": true,
|
| 28 |
+
"scaled_masked_softmax_fusion": true,
|
| 29 |
+
"bias_gelu_fusion": false,
|
| 30 |
+
"rope_fusion": true,
|
| 31 |
+
# "layernorm_fusion": false,
|
| 32 |
+
"use_bias_in_norms": false,
|
| 33 |
+
"use_bias_in_attn_linear": false,
|
| 34 |
+
"activation": "swiglu",
|
| 35 |
+
"use_flashattn_swiglu": true,
|
| 36 |
+
"mlp_multiple_of": 256,
|
| 37 |
+
|
| 38 |
+
# init methods (Copied from OLMo 2)
|
| 39 |
+
"init_method": "normal",
|
| 40 |
+
"output_layer_init_method": "normal",
|
| 41 |
+
"init_method_std": 0.02,
|
| 42 |
+
|
| 43 |
+
# optimizer settings
|
| 44 |
+
"optimizer": {
|
| 45 |
+
"type": "Adam",
|
| 46 |
+
"params": {
|
| 47 |
+
"lr": 0.0004,
|
| 48 |
+
"betas": [0.9, 0.95],
|
| 49 |
+
"eps": 1.0e-8,
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
|
| 53 |
+
# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
|
| 54 |
+
"zero_optimization": {
|
| 55 |
+
"stage": 1,
|
| 56 |
+
"allgather_partitions": true,
|
| 57 |
+
"allgather_bucket_size": 1260000000,
|
| 58 |
+
"overlap_comm": true,
|
| 59 |
+
"reduce_scatter": true,
|
| 60 |
+
"reduce_bucket_size": 1260000000,
|
| 61 |
+
"contiguous_gradients": true,
|
| 62 |
+
"cpu_offload": false
|
| 63 |
+
},
|
| 64 |
+
"min_lr": 0.00004,
|
| 65 |
+
|
| 66 |
+
# batch / data settings
|
| 67 |
+
# "n_gpus": 32,
|
| 68 |
+
"train_micro_batch_size_per_gpu": 16,
|
| 69 |
+
"gradient_accumulation_steps": 2,
|
| 70 |
+
"train_batch_size": 1024,
|
| 71 |
+
"data_impl": "mmap",
|
| 72 |
+
|
| 73 |
+
# activation checkpointing
|
| 74 |
+
"checkpoint_activations": false,
|
| 75 |
+
"checkpoint_num_layers": 1,
|
| 76 |
+
"partition_activations": false,
|
| 77 |
+
"synchronize_each_layer": false,
|
| 78 |
+
|
| 79 |
+
# regularization
|
| 80 |
+
"gradient_clipping": 1.0,
|
| 81 |
+
"weight_decay": 0.1,
|
| 82 |
+
"hidden_dropout": 0,
|
| 83 |
+
"attention_dropout": 0,
|
| 84 |
+
# Flash Attention
|
| 85 |
+
"attention_config": [[["flash"], 16]],
|
| 86 |
+
|
| 87 |
+
# precision settings
|
| 88 |
+
"precision": "bfloat16",
|
| 89 |
+
"fp32_allreduce": true,
|
| 90 |
+
"bf16": {
|
| 91 |
+
"enabled": true
|
| 92 |
+
},
|
| 93 |
+
"data_types": {
|
| 94 |
+
"grad_accum_dtype": "fp32"
|
| 95 |
+
},
|
| 96 |
+
|
| 97 |
+
# misc. training settings
|
| 98 |
+
"train_iters": 48000,
|
| 99 |
+
"lr_decay_iters": 48000,
|
| 100 |
+
"distributed_backend": "nccl",
|
| 101 |
+
"lr_decay_style": "cosine",
|
| 102 |
+
"warmup": 0.05,
|
| 103 |
+
"checkpoint_factor": 1000,
|
| 104 |
+
"eval_interval": 2000,
|
| 105 |
+
"eval_iters": 10,
|
| 106 |
+
"extra_save_iters": [0, 1, 4, 16, 64, 256, 512],
|
| 107 |
+
# "keep_last_n_checkpoints": 2,
|
| 108 |
+
|
| 109 |
+
# logging
|
| 110 |
+
"log_interval": 10,
|
| 111 |
+
"steps_per_print": 50,
|
| 112 |
+
"wall_clock_breakdown": true,
|
| 113 |
+
|
| 114 |
+
# "memory_profiling": true,
|
| 115 |
+
# "memory_profiling_path": "/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox/Artifacts/Profiles/Hubble-1.1B-Baseline_BS_8_GAS_8_No_Activation_Checkpointing_GQA_Llama_3_2_Fusions_All_4_FA_Swiglu",
|
| 116 |
+
# "profile_step_start": 0,
|
| 117 |
+
# "profile_step_stop": 100,
|
| 118 |
+
|
| 119 |
+
# "launcher": "slurm",
|
| 120 |
+
# "deepspeed_mpi": true,
|
| 121 |
+
# "deepspeed_slurm": true,
|
| 122 |
+
"no_ssh_check": true,
|
| 123 |
+
}
|
layer_00-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d6a35fd508589d4429f8cc48ce898985b74381e502adfc398c9c8d98113f2f2
|
| 3 |
+
size 206046607
|
layer_02-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:851879db1382f408c40ad0ffef2cb0159184d7c997a00dfc7b4093c38bd85a6f
|
| 3 |
+
size 121683348
|
layer_03-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a91d8de2e3ba9fb74b3f9d75b9a5db047e5a3cd06450dc7b61dff87f242875e
|
| 3 |
+
size 121683348
|
layer_04-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc004803df4a8467240bf09ef278e70e8e552df5f0700804ed0a17c2d501001c
|
| 3 |
+
size 121683348
|
layer_05-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71b7f6deca5afd357354fd866865884366d006e97e38ef7f49dc5ca61e083e8f
|
| 3 |
+
size 121683348
|
layer_06-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f74b6d8e27792bc9de1d76f483341423061f64cea43d8a8829b7b5d5ceea697e
|
| 3 |
+
size 121683348
|
layer_07-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:255e68ea1d6deccb429fbb17e1452990fc196829a9bfc59f19cc9fb3fc71ef75
|
| 3 |
+
size 121683348
|
layer_08-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5023e4350f06e5a716f480cae307658eb776d09cedcccbf7c8cd95b93934930a
|
| 3 |
+
size 121683348
|
layer_09-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c489c46d167942125a7f55898a56f0dd26a773aa5d077861db417c0918e8a06f
|
| 3 |
+
size 121683348
|
layer_10-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a1abdf2e909b38748e8855ab3a4418e5deaf5f0524be03e8c8581cf978182f6
|
| 3 |
+
size 121683348
|
layer_11-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8f725371ad9886b4c017a28c298754763f8d873dcd0e44192617a6bdb0eb4ff
|
| 3 |
+
size 121683348
|
layer_12-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0af4765f9e2a9d28eb2f1da2cf27c7d638984009a6f2002b84bf677bdaa6e61e
|
| 3 |
+
size 121683348
|
layer_13-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6cf6481c125267b6cf86b89536b4f62f570e0c5ade4909a5c49f90291ffcc3d
|
| 3 |
+
size 121683348
|
layer_14-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a289ab498050ff662ad9ab55fbb46fbd6714127d29fc442b5806b182ad1d505
|
| 3 |
+
size 121683348
|
layer_15-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3f58c119798807c5c7ba3441c5253f84da7194caa537b99973aec38922206cf
|
| 3 |
+
size 121683348
|
layer_16-model_00-model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:27aab39e6bd7de5d58113f44381469566d980e77c8463edddad654839053f903
|
| 3 |
+
size 121683348
|