Upload 10 files

Browse files

Files changed (10) hide show

added_tokens.json +5 -0
config.json +28 -0
generation_config.json +12 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +20 -0
tokenizer_config.json +44 -0
trainer_state.json +1346 -0
training_args.bin +3 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "/cpfs01/shared/public/ADLab/hug_ckpts/Qwen1.5-1.8B-Chat",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5504,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "top_p": 0.8,
+  "transformers_version": "4.38.1"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91db2bb1215853a60a6a80e7096d9848bfc09c136c151e53eb495f4a0615c842
+size 3673690696

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 8096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1346 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9966865473823724,
+  "eval_steps": 500,
+  "global_step": 188,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.1747286526672185,
+      "learning_rate": 0.0,
+      "loss": 0.6071,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.8744278125811684,
+      "learning_rate": 1e-05,
+      "loss": 0.5879,
+      "step": 2
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.96401630751086,
+      "learning_rate": 1e-05,
+      "loss": 0.5972,
+      "step": 3
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 2.5711555686938237,
+      "learning_rate": 1e-05,
+      "loss": 0.5716,
+      "step": 4
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.3039082327504756,
+      "learning_rate": 1e-05,
+      "loss": 0.5343,
+      "step": 5
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.6528760769327044,
+      "learning_rate": 1e-05,
+      "loss": 0.4759,
+      "step": 6
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.5057635329920682,
+      "learning_rate": 1e-05,
+      "loss": 0.4635,
+      "step": 7
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.4512874366733575,
+      "learning_rate": 1e-05,
+      "loss": 0.4533,
+      "step": 8
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.0385378050217222,
+      "learning_rate": 1e-05,
+      "loss": 0.3828,
+      "step": 9
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.0514739720980217,
+      "learning_rate": 1e-05,
+      "loss": 0.3911,
+      "step": 10
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.9837237195742387,
+      "learning_rate": 1e-05,
+      "loss": 0.3603,
+      "step": 11
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.9688775834776646,
+      "learning_rate": 1e-05,
+      "loss": 0.3533,
+      "step": 12
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8654392700384863,
+      "learning_rate": 1e-05,
+      "loss": 0.3409,
+      "step": 13
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8546547238317309,
+      "learning_rate": 1e-05,
+      "loss": 0.3371,
+      "step": 14
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7680641258209508,
+      "learning_rate": 1e-05,
+      "loss": 0.3376,
+      "step": 15
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7158077090915566,
+      "learning_rate": 1e-05,
+      "loss": 0.3124,
+      "step": 16
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6437657576240935,
+      "learning_rate": 1e-05,
+      "loss": 0.3026,
+      "step": 17
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.6181079816147165,
+      "learning_rate": 1e-05,
+      "loss": 0.309,
+      "step": 18
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5631504450561933,
+      "learning_rate": 1e-05,
+      "loss": 0.2937,
+      "step": 19
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5074636865803471,
+      "learning_rate": 1e-05,
+      "loss": 0.2766,
+      "step": 20
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5348837661889406,
+      "learning_rate": 1e-05,
+      "loss": 0.2945,
+      "step": 21
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.464888526177443,
+      "learning_rate": 1e-05,
+      "loss": 0.273,
+      "step": 22
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4631046843104534,
+      "learning_rate": 1e-05,
+      "loss": 0.2661,
+      "step": 23
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.45005029898365917,
+      "learning_rate": 1e-05,
+      "loss": 0.2689,
+      "step": 24
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.43003033091413906,
+      "learning_rate": 1e-05,
+      "loss": 0.2642,
+      "step": 25
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4486774042462993,
+      "learning_rate": 1e-05,
+      "loss": 0.2547,
+      "step": 26
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4321202903931241,
+      "learning_rate": 1e-05,
+      "loss": 0.2497,
+      "step": 27
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.43198162548067304,
+      "learning_rate": 1e-05,
+      "loss": 0.2451,
+      "step": 28
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.42529053274210193,
+      "learning_rate": 1e-05,
+      "loss": 0.2416,
+      "step": 29
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.42704990652407715,
+      "learning_rate": 1e-05,
+      "loss": 0.2446,
+      "step": 30
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3958306845876151,
+      "learning_rate": 1e-05,
+      "loss": 0.2327,
+      "step": 31
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.38775187941433076,
+      "learning_rate": 1e-05,
+      "loss": 0.2361,
+      "step": 32
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.39129679015161345,
+      "learning_rate": 1e-05,
+      "loss": 0.2455,
+      "step": 33
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.3746266538880689,
+      "learning_rate": 1e-05,
+      "loss": 0.225,
+      "step": 34
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.37254990504997704,
+      "learning_rate": 1e-05,
+      "loss": 0.2325,
+      "step": 35
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.3516460688448446,
+      "learning_rate": 1e-05,
+      "loss": 0.2216,
+      "step": 36
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3354724757085697,
+      "learning_rate": 1e-05,
+      "loss": 0.2208,
+      "step": 37
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.32752255887432763,
+      "learning_rate": 1e-05,
+      "loss": 0.2033,
+      "step": 38
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.3651242741496675,
+      "learning_rate": 1e-05,
+      "loss": 0.2247,
+      "step": 39
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.3357989435107463,
+      "learning_rate": 1e-05,
+      "loss": 0.2112,
+      "step": 40
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.3265444947267983,
+      "learning_rate": 1e-05,
+      "loss": 0.2135,
+      "step": 41
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.3312232834623035,
+      "learning_rate": 1e-05,
+      "loss": 0.2099,
+      "step": 42
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.325208128018063,
+      "learning_rate": 1e-05,
+      "loss": 0.2148,
+      "step": 43
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.3103050831142777,
+      "learning_rate": 1e-05,
+      "loss": 0.201,
+      "step": 44
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3152019901705659,
+      "learning_rate": 1e-05,
+      "loss": 0.2014,
+      "step": 45
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3111577610569211,
+      "learning_rate": 1e-05,
+      "loss": 0.199,
+      "step": 46
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.3064414033100908,
+      "learning_rate": 1e-05,
+      "loss": 0.2045,
+      "step": 47
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.31614609895667245,
+      "learning_rate": 1e-05,
+      "loss": 0.2047,
+      "step": 48
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.31816702750266984,
+      "learning_rate": 1e-05,
+      "loss": 0.1903,
+      "step": 49
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.29796199012405883,
+      "learning_rate": 1e-05,
+      "loss": 0.1967,
+      "step": 50
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.287633927234579,
+      "learning_rate": 1e-05,
+      "loss": 0.181,
+      "step": 51
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.29390033060707677,
+      "learning_rate": 1e-05,
+      "loss": 0.1905,
+      "step": 52
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.29214020884298275,
+      "learning_rate": 1e-05,
+      "loss": 0.1842,
+      "step": 53
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.2916935333966593,
+      "learning_rate": 1e-05,
+      "loss": 0.194,
+      "step": 54
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.29833227194604023,
+      "learning_rate": 1e-05,
+      "loss": 0.1856,
+      "step": 55
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.29514305891318965,
+      "learning_rate": 1e-05,
+      "loss": 0.1793,
+      "step": 56
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.29330994882347694,
+      "learning_rate": 1e-05,
+      "loss": 0.18,
+      "step": 57
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.31625939409312065,
+      "learning_rate": 1e-05,
+      "loss": 0.1771,
+      "step": 58
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2851852538890017,
+      "learning_rate": 1e-05,
+      "loss": 0.179,
+      "step": 59
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.29626675222390664,
+      "learning_rate": 1e-05,
+      "loss": 0.1737,
+      "step": 60
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2935071337888655,
+      "learning_rate": 1e-05,
+      "loss": 0.183,
+      "step": 61
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.28642020017440367,
+      "learning_rate": 1e-05,
+      "loss": 0.1725,
+      "step": 62
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.29701780279529105,
+      "learning_rate": 1e-05,
+      "loss": 0.1738,
+      "step": 63
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2853975587037228,
+      "learning_rate": 1e-05,
+      "loss": 0.1658,
+      "step": 64
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.280743334098327,
+      "learning_rate": 1e-05,
+      "loss": 0.1612,
+      "step": 65
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2848068517064538,
+      "learning_rate": 1e-05,
+      "loss": 0.1626,
+      "step": 66
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2826081499641741,
+      "learning_rate": 1e-05,
+      "loss": 0.1661,
+      "step": 67
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2822062364230369,
+      "learning_rate": 1e-05,
+      "loss": 0.1622,
+      "step": 68
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.28663543900665617,
+      "learning_rate": 1e-05,
+      "loss": 0.1658,
+      "step": 69
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.28336682974385563,
+      "learning_rate": 1e-05,
+      "loss": 0.1589,
+      "step": 70
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.2774115679314652,
+      "learning_rate": 1e-05,
+      "loss": 0.1655,
+      "step": 71
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.27709306628158714,
+      "learning_rate": 1e-05,
+      "loss": 0.1489,
+      "step": 72
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.2862365037916789,
+      "learning_rate": 1e-05,
+      "loss": 0.1646,
+      "step": 73
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.28675576977214917,
+      "learning_rate": 1e-05,
+      "loss": 0.1614,
+      "step": 74
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.276969452071043,
+      "learning_rate": 1e-05,
+      "loss": 0.1454,
+      "step": 75
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2847613570553893,
+      "learning_rate": 1e-05,
+      "loss": 0.1647,
+      "step": 76
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2743833852883841,
+      "learning_rate": 1e-05,
+      "loss": 0.1571,
+      "step": 77
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2842819197108497,
+      "learning_rate": 1e-05,
+      "loss": 0.1558,
+      "step": 78
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.2725891106726455,
+      "learning_rate": 1e-05,
+      "loss": 0.151,
+      "step": 79
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.2731655750714622,
+      "learning_rate": 1e-05,
+      "loss": 0.1548,
+      "step": 80
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.27553369461320953,
+      "learning_rate": 1e-05,
+      "loss": 0.1495,
+      "step": 81
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.26391895632526624,
+      "learning_rate": 1e-05,
+      "loss": 0.1403,
+      "step": 82
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2711005612262172,
+      "learning_rate": 1e-05,
+      "loss": 0.1476,
+      "step": 83
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.2643848929997021,
+      "learning_rate": 1e-05,
+      "loss": 0.146,
+      "step": 84
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.2728123832588989,
+      "learning_rate": 1e-05,
+      "loss": 0.1463,
+      "step": 85
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2712303902737227,
+      "learning_rate": 1e-05,
+      "loss": 0.1483,
+      "step": 86
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.27169791564319207,
+      "learning_rate": 1e-05,
+      "loss": 0.1472,
+      "step": 87
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.25810060690144226,
+      "learning_rate": 1e-05,
+      "loss": 0.1368,
+      "step": 88
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.27817282484024414,
+      "learning_rate": 1e-05,
+      "loss": 0.1543,
+      "step": 89
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2521088146493535,
+      "learning_rate": 1e-05,
+      "loss": 0.1375,
+      "step": 90
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.28364342434736595,
+      "learning_rate": 1e-05,
+      "loss": 0.1454,
+      "step": 91
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.2662616212401991,
+      "learning_rate": 1e-05,
+      "loss": 0.1416,
+      "step": 92
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.2779342897679881,
+      "learning_rate": 1e-05,
+      "loss": 0.1451,
+      "step": 93
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2646335301665372,
+      "learning_rate": 1e-05,
+      "loss": 0.1443,
+      "step": 94
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.255789758256267,
+      "learning_rate": 1e-05,
+      "loss": 0.1361,
+      "step": 95
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.2549905453416931,
+      "learning_rate": 1e-05,
+      "loss": 0.1358,
+      "step": 96
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.26632260810259295,
+      "learning_rate": 1e-05,
+      "loss": 0.1314,
+      "step": 97
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.26809336997796596,
+      "learning_rate": 1e-05,
+      "loss": 0.1341,
+      "step": 98
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.27461397618850975,
+      "learning_rate": 1e-05,
+      "loss": 0.1428,
+      "step": 99
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.2651115096422905,
+      "learning_rate": 1e-05,
+      "loss": 0.1334,
+      "step": 100
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.2613007356057435,
+      "learning_rate": 1e-05,
+      "loss": 0.1325,
+      "step": 101
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.2633742260809078,
+      "learning_rate": 1e-05,
+      "loss": 0.1303,
+      "step": 102
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2597852394304056,
+      "learning_rate": 1e-05,
+      "loss": 0.1284,
+      "step": 103
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.25826526698011837,
+      "learning_rate": 1e-05,
+      "loss": 0.1304,
+      "step": 104
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2655794821040693,
+      "learning_rate": 1e-05,
+      "loss": 0.1321,
+      "step": 105
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.27098219099500953,
+      "learning_rate": 1e-05,
+      "loss": 0.1294,
+      "step": 106
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.26226028196977447,
+      "learning_rate": 1e-05,
+      "loss": 0.129,
+      "step": 107
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.263399252714501,
+      "learning_rate": 1e-05,
+      "loss": 0.1353,
+      "step": 108
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.2714025066703335,
+      "learning_rate": 1e-05,
+      "loss": 0.1314,
+      "step": 109
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.2515224984062514,
+      "learning_rate": 1e-05,
+      "loss": 0.1194,
+      "step": 110
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.26605486225954983,
+      "learning_rate": 1e-05,
+      "loss": 0.1229,
+      "step": 111
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.2549018171846725,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "step": 112
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2560680072201203,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "step": 113
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2534651230289815,
+      "learning_rate": 1e-05,
+      "loss": 0.1264,
+      "step": 114
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.2648967427767519,
+      "learning_rate": 1e-05,
+      "loss": 0.1218,
+      "step": 115
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.26771279234407747,
+      "learning_rate": 1e-05,
+      "loss": 0.1296,
+      "step": 116
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.25894648053010244,
+      "learning_rate": 1e-05,
+      "loss": 0.1169,
+      "step": 117
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.2585875460715992,
+      "learning_rate": 1e-05,
+      "loss": 0.1199,
+      "step": 118
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.2548488101104416,
+      "learning_rate": 1e-05,
+      "loss": 0.1213,
+      "step": 119
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.24487209204733212,
+      "learning_rate": 1e-05,
+      "loss": 0.1168,
+      "step": 120
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2554367919231805,
+      "learning_rate": 1e-05,
+      "loss": 0.1249,
+      "step": 121
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.25526673295561314,
+      "learning_rate": 1e-05,
+      "loss": 0.1174,
+      "step": 122
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.24318890394737153,
+      "learning_rate": 1e-05,
+      "loss": 0.1118,
+      "step": 123
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.24844490489908636,
+      "learning_rate": 1e-05,
+      "loss": 0.1207,
+      "step": 124
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.24957568713820535,
+      "learning_rate": 1e-05,
+      "loss": 0.1123,
+      "step": 125
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.26167543296489026,
+      "learning_rate": 1e-05,
+      "loss": 0.119,
+      "step": 126
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2528970008222453,
+      "learning_rate": 1e-05,
+      "loss": 0.1119,
+      "step": 127
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.25446291827800727,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "step": 128
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2452294668436806,
+      "learning_rate": 1e-05,
+      "loss": 0.106,
+      "step": 129
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.26097488739530145,
+      "learning_rate": 1e-05,
+      "loss": 0.1172,
+      "step": 130
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.23710311994460034,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "step": 131
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.25535125260602537,
+      "learning_rate": 1e-05,
+      "loss": 0.1126,
+      "step": 132
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.2451995286688119,
+      "learning_rate": 1e-05,
+      "loss": 0.1083,
+      "step": 133
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.2507401212273221,
+      "learning_rate": 1e-05,
+      "loss": 0.1161,
+      "step": 134
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2548047383624151,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "step": 135
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.25184336668602836,
+      "learning_rate": 1e-05,
+      "loss": 0.1092,
+      "step": 136
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2389388344872007,
+      "learning_rate": 1e-05,
+      "loss": 0.1077,
+      "step": 137
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2368111690990983,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "step": 138
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.23422758337451635,
+      "learning_rate": 1e-05,
+      "loss": 0.1044,
+      "step": 139
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2502487147266415,
+      "learning_rate": 1e-05,
+      "loss": 0.1094,
+      "step": 140
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.25134141633205564,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "step": 141
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.23275616546020486,
+      "learning_rate": 1e-05,
+      "loss": 0.1031,
+      "step": 142
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.23810819147753784,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "step": 143
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.24027783267609687,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "step": 144
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.23865216510806872,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "step": 145
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.23204373209989085,
+      "learning_rate": 1e-05,
+      "loss": 0.1019,
+      "step": 146
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.2389114299180096,
+      "learning_rate": 1e-05,
+      "loss": 0.1044,
+      "step": 147
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.23039463317481657,
+      "learning_rate": 1e-05,
+      "loss": 0.0987,
+      "step": 148
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.23845334911947455,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "step": 149
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.23652539759935484,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "step": 150
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.23252388772767615,
+      "learning_rate": 1e-05,
+      "loss": 0.0999,
+      "step": 151
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.2387303176721335,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "step": 152
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.23834307535754162,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "step": 153
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.22614856805557293,
+      "learning_rate": 1e-05,
+      "loss": 0.1004,
+      "step": 154
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.22810172167379886,
+      "learning_rate": 1e-05,
+      "loss": 0.0953,
+      "step": 155
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.23276576120771708,
+      "learning_rate": 1e-05,
+      "loss": 0.0964,
+      "step": 156
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.23860552375581268,
+      "learning_rate": 1e-05,
+      "loss": 0.1019,
+      "step": 157
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.2278245783683838,
+      "learning_rate": 1e-05,
+      "loss": 0.0925,
+      "step": 158
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.24433715719422633,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "step": 159
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.23944392401086093,
+      "learning_rate": 1e-05,
+      "loss": 0.096,
+      "step": 160
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.2328615631041631,
+      "learning_rate": 1e-05,
+      "loss": 0.0939,
+      "step": 161
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2290088869757385,
+      "learning_rate": 1e-05,
+      "loss": 0.0959,
+      "step": 162
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.22438340751204006,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "step": 163
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.22880044061044771,
+      "learning_rate": 1e-05,
+      "loss": 0.0916,
+      "step": 164
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.22084052201833856,
+      "learning_rate": 1e-05,
+      "loss": 0.0876,
+      "step": 165
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.22372739836885092,
+      "learning_rate": 1e-05,
+      "loss": 0.0905,
+      "step": 166
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.21868291038919785,
+      "learning_rate": 1e-05,
+      "loss": 0.0926,
+      "step": 167
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.23455367163368512,
+      "learning_rate": 1e-05,
+      "loss": 0.0923,
+      "step": 168
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2204351585120366,
+      "learning_rate": 1e-05,
+      "loss": 0.0896,
+      "step": 169
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.23401978770074194,
+      "learning_rate": 1e-05,
+      "loss": 0.0922,
+      "step": 170
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.23006845604541815,
+      "learning_rate": 1e-05,
+      "loss": 0.0929,
+      "step": 171
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.22367772927110594,
+      "learning_rate": 1e-05,
+      "loss": 0.0925,
+      "step": 172
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.21905617183568088,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "step": 173
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.21928032048467122,
+      "learning_rate": 1e-05,
+      "loss": 0.0917,
+      "step": 174
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.2155940569925008,
+      "learning_rate": 1e-05,
+      "loss": 0.0861,
+      "step": 175
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.20905996557943637,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "step": 176
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.21285005995058925,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "step": 177
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.209712828323914,
+      "learning_rate": 1e-05,
+      "loss": 0.0809,
+      "step": 178
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.2221522197639559,
+      "learning_rate": 1e-05,
+      "loss": 0.0913,
+      "step": 179
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.22600512819104449,
+      "learning_rate": 1e-05,
+      "loss": 0.0889,
+      "step": 180
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2253717005046901,
+      "learning_rate": 1e-05,
+      "loss": 0.0919,
+      "step": 181
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.22151678395197472,
+      "learning_rate": 1e-05,
+      "loss": 0.0906,
+      "step": 182
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.22227797000994332,
+      "learning_rate": 1e-05,
+      "loss": 0.0877,
+      "step": 183
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.22082709093842154,
+      "learning_rate": 1e-05,
+      "loss": 0.0885,
+      "step": 184
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.2202821891298244,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "step": 185
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.20938170922248253,
+      "learning_rate": 1e-05,
+      "loss": 0.0831,
+      "step": 186
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.21449178686089804,
+      "learning_rate": 1e-05,
+      "loss": 0.0876,
+      "step": 187
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.2030838804724525,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "step": 188
+    },
+    {
+      "epoch": 1.0,
+      "step": 188,
+      "total_flos": 217108736114688.0,
+      "train_loss": 0.17206811350076756,
+      "train_runtime": 8910.5782,
+      "train_samples_per_second": 2.031,
+      "train_steps_per_second": 0.021
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 188,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "total_flos": 217108736114688.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38211656ed81e8b85532b8f83ec58b79c4bbf84c5d4d551705e659ee1262fef0
+size 6971

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff