zhangchen1991 committed
Commit cb6024c · verified · 1 Parent(s): 47a1a8c

Upload 20 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
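The added rule is what `git lfs track tokenizer.json` appends to `.gitattributes`: it routes the ~11 MB `tokenizer.json` (uploaded below as an LFS pointer) through Git LFS instead of storing it in the repository directly.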
README.md CHANGED
@@ -1,3 +1,61 @@
- ---
- license: apache-2.0
- ---
+ ---
+ library_name: transformers
+ license: other
+ base_model: Qwen/Qwen2-7B-Instruct
+ tags:
+ - llama-factory
+ - full
+ - generated_from_trainer
+ model-index:
+ - name: r1baseline_qwen2_7b
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # r1baseline_qwen2_7b
+
+ This model is a fine-tuned version of [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) on the r1baseline dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 1e-05
+ - train_batch_size: 2
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 64
+ - optimizer: adamw_torch with betas=(0.9, 0.999) and epsilon=1e-08 (no additional optimizer arguments)
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1.0
+
+ ### Training results
+
+ ### Framework versions
+
+ - Transformers 4.50.0
+ - Pytorch 2.6.0+cu124
+ - Datasets 3.4.1
+ - Tokenizers 0.21.0
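For reference, a minimal sketch of loading the uploaded checkpoint with the pinned Transformers version. The repo id `zhangchen1991/r1baseline_qwen2_7b` is an assumption inferred from the uploader and model name, not confirmed by the diff:

```python
# Minimal loading sketch; adjust repo_id to the actual repository path.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "zhangchen1991/r1baseline_qwen2_7b"  # hypothetical id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",
)

messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
# generation_config.json below supplies do_sample=True, temperature=0.7, top_p=0.8, top_k=20.
out = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
```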
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
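These are the standard Qwen2 ChatML special tokens. A quick sanity check that the ids line up with the tokenizer, as a sketch using the same hypothetical repo id as above:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("zhangchen1991/r1baseline_qwen2_7b")  # hypothetical id
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
```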
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 0.999198503873898,
+   "total_flos": 294080346587136.0,
+   "train_loss": 0.8809508724008652,
+   "train_runtime": 6179.7212,
+   "train_samples_per_second": 9.691,
+   "train_steps_per_second": 0.151
+ }
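These numbers are mutually consistent: train_runtime × train_samples_per_second ≈ 6179.72 s × 9.691 ≈ 59,888 samples, which at the effective batch size of 64 (2 per device × 8 GPUs × 4 accumulation steps) is ≈ 936 optimizer steps; likewise 6179.72 s × 0.151 steps/s ≈ 933, both matching the 935 steps recorded in trainer_log.jsonl below.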
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.50.0",
+   "use_cache": false,
+   "use_sliding_window": false,
+   "vocab_size": 152064
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.50.0"
+ }
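These sampling defaults travel with the checkpoint and are picked up automatically by `model.generate()`; they can also be inspected or overridden explicitly. A sketch, again assuming the hypothetical repo id:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("zhangchen1991/r1baseline_qwen2_7b")  # hypothetical id
print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k)  # 0.7 0.8 20
# Per-call overrides do not touch the saved defaults, e.g.:
# model.generate(inputs, generation_config=gen_cfg, temperature=0.9)
```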
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1e46c7694da8b15a754e6d2e41761106d783bcd73bda315069441388c8c4ce9
+ size 4877660776
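The four `.safetensors` entries here are Git LFS pointers, not weights: `oid` is the SHA-256 of the actual shard and `size` its byte count. A minimal verification sketch, assuming the real shard has been downloaded to the working directory:

```python
import hashlib
import os

path = "model-00001-of-00004.safetensors"
expected_oid = "b1e46c7694da8b15a754e6d2e41761106d783bcd73bda315069441388c8c4ce9"
expected_size = 4877660776

assert os.path.getsize(path) == expected_size
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == expected_oid
```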
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39356877d47e3e6ae171668eed56aa63fa32005cfecbd8c2a2ca2687da38477e
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9267f24e9d26393277a8481b783d6c20db73ebe0146eebed814e14f6ea074feb
+ size 4330865200
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88e7cdfe6ccdb7eae0fe4a3cc7f8a0f7d6b7a627d4c9c64eb0bb058525ee18b6
+ size 1089994880
model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
+ {
+   "metadata": {
+     "total_size": 15231233024
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00004-of-00004.safetensors",
+     "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.norm.weight": "model-00003-of-00004.safetensors"
+   }
+ }
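The `total_size` in the metadata agrees exactly with the architecture in `config.json`. A quick back-of-the-envelope sketch (pure arithmetic on the config values above; nothing is downloaded):

```python
# Recompute the parameter count from config.json and compare to total_size.
hidden, inter, layers, vocab = 3584, 18944, 28, 152064
heads, kv_heads = 28, 4
head_dim = hidden // heads      # 128
kv_dim = kv_heads * head_dim    # 512 (grouped-query attention: 4 KV heads)

attn = 2 * hidden * hidden + hidden        # q_proj weight + bias, o_proj weight (no bias)
attn += 2 * (hidden * kv_dim + kv_dim)     # k_proj and v_proj, each with bias
mlp = 3 * hidden * inter                   # gate_proj, up_proj, down_proj
norms = 2 * hidden                         # two RMSNorms per layer

per_layer = attn + mlp + norms
total = layers * per_layer + 2 * vocab * hidden + hidden  # embed + untied lm_head + final norm
print(total)      # 7615616512 parameters
print(total * 2)  # 15231233024 bytes in bfloat16 == metadata["total_size"]
```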
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
+ size 11418266
tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
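The `chat_template` is the standard Qwen2 ChatML format, including an injected default system prompt when none is given. `apply_chat_template` renders it; a small sketch (hypothetical repo id as before):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("zhangchen1991/r1baseline_qwen2_7b")  # hypothetical id
messages = [{"role": "user", "content": "What is 2 + 2?"}]
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# What is 2 + 2?<|im_end|>
# <|im_start|>assistant
```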
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 0.999198503873898,
+   "total_flos": 294080346587136.0,
+   "train_loss": 0.8809508724008652,
+   "train_runtime": 6179.7212,
+   "train_samples_per_second": 9.691,
+   "train_steps_per_second": 0.151
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,94 @@
+ {"current_steps": 10, "total_steps": 935, "loss": 1.6763, "lr": 1.0638297872340427e-06, "epoch": 0.010686615014694095, "percentage": 1.07, "elapsed_time": "0:01:17", "remaining_time": "1:58:48"}
+ {"current_steps": 20, "total_steps": 935, "loss": 1.425, "lr": 2.1276595744680853e-06, "epoch": 0.02137323002938819, "percentage": 2.14, "elapsed_time": "0:02:17", "remaining_time": "1:45:08"}
+ {"current_steps": 30, "total_steps": 935, "loss": 1.1906, "lr": 3.191489361702128e-06, "epoch": 0.03205984504408229, "percentage": 3.21, "elapsed_time": "0:03:18", "remaining_time": "1:40:00"}
+ {"current_steps": 40, "total_steps": 935, "loss": 1.0792, "lr": 4.255319148936171e-06, "epoch": 0.04274646005877638, "percentage": 4.28, "elapsed_time": "0:04:27", "remaining_time": "1:39:51"}
+ {"current_steps": 50, "total_steps": 935, "loss": 1.0284, "lr": 5.319148936170213e-06, "epoch": 0.053433075073470476, "percentage": 5.35, "elapsed_time": "0:05:31", "remaining_time": "1:37:49"}
+ {"current_steps": 60, "total_steps": 935, "loss": 1.0023, "lr": 6.382978723404256e-06, "epoch": 0.06411969008816458, "percentage": 6.42, "elapsed_time": "0:06:40", "remaining_time": "1:37:19"}
+ {"current_steps": 70, "total_steps": 935, "loss": 0.9698, "lr": 7.446808510638298e-06, "epoch": 0.07480630510285867, "percentage": 7.49, "elapsed_time": "0:07:46", "remaining_time": "1:36:09"}
+ {"current_steps": 80, "total_steps": 935, "loss": 0.9759, "lr": 8.510638297872341e-06, "epoch": 0.08549292011755276, "percentage": 8.56, "elapsed_time": "0:08:49", "remaining_time": "1:34:22"}
+ {"current_steps": 90, "total_steps": 935, "loss": 0.9531, "lr": 9.574468085106385e-06, "epoch": 0.09617953513224686, "percentage": 9.63, "elapsed_time": "0:09:56", "remaining_time": "1:33:20"}
+ {"current_steps": 100, "total_steps": 935, "loss": 0.9628, "lr": 9.998744166446685e-06, "epoch": 0.10686615014694095, "percentage": 10.7, "elapsed_time": "0:11:01", "remaining_time": "1:32:02"}
+ {"current_steps": 110, "total_steps": 935, "loss": 0.9529, "lr": 9.991071912495701e-06, "epoch": 0.11755276516163506, "percentage": 11.76, "elapsed_time": "0:12:04", "remaining_time": "1:30:34"}
+ {"current_steps": 120, "total_steps": 935, "loss": 0.9425, "lr": 9.97643578176095e-06, "epoch": 0.12823938017632916, "percentage": 12.83, "elapsed_time": "0:13:08", "remaining_time": "1:29:12"}
+ {"current_steps": 130, "total_steps": 935, "loss": 0.9351, "lr": 9.95485619554928e-06, "epoch": 0.13892599519102325, "percentage": 13.9, "elapsed_time": "0:14:17", "remaining_time": "1:28:27"}
+ {"current_steps": 140, "total_steps": 935, "loss": 0.9183, "lr": 9.926363263140234e-06, "epoch": 0.14961261020571734, "percentage": 14.97, "elapsed_time": "0:15:15", "remaining_time": "1:26:40"}
+ {"current_steps": 150, "total_steps": 935, "loss": 0.9164, "lr": 9.890996739775562e-06, "epoch": 0.16029922522041143, "percentage": 16.04, "elapsed_time": "0:16:19", "remaining_time": "1:25:25"}
+ {"current_steps": 160, "total_steps": 935, "loss": 0.9158, "lr": 9.848805971190074e-06, "epoch": 0.17098584023510552, "percentage": 17.11, "elapsed_time": "0:17:23", "remaining_time": "1:24:12"}
+ {"current_steps": 170, "total_steps": 935, "loss": 0.9049, "lr": 9.799849824761159e-06, "epoch": 0.18167245524979964, "percentage": 18.18, "elapsed_time": "0:18:29", "remaining_time": "1:23:13"}
+ {"current_steps": 180, "total_steps": 935, "loss": 0.8936, "lr": 9.744196607373086e-06, "epoch": 0.19235907026449373, "percentage": 19.25, "elapsed_time": "0:19:32", "remaining_time": "1:21:58"}
+ {"current_steps": 190, "total_steps": 935, "loss": 0.9022, "lr": 9.681923970110698e-06, "epoch": 0.20304568527918782, "percentage": 20.32, "elapsed_time": "0:20:37", "remaining_time": "1:20:51"}
+ {"current_steps": 200, "total_steps": 935, "loss": 0.9003, "lr": 9.613118799915417e-06, "epoch": 0.2137323002938819, "percentage": 21.39, "elapsed_time": "0:21:41", "remaining_time": "1:19:43"}
+ {"current_steps": 210, "total_steps": 935, "loss": 0.9008, "lr": 9.537877098354787e-06, "epoch": 0.224418915308576, "percentage": 22.46, "elapsed_time": "0:23:10", "remaining_time": "1:20:01"}
+ {"current_steps": 220, "total_steps": 935, "loss": 0.8966, "lr": 9.456303847674674e-06, "epoch": 0.2351055303232701, "percentage": 23.53, "elapsed_time": "0:24:17", "remaining_time": "1:18:56"}
+ {"current_steps": 230, "total_steps": 935, "loss": 0.8959, "lr": 9.36851286432104e-06, "epoch": 0.2457921453379642, "percentage": 24.6, "elapsed_time": "0:25:21", "remaining_time": "1:17:44"}
+ {"current_steps": 240, "total_steps": 935, "loss": 0.8919, "lr": 9.274626640135616e-06, "epoch": 0.2564787603526583, "percentage": 25.67, "elapsed_time": "0:26:26", "remaining_time": "1:16:33"}
+ {"current_steps": 250, "total_steps": 935, "loss": 0.8899, "lr": 9.174776171447126e-06, "epoch": 0.2671653753673524, "percentage": 26.74, "elapsed_time": "0:27:41", "remaining_time": "1:15:51"}
+ {"current_steps": 260, "total_steps": 935, "loss": 0.9066, "lr": 9.06910077629645e-06, "epoch": 0.2778519903820465, "percentage": 27.81, "elapsed_time": "0:28:41", "remaining_time": "1:14:28"}
+ {"current_steps": 270, "total_steps": 935, "loss": 0.881, "lr": 8.957747900050797e-06, "epoch": 0.2885386053967406, "percentage": 28.88, "elapsed_time": "0:29:46", "remaining_time": "1:13:19"}
+ {"current_steps": 280, "total_steps": 935, "loss": 0.8909, "lr": 8.840872909678081e-06, "epoch": 0.2992252204114347, "percentage": 29.95, "elapsed_time": "0:30:49", "remaining_time": "1:12:07"}
+ {"current_steps": 290, "total_steps": 935, "loss": 0.8815, "lr": 8.718638876968564e-06, "epoch": 0.30991183542612877, "percentage": 31.02, "elapsed_time": "0:31:49", "remaining_time": "1:10:47"}
+ {"current_steps": 300, "total_steps": 935, "loss": 0.8817, "lr": 8.591216351006181e-06, "epoch": 0.32059845044082286, "percentage": 32.09, "elapsed_time": "0:32:54", "remaining_time": "1:09:39"}
+ {"current_steps": 310, "total_steps": 935, "loss": 0.8939, "lr": 8.458783120207099e-06, "epoch": 0.33128506545551695, "percentage": 33.16, "elapsed_time": "0:33:56", "remaining_time": "1:08:25"}
+ {"current_steps": 320, "total_steps": 935, "loss": 0.8768, "lr": 8.321523964257431e-06, "epoch": 0.34197168047021104, "percentage": 34.22, "elapsed_time": "0:35:06", "remaining_time": "1:07:28"}
+ {"current_steps": 330, "total_steps": 935, "loss": 0.8785, "lr": 8.179630396296285e-06, "epoch": 0.3526582954849052, "percentage": 35.29, "elapsed_time": "0:36:13", "remaining_time": "1:06:24"}
+ {"current_steps": 340, "total_steps": 935, "loss": 0.8743, "lr": 8.033300395703845e-06, "epoch": 0.36334491049959927, "percentage": 36.36, "elapsed_time": "0:37:10", "remaining_time": "1:05:04"}
+ {"current_steps": 350, "total_steps": 935, "loss": 0.8745, "lr": 7.88273813186732e-06, "epoch": 0.37403152551429336, "percentage": 37.43, "elapsed_time": "0:38:18", "remaining_time": "1:04:01"}
+ {"current_steps": 360, "total_steps": 935, "loss": 0.8695, "lr": 7.728153679310186e-06, "epoch": 0.38471814052898745, "percentage": 38.5, "elapsed_time": "0:39:20", "remaining_time": "1:02:49"}
+ {"current_steps": 370, "total_steps": 935, "loss": 0.8658, "lr": 7.569762724582179e-06, "epoch": 0.39540475554368154, "percentage": 39.57, "elapsed_time": "0:40:21", "remaining_time": "1:01:37"}
+ {"current_steps": 380, "total_steps": 935, "loss": 0.8744, "lr": 7.407786265319023e-06, "epoch": 0.40609137055837563, "percentage": 40.64, "elapsed_time": "0:41:24", "remaining_time": "1:00:28"}
+ {"current_steps": 390, "total_steps": 935, "loss": 0.8656, "lr": 7.242450301891772e-06, "epoch": 0.4167779855730697, "percentage": 41.71, "elapsed_time": "0:42:29", "remaining_time": "0:59:22"}
+ {"current_steps": 400, "total_steps": 935, "loss": 0.8749, "lr": 7.073985522076001e-06, "epoch": 0.4274646005877638, "percentage": 42.78, "elapsed_time": "0:43:32", "remaining_time": "0:58:14"}
+ {"current_steps": 410, "total_steps": 935, "loss": 0.8583, "lr": 6.902626979180821e-06, "epoch": 0.4381512156024579, "percentage": 43.85, "elapsed_time": "0:44:56", "remaining_time": "0:57:32"}
+ {"current_steps": 420, "total_steps": 935, "loss": 0.8658, "lr": 6.728613764086806e-06, "epoch": 0.448837830617152, "percentage": 44.92, "elapsed_time": "0:46:04", "remaining_time": "0:56:29"}
+ {"current_steps": 430, "total_steps": 935, "loss": 0.8588, "lr": 6.552188671650434e-06, "epoch": 0.45952444563184613, "percentage": 45.99, "elapsed_time": "0:47:05", "remaining_time": "0:55:18"}
+ {"current_steps": 440, "total_steps": 935, "loss": 0.8474, "lr": 6.373597861940488e-06, "epoch": 0.4702110606465402, "percentage": 47.06, "elapsed_time": "0:48:11", "remaining_time": "0:54:13"}
+ {"current_steps": 450, "total_steps": 935, "loss": 0.85, "lr": 6.1930905167791025e-06, "epoch": 0.4808976756612343, "percentage": 48.13, "elapsed_time": "0:49:18", "remaining_time": "0:53:08"}
+ {"current_steps": 460, "total_steps": 935, "loss": 0.8446, "lr": 6.010918492066628e-06, "epoch": 0.4915842906759284, "percentage": 49.2, "elapsed_time": "0:50:27", "remaining_time": "0:52:06"}
+ {"current_steps": 470, "total_steps": 935, "loss": 0.8568, "lr": 5.827335966375485e-06, "epoch": 0.5022709056906225, "percentage": 50.27, "elapsed_time": "0:51:36", "remaining_time": "0:51:03"}
+ {"current_steps": 480, "total_steps": 935, "loss": 0.8546, "lr": 5.642599086303233e-06, "epoch": 0.5129575207053166, "percentage": 51.34, "elapsed_time": "0:52:37", "remaining_time": "0:49:53"}
+ {"current_steps": 490, "total_steps": 935, "loss": 0.841, "lr": 5.456965609079741e-06, "epoch": 0.5236441357200107, "percentage": 52.41, "elapsed_time": "0:53:42", "remaining_time": "0:48:46"}
+ {"current_steps": 500, "total_steps": 935, "loss": 0.8489, "lr": 5.270694542927089e-06, "epoch": 0.5343307507347048, "percentage": 53.48, "elapsed_time": "0:54:47", "remaining_time": "0:47:40"}
+ {"current_steps": 510, "total_steps": 935, "loss": 0.8348, "lr": 5.084045785674001e-06, "epoch": 0.5450173657493989, "percentage": 54.55, "elapsed_time": "0:55:49", "remaining_time": "0:46:31"}
+ {"current_steps": 520, "total_steps": 935, "loss": 0.8409, "lr": 4.897279762129044e-06, "epoch": 0.555703980764093, "percentage": 55.61, "elapsed_time": "0:56:56", "remaining_time": "0:45:26"}
+ {"current_steps": 530, "total_steps": 935, "loss": 0.8499, "lr": 4.710657060718547e-06, "epoch": 0.566390595778787, "percentage": 56.68, "elapsed_time": "0:58:07", "remaining_time": "0:44:25"}
+ {"current_steps": 540, "total_steps": 935, "loss": 0.8433, "lr": 4.52443806989622e-06, "epoch": 0.5770772107934812, "percentage": 57.75, "elapsed_time": "0:59:08", "remaining_time": "0:43:15"}
+ {"current_steps": 550, "total_steps": 935, "loss": 0.8412, "lr": 4.338882614831817e-06, "epoch": 0.5877638258081752, "percentage": 58.82, "elapsed_time": "1:00:16", "remaining_time": "0:42:11"}
+ {"current_steps": 560, "total_steps": 935, "loss": 0.8335, "lr": 4.154249594885687e-06, "epoch": 0.5984504408228694, "percentage": 59.89, "elapsed_time": "1:01:21", "remaining_time": "0:41:05"}
+ {"current_steps": 570, "total_steps": 935, "loss": 0.8257, "lr": 3.970796622375116e-06, "epoch": 0.6091370558375635, "percentage": 60.96, "elapsed_time": "1:02:22", "remaining_time": "0:39:56"}
+ {"current_steps": 580, "total_steps": 935, "loss": 0.8417, "lr": 3.78877966313642e-06, "epoch": 0.6198236708522575, "percentage": 62.03, "elapsed_time": "1:03:24", "remaining_time": "0:38:48"}
+ {"current_steps": 590, "total_steps": 935, "loss": 0.8394, "lr": 3.608452679384311e-06, "epoch": 0.6305102858669517, "percentage": 63.1, "elapsed_time": "1:04:28", "remaining_time": "0:37:42"}
+ {"current_steps": 600, "total_steps": 935, "loss": 0.8363, "lr": 3.4300672753668635e-06, "epoch": 0.6411969008816457, "percentage": 64.17, "elapsed_time": "1:05:33", "remaining_time": "0:36:36"}
+ {"current_steps": 610, "total_steps": 935, "loss": 0.8237, "lr": 3.2538723463104737e-06, "epoch": 0.6518835158963399, "percentage": 65.24, "elapsed_time": "1:07:04", "remaining_time": "0:35:44"}
+ {"current_steps": 620, "total_steps": 935, "loss": 0.8403, "lr": 3.0801137311446087e-06, "epoch": 0.6625701309110339, "percentage": 66.31, "elapsed_time": "1:08:13", "remaining_time": "0:34:39"}
+ {"current_steps": 630, "total_steps": 935, "loss": 0.8283, "lr": 2.9090338694909254e-06, "epoch": 0.673256745925728, "percentage": 67.38, "elapsed_time": "1:09:22", "remaining_time": "0:33:35"}
+ {"current_steps": 640, "total_steps": 935, "loss": 0.8288, "lr": 2.740871463395325e-06, "epoch": 0.6839433609404221, "percentage": 68.45, "elapsed_time": "1:10:29", "remaining_time": "0:32:29"}
+ {"current_steps": 650, "total_steps": 935, "loss": 0.829, "lr": 2.575861144274914e-06, "epoch": 0.6946299759551162, "percentage": 69.52, "elapsed_time": "1:11:35", "remaining_time": "0:31:23"}
+ {"current_steps": 660, "total_steps": 935, "loss": 0.8256, "lr": 2.414233145544585e-06, "epoch": 0.7053165909698104, "percentage": 70.59, "elapsed_time": "1:12:39", "remaining_time": "0:30:16"}
+ {"current_steps": 670, "total_steps": 935, "loss": 0.813, "lr": 2.256212981379996e-06, "epoch": 0.7160032059845044, "percentage": 71.66, "elapsed_time": "1:13:42", "remaining_time": "0:29:09"}
+ {"current_steps": 680, "total_steps": 935, "loss": 0.8196, "lr": 2.1020211320651135e-06, "epoch": 0.7266898209991985, "percentage": 72.73, "elapsed_time": "1:14:44", "remaining_time": "0:28:01"}
+ {"current_steps": 690, "total_steps": 935, "loss": 0.8186, "lr": 1.9518727363634187e-06, "epoch": 0.7373764360138926, "percentage": 73.8, "elapsed_time": "1:15:47", "remaining_time": "0:26:54"}
+ {"current_steps": 700, "total_steps": 935, "loss": 0.8301, "lr": 1.8059772913419305e-06, "epoch": 0.7480630510285867, "percentage": 74.87, "elapsed_time": "1:16:56", "remaining_time": "0:25:49"}
+ {"current_steps": 710, "total_steps": 935, "loss": 0.8278, "lr": 1.6645383600669124e-06, "epoch": 0.7587496660432808, "percentage": 75.94, "elapsed_time": "1:17:57", "remaining_time": "0:24:42"}
+ {"current_steps": 720, "total_steps": 935, "loss": 0.8067, "lr": 1.527753287579084e-06, "epoch": 0.7694362810579749, "percentage": 77.01, "elapsed_time": "1:18:56", "remaining_time": "0:23:34"}
+ {"current_steps": 730, "total_steps": 935, "loss": 0.8201, "lr": 1.3958129255446585e-06, "epoch": 0.7801228960726689, "percentage": 78.07, "elapsed_time": "1:20:08", "remaining_time": "0:22:30"}
+ {"current_steps": 740, "total_steps": 935, "loss": 0.8143, "lr": 1.268901365966337e-06, "epoch": 0.7908095110873631, "percentage": 79.14, "elapsed_time": "1:21:14", "remaining_time": "0:21:24"}
+ {"current_steps": 750, "total_steps": 935, "loss": 0.8227, "lr": 1.1471956843258676e-06, "epoch": 0.8014961261020572, "percentage": 80.21, "elapsed_time": "1:22:28", "remaining_time": "0:20:20"}
+ {"current_steps": 760, "total_steps": 935, "loss": 0.8061, "lr": 1.0308656925165033e-06, "epoch": 0.8121827411167513, "percentage": 81.28, "elapsed_time": "1:23:26", "remaining_time": "0:19:12"}
+ {"current_steps": 770, "total_steps": 935, "loss": 0.8167, "lr": 9.200737019101169e-07, "epoch": 0.8228693561314454, "percentage": 82.35, "elapsed_time": "1:24:37", "remaining_time": "0:18:08"}
+ {"current_steps": 780, "total_steps": 935, "loss": 0.8238, "lr": 8.149742968895253e-07, "epoch": 0.8335559711461394, "percentage": 83.42, "elapsed_time": "1:25:40", "remaining_time": "0:17:01"}
+ {"current_steps": 790, "total_steps": 935, "loss": 0.8121, "lr": 7.157141191620548e-07, "epoch": 0.8442425861608336, "percentage": 84.49, "elapsed_time": "1:26:42", "remaining_time": "0:15:54"}
+ {"current_steps": 800, "total_steps": 935, "loss": 0.8215, "lr": 6.224316631552207e-07, "epoch": 0.8549292011755276, "percentage": 85.56, "elapsed_time": "1:27:44", "remaining_time": "0:14:48"}
+ {"current_steps": 810, "total_steps": 935, "loss": 0.8097, "lr": 5.35257082780069e-07, "epoch": 0.8656158161902218, "percentage": 86.63, "elapsed_time": "1:29:15", "remaining_time": "0:13:46"}
+ {"current_steps": 820, "total_steps": 935, "loss": 0.8155, "lr": 4.5431200983174493e-07, "epoch": 0.8763024312049158, "percentage": 87.7, "elapsed_time": "1:30:23", "remaining_time": "0:12:40"}
+ {"current_steps": 830, "total_steps": 935, "loss": 0.8074, "lr": 3.7970938428068813e-07, "epoch": 0.88698904621961, "percentage": 88.77, "elapsed_time": "1:31:30", "remaining_time": "0:11:34"}
+ {"current_steps": 840, "total_steps": 935, "loss": 0.8174, "lr": 3.1155329669124876e-07, "epoch": 0.897675661234304, "percentage": 89.84, "elapsed_time": "1:32:36", "remaining_time": "0:10:28"}
+ {"current_steps": 850, "total_steps": 935, "loss": 0.8224, "lr": 2.4993884298758097e-07, "epoch": 0.9083622762489981, "percentage": 90.91, "elapsed_time": "1:33:42", "remaining_time": "0:09:22"}
+ {"current_steps": 860, "total_steps": 935, "loss": 0.8165, "lr": 1.9495199176945977e-07, "epoch": 0.9190488912636923, "percentage": 91.98, "elapsed_time": "1:34:44", "remaining_time": "0:08:15"}
+ {"current_steps": 870, "total_steps": 935, "loss": 0.81, "lr": 1.4666946436314832e-07, "epoch": 0.9297355062783863, "percentage": 93.05, "elapsed_time": "1:35:51", "remaining_time": "0:07:09"}
+ {"current_steps": 880, "total_steps": 935, "loss": 0.8017, "lr": 1.0515862777468689e-07, "epoch": 0.9404221212930804, "percentage": 94.12, "elapsed_time": "1:36:55", "remaining_time": "0:06:03"}
+ {"current_steps": 890, "total_steps": 935, "loss": 0.8027, "lr": 7.047740069494102e-08, "epoch": 0.9511087363077745, "percentage": 95.19, "elapsed_time": "1:37:58", "remaining_time": "0:04:57"}
+ {"current_steps": 900, "total_steps": 935, "loss": 0.8162, "lr": 4.267417268758123e-08, "epoch": 0.9617953513224686, "percentage": 96.26, "elapsed_time": "1:39:03", "remaining_time": "0:03:51"}
+ {"current_steps": 910, "total_steps": 935, "loss": 0.7977, "lr": 2.178773667273204e-08, "epoch": 0.9724819663371627, "percentage": 97.33, "elapsed_time": "1:40:06", "remaining_time": "0:02:45"}
+ {"current_steps": 920, "total_steps": 935, "loss": 0.7983, "lr": 7.84723480049765e-09, "epoch": 0.9831685813518568, "percentage": 98.4, "elapsed_time": "1:41:05", "remaining_time": "0:01:38"}
+ {"current_steps": 930, "total_steps": 935, "loss": 0.8076, "lr": 8.721177898912691e-10, "epoch": 0.9938551963665508, "percentage": 99.47, "elapsed_time": "1:42:06", "remaining_time": "0:00:32"}
+ {"current_steps": 935, "total_steps": 935, "epoch": 0.999198503873898, "percentage": 100.0, "elapsed_time": "1:42:59", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.999198503873898,
+ "eval_steps": 500,
+ "global_step": 935,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.010686615014694095,
+ "grad_norm": 9.303718590529266,
+ "learning_rate": 1.0638297872340427e-06,
+ "loss": 1.6763,
+ "step": 10
+ },
+ {
+ "epoch": 0.02137323002938819,
+ "grad_norm": 4.2049552040041025,
+ "learning_rate": 2.1276595744680853e-06,
+ "loss": 1.425,
+ "step": 20
+ },
+ {
+ "epoch": 0.03205984504408229,
+ "grad_norm": 2.5917969502796803,
+ "learning_rate": 3.191489361702128e-06,
+ "loss": 1.1906,
+ "step": 30
+ },
+ {
+ "epoch": 0.04274646005877638,
+ "grad_norm": 2.0653738579527476,
+ "learning_rate": 4.255319148936171e-06,
+ "loss": 1.0792,
+ "step": 40
+ },
+ {
+ "epoch": 0.053433075073470476,
+ "grad_norm": 2.184126642400671,
+ "learning_rate": 5.319148936170213e-06,
+ "loss": 1.0284,
+ "step": 50
+ },
+ {
+ "epoch": 0.06411969008816458,
+ "grad_norm": 1.9923309458736422,
+ "learning_rate": 6.382978723404256e-06,
+ "loss": 1.0023,
+ "step": 60
+ },
+ {
+ "epoch": 0.07480630510285867,
+ "grad_norm": 2.287951064859182,
+ "learning_rate": 7.446808510638298e-06,
+ "loss": 0.9698,
+ "step": 70
+ },
+ {
+ "epoch": 0.08549292011755276,
+ "grad_norm": 2.043235348375586,
+ "learning_rate": 8.510638297872341e-06,
+ "loss": 0.9759,
+ "step": 80
+ },
+ {
+ "epoch": 0.09617953513224686,
+ "grad_norm": 2.1176843964719225,
+ "learning_rate": 9.574468085106385e-06,
+ "loss": 0.9531,
+ "step": 90
+ },
+ {
+ "epoch": 0.10686615014694095,
+ "grad_norm": 2.0184548371650677,
+ "learning_rate": 9.998744166446685e-06,
+ "loss": 0.9628,
+ "step": 100
+ },
+ {
+ "epoch": 0.11755276516163506,
+ "grad_norm": 1.9759360997141997,
+ "learning_rate": 9.991071912495701e-06,
+ "loss": 0.9529,
+ "step": 110
+ },
+ {
+ "epoch": 0.12823938017632916,
+ "grad_norm": 1.9057746837755876,
+ "learning_rate": 9.97643578176095e-06,
+ "loss": 0.9425,
+ "step": 120
+ },
+ {
+ "epoch": 0.13892599519102325,
+ "grad_norm": 1.8896256230313964,
+ "learning_rate": 9.95485619554928e-06,
+ "loss": 0.9351,
+ "step": 130
+ },
+ {
+ "epoch": 0.14961261020571734,
+ "grad_norm": 1.822347337941142,
+ "learning_rate": 9.926363263140234e-06,
+ "loss": 0.9183,
+ "step": 140
+ },
+ {
+ "epoch": 0.16029922522041143,
+ "grad_norm": 1.8750489996356339,
+ "learning_rate": 9.890996739775562e-06,
+ "loss": 0.9164,
+ "step": 150
+ },
+ {
+ "epoch": 0.17098584023510552,
+ "grad_norm": 1.9672887139009323,
+ "learning_rate": 9.848805971190074e-06,
+ "loss": 0.9158,
+ "step": 160
+ },
+ {
+ "epoch": 0.18167245524979964,
+ "grad_norm": 1.906400621786519,
+ "learning_rate": 9.799849824761159e-06,
+ "loss": 0.9049,
+ "step": 170
+ },
+ {
+ "epoch": 0.19235907026449373,
+ "grad_norm": 1.8045388292051818,
+ "learning_rate": 9.744196607373086e-06,
+ "loss": 0.8936,
+ "step": 180
+ },
+ {
+ "epoch": 0.20304568527918782,
+ "grad_norm": 1.6986711816736872,
+ "learning_rate": 9.681923970110698e-06,
+ "loss": 0.9022,
+ "step": 190
+ },
+ {
+ "epoch": 0.2137323002938819,
+ "grad_norm": 1.7502980629856069,
+ "learning_rate": 9.613118799915417e-06,
+ "loss": 0.9003,
+ "step": 200
+ },
+ {
+ "epoch": 0.224418915308576,
+ "grad_norm": 1.7942316840778598,
+ "learning_rate": 9.537877098354787e-06,
+ "loss": 0.9008,
+ "step": 210
+ },
+ {
+ "epoch": 0.2351055303232701,
+ "grad_norm": 1.6977332582107252,
+ "learning_rate": 9.456303847674674e-06,
+ "loss": 0.8966,
+ "step": 220
+ },
+ {
+ "epoch": 0.2457921453379642,
+ "grad_norm": 1.8046282276555958,
+ "learning_rate": 9.36851286432104e-06,
+ "loss": 0.8959,
+ "step": 230
+ },
+ {
+ "epoch": 0.2564787603526583,
+ "grad_norm": 1.7598460527365811,
+ "learning_rate": 9.274626640135616e-06,
+ "loss": 0.8919,
+ "step": 240
+ },
+ {
+ "epoch": 0.2671653753673524,
+ "grad_norm": 1.7306968231867716,
+ "learning_rate": 9.174776171447126e-06,
+ "loss": 0.8899,
+ "step": 250
+ },
+ {
+ "epoch": 0.2778519903820465,
+ "grad_norm": 1.8487739920505808,
+ "learning_rate": 9.06910077629645e-06,
+ "loss": 0.9066,
+ "step": 260
+ },
+ {
+ "epoch": 0.2885386053967406,
+ "grad_norm": 1.8204032354319712,
+ "learning_rate": 8.957747900050797e-06,
+ "loss": 0.881,
+ "step": 270
+ },
+ {
+ "epoch": 0.2992252204114347,
+ "grad_norm": 1.7438745810568204,
+ "learning_rate": 8.840872909678081e-06,
+ "loss": 0.8909,
+ "step": 280
+ },
+ {
+ "epoch": 0.30991183542612877,
+ "grad_norm": 1.6992585656448118,
+ "learning_rate": 8.718638876968564e-06,
+ "loss": 0.8815,
+ "step": 290
+ },
+ {
+ "epoch": 0.32059845044082286,
+ "grad_norm": 1.7385894205587356,
+ "learning_rate": 8.591216351006181e-06,
+ "loss": 0.8817,
+ "step": 300
+ },
+ {
+ "epoch": 0.33128506545551695,
+ "grad_norm": 1.84244911709608,
+ "learning_rate": 8.458783120207099e-06,
+ "loss": 0.8939,
+ "step": 310
+ },
+ {
+ "epoch": 0.34197168047021104,
+ "grad_norm": 1.9043335502002536,
+ "learning_rate": 8.321523964257431e-06,
+ "loss": 0.8768,
+ "step": 320
+ },
+ {
+ "epoch": 0.3526582954849052,
+ "grad_norm": 1.7768671122350428,
+ "learning_rate": 8.179630396296285e-06,
+ "loss": 0.8785,
+ "step": 330
+ },
+ {
+ "epoch": 0.36334491049959927,
+ "grad_norm": 1.6006153712263769,
+ "learning_rate": 8.033300395703845e-06,
+ "loss": 0.8743,
+ "step": 340
+ },
+ {
+ "epoch": 0.37403152551429336,
+ "grad_norm": 1.5679354254760447,
+ "learning_rate": 7.88273813186732e-06,
+ "loss": 0.8745,
+ "step": 350
+ },
+ {
+ "epoch": 0.38471814052898745,
+ "grad_norm": 1.606479410734,
+ "learning_rate": 7.728153679310186e-06,
+ "loss": 0.8695,
+ "step": 360
+ },
+ {
+ "epoch": 0.39540475554368154,
+ "grad_norm": 1.6159012242347945,
+ "learning_rate": 7.569762724582179e-06,
+ "loss": 0.8658,
+ "step": 370
+ },
+ {
+ "epoch": 0.40609137055837563,
+ "grad_norm": 1.5775692676139867,
+ "learning_rate": 7.407786265319023e-06,
+ "loss": 0.8744,
+ "step": 380
+ },
+ {
+ "epoch": 0.4167779855730697,
+ "grad_norm": 1.646902202098876,
+ "learning_rate": 7.242450301891772e-06,
+ "loss": 0.8656,
+ "step": 390
+ },
+ {
+ "epoch": 0.4274646005877638,
+ "grad_norm": 1.8815811353495957,
+ "learning_rate": 7.073985522076001e-06,
+ "loss": 0.8749,
+ "step": 400
+ },
+ {
+ "epoch": 0.4381512156024579,
+ "grad_norm": 1.7119004087394138,
+ "learning_rate": 6.902626979180821e-06,
+ "loss": 0.8583,
+ "step": 410
+ },
+ {
+ "epoch": 0.448837830617152,
+ "grad_norm": 1.7052535262427972,
+ "learning_rate": 6.728613764086806e-06,
+ "loss": 0.8658,
+ "step": 420
+ },
+ {
+ "epoch": 0.45952444563184613,
+ "grad_norm": 1.6949567528292615,
+ "learning_rate": 6.552188671650434e-06,
+ "loss": 0.8588,
+ "step": 430
+ },
+ {
+ "epoch": 0.4702110606465402,
+ "grad_norm": 1.712559437409894,
+ "learning_rate": 6.373597861940488e-06,
+ "loss": 0.8474,
+ "step": 440
+ },
+ {
+ "epoch": 0.4808976756612343,
+ "grad_norm": 1.5211458326520666,
+ "learning_rate": 6.1930905167791025e-06,
+ "loss": 0.85,
+ "step": 450
+ },
+ {
+ "epoch": 0.4915842906759284,
+ "grad_norm": 1.6809900461998935,
+ "learning_rate": 6.010918492066628e-06,
+ "loss": 0.8446,
+ "step": 460
+ },
+ {
+ "epoch": 0.5022709056906225,
+ "grad_norm": 1.6045002009736615,
+ "learning_rate": 5.827335966375485e-06,
+ "loss": 0.8568,
+ "step": 470
+ },
+ {
+ "epoch": 0.5129575207053166,
+ "grad_norm": 1.7403739000791214,
+ "learning_rate": 5.642599086303233e-06,
+ "loss": 0.8546,
+ "step": 480
+ },
+ {
+ "epoch": 0.5236441357200107,
+ "grad_norm": 1.5403177746230916,
+ "learning_rate": 5.456965609079741e-06,
+ "loss": 0.841,
+ "step": 490
+ },
+ {
+ "epoch": 0.5343307507347048,
+ "grad_norm": 1.6247784516190182,
+ "learning_rate": 5.270694542927089e-06,
+ "loss": 0.8489,
+ "step": 500
+ },
+ {
+ "epoch": 0.5450173657493989,
+ "grad_norm": 1.5920701242817856,
+ "learning_rate": 5.084045785674001e-06,
+ "loss": 0.8348,
+ "step": 510
+ },
+ {
+ "epoch": 0.555703980764093,
+ "grad_norm": 1.5609228162933224,
+ "learning_rate": 4.897279762129044e-06,
+ "loss": 0.8409,
+ "step": 520
+ },
+ {
+ "epoch": 0.566390595778787,
+ "grad_norm": 1.5517291862499378,
+ "learning_rate": 4.710657060718547e-06,
+ "loss": 0.8499,
+ "step": 530
+ },
+ {
+ "epoch": 0.5770772107934812,
+ "grad_norm": 1.5438065032843633,
+ "learning_rate": 4.52443806989622e-06,
+ "loss": 0.8433,
+ "step": 540
+ },
+ {
+ "epoch": 0.5877638258081752,
+ "grad_norm": 1.570747625419,
+ "learning_rate": 4.338882614831817e-06,
+ "loss": 0.8412,
+ "step": 550
+ },
+ {
+ "epoch": 0.5984504408228694,
+ "grad_norm": 1.578653927808263,
+ "learning_rate": 4.154249594885687e-06,
+ "loss": 0.8335,
+ "step": 560
+ },
+ {
+ "epoch": 0.6091370558375635,
+ "grad_norm": 1.5460487548547177,
+ "learning_rate": 3.970796622375116e-06,
+ "loss": 0.8257,
+ "step": 570
+ },
+ {
+ "epoch": 0.6198236708522575,
+ "grad_norm": 1.6247896398878883,
+ "learning_rate": 3.78877966313642e-06,
+ "loss": 0.8417,
+ "step": 580
+ },
+ {
+ "epoch": 0.6305102858669517,
+ "grad_norm": 1.5766609294405798,
+ "learning_rate": 3.608452679384311e-06,
+ "loss": 0.8394,
+ "step": 590
+ },
+ {
+ "epoch": 0.6411969008816457,
+ "grad_norm": 1.5009363290484534,
+ "learning_rate": 3.4300672753668635e-06,
+ "loss": 0.8363,
+ "step": 600
+ },
+ {
+ "epoch": 0.6518835158963399,
+ "grad_norm": 1.3449975185358125,
+ "learning_rate": 3.2538723463104737e-06,
+ "loss": 0.8237,
+ "step": 610
+ },
+ {
+ "epoch": 0.6625701309110339,
+ "grad_norm": 1.5414021316742408,
+ "learning_rate": 3.0801137311446087e-06,
+ "loss": 0.8403,
+ "step": 620
+ },
+ {
+ "epoch": 0.673256745925728,
+ "grad_norm": 1.5417414906318716,
+ "learning_rate": 2.9090338694909254e-06,
+ "loss": 0.8283,
+ "step": 630
+ },
+ {
+ "epoch": 0.6839433609404221,
+ "grad_norm": 1.611712800585379,
+ "learning_rate": 2.740871463395325e-06,
+ "loss": 0.8288,
+ "step": 640
+ },
+ {
+ "epoch": 0.6946299759551162,
+ "grad_norm": 1.5836781341981323,
+ "learning_rate": 2.575861144274914e-06,
+ "loss": 0.829,
+ "step": 650
+ },
+ {
+ "epoch": 0.7053165909698104,
+ "grad_norm": 1.5725193301762863,
+ "learning_rate": 2.414233145544585e-06,
+ "loss": 0.8256,
+ "step": 660
+ },
+ {
+ "epoch": 0.7160032059845044,
+ "grad_norm": 1.611578698840131,
+ "learning_rate": 2.256212981379996e-06,
+ "loss": 0.813,
+ "step": 670
+ },
+ {
+ "epoch": 0.7266898209991985,
+ "grad_norm": 1.6351602980260465,
+ "learning_rate": 2.1020211320651135e-06,
+ "loss": 0.8196,
+ "step": 680
+ },
+ {
+ "epoch": 0.7373764360138926,
+ "grad_norm": 1.5674828913506538,
+ "learning_rate": 1.9518727363634187e-06,
+ "loss": 0.8186,
+ "step": 690
+ },
+ {
+ "epoch": 0.7480630510285867,
+ "grad_norm": 1.5827636966731728,
+ "learning_rate": 1.8059772913419305e-06,
+ "loss": 0.8301,
+ "step": 700
+ },
+ {
+ "epoch": 0.7587496660432808,
+ "grad_norm": 1.5766941762097497,
+ "learning_rate": 1.6645383600669124e-06,
+ "loss": 0.8278,
+ "step": 710
+ },
+ {
+ "epoch": 0.7694362810579749,
+ "grad_norm": 1.3889367362711451,
+ "learning_rate": 1.527753287579084e-06,
+ "loss": 0.8067,
+ "step": 720
+ },
+ {
+ "epoch": 0.7801228960726689,
+ "grad_norm": 1.4818805825718362,
+ "learning_rate": 1.3958129255446585e-06,
+ "loss": 0.8201,
+ "step": 730
+ },
+ {
+ "epoch": 0.7908095110873631,
+ "grad_norm": 1.5506195665632077,
+ "learning_rate": 1.268901365966337e-06,
+ "loss": 0.8143,
+ "step": 740
+ },
+ {
+ "epoch": 0.8014961261020572,
+ "grad_norm": 1.5473244103552335,
+ "learning_rate": 1.1471956843258676e-06,
+ "loss": 0.8227,
+ "step": 750
+ },
+ {
+ "epoch": 0.8121827411167513,
+ "grad_norm": 1.5416734356294126,
+ "learning_rate": 1.0308656925165033e-06,
+ "loss": 0.8061,
+ "step": 760
+ },
+ {
+ "epoch": 0.8228693561314454,
+ "grad_norm": 1.533940388246987,
+ "learning_rate": 9.200737019101169e-07,
+ "loss": 0.8167,
+ "step": 770
+ },
+ {
+ "epoch": 0.8335559711461394,
+ "grad_norm": 1.5428459412647224,
+ "learning_rate": 8.149742968895253e-07,
+ "loss": 0.8238,
+ "step": 780
+ },
+ {
+ "epoch": 0.8442425861608336,
+ "grad_norm": 1.5243535258550542,
+ "learning_rate": 7.157141191620548e-07,
+ "loss": 0.8121,
+ "step": 790
+ },
+ {
+ "epoch": 0.8549292011755276,
+ "grad_norm": 1.5090393842399528,
+ "learning_rate": 6.224316631552207e-07,
+ "loss": 0.8215,
+ "step": 800
+ },
+ {
+ "epoch": 0.8656158161902218,
+ "grad_norm": 1.5455226884316506,
+ "learning_rate": 5.35257082780069e-07,
+ "loss": 0.8097,
+ "step": 810
+ },
+ {
+ "epoch": 0.8763024312049158,
+ "grad_norm": 1.5725635552110713,
+ "learning_rate": 4.5431200983174493e-07,
+ "loss": 0.8155,
+ "step": 820
+ },
+ {
+ "epoch": 0.88698904621961,
+ "grad_norm": 1.5330525300887172,
+ "learning_rate": 3.7970938428068813e-07,
+ "loss": 0.8074,
+ "step": 830
+ },
+ {
+ "epoch": 0.897675661234304,
+ "grad_norm": 1.5669753095302341,
+ "learning_rate": 3.1155329669124876e-07,
+ "loss": 0.8174,
+ "step": 840
+ },
+ {
+ "epoch": 0.9083622762489981,
+ "grad_norm": 1.4999285751677753,
+ "learning_rate": 2.4993884298758097e-07,
+ "loss": 0.8224,
+ "step": 850
+ },
+ {
+ "epoch": 0.9190488912636923,
+ "grad_norm": 1.575834305911122,
+ "learning_rate": 1.9495199176945977e-07,
+ "loss": 0.8165,
+ "step": 860
+ },
+ {
+ "epoch": 0.9297355062783863,
+ "grad_norm": 1.5453590208287702,
+ "learning_rate": 1.4666946436314832e-07,
+ "loss": 0.81,
+ "step": 870
+ },
+ {
+ "epoch": 0.9404221212930804,
+ "grad_norm": 1.517809787007839,
+ "learning_rate": 1.0515862777468689e-07,
+ "loss": 0.8017,
+ "step": 880
+ },
+ {
+ "epoch": 0.9511087363077745,
+ "grad_norm": 1.458666629774803,
+ "learning_rate": 7.047740069494102e-08,
+ "loss": 0.8027,
+ "step": 890
+ },
+ {
+ "epoch": 0.9617953513224686,
+ "grad_norm": 1.4699204057681212,
+ "learning_rate": 4.267417268758123e-08,
+ "loss": 0.8162,
+ "step": 900
+ },
+ {
+ "epoch": 0.9724819663371627,
+ "grad_norm": 1.4226825019870848,
+ "learning_rate": 2.178773667273204e-08,
+ "loss": 0.7977,
+ "step": 910
+ },
+ {
+ "epoch": 0.9831685813518568,
+ "grad_norm": 1.5106026858790984,
+ "learning_rate": 7.84723480049765e-09,
+ "loss": 0.7983,
+ "step": 920
+ },
+ {
+ "epoch": 0.9938551963665508,
+ "grad_norm": 1.603850822361151,
+ "learning_rate": 8.721177898912691e-10,
+ "loss": 0.8076,
+ "step": 930
+ },
+ {
+ "epoch": 0.999198503873898,
+ "step": 935,
+ "total_flos": 294080346587136.0,
+ "train_loss": 0.8809508724008652,
+ "train_runtime": 6179.7212,
+ "train_samples_per_second": 9.691,
+ "train_steps_per_second": 0.151
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 935,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 294080346587136.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
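A quick consistency check on the summary block, using only numbers that appear above: multiplying the reported throughput by the runtime recovers the sample and step counts, and the implied ~59,888 samples sit close to 935 steps times the total train batch size of 64 (59,840), with the small gap explained by the fractional final epoch (0.9992). A minimal sketch:

```python
# Cross-check of the summary metrics in trainer_state.json (no new data):
train_runtime = 6179.7212                  # seconds
samples_seen = 9.691 * train_runtime       # train_samples_per_second * runtime
steps_done = 0.151 * train_runtime         # train_steps_per_second * runtime
print(round(samples_seen), round(steps_done))  # 59888 933, vs. 935 logged steps
```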
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:661d07d9ec55bce1aad4a70c54f3d86db40381deb914150ad8abbc6cebda6295
+ size 7480
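These three lines are a Git LFS pointer (spec version, content hash, byte size): only the pointer is stored in git history, and the 7,480-byte file is fetched from LFS storage. The file itself is the pickled TrainingArguments object that the HF Trainer writes next to a checkpoint. A hedged loading sketch, not part of the commit:

```python
import torch
from transformers import TrainingArguments  # noqa: F401  (ensures the class is importable for unpickling)

# Assumed usage: with PyTorch 2.6 (the version used for this run), torch.load
# defaults to weights_only=True, so restoring the full pickled object needs an
# explicit opt-in; only do this for a file you trust.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.lr_scheduler_type)
```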
training_loss.png ADDED
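training_loss.png is a binary upload the diff cannot render, but an equivalent curve can be regenerated from the log_history in trainer_state.json above. A sketch, assuming a local checkout of the repo; the output filename is illustrative:

```python
import json
import matplotlib.pyplot as plt

# Rebuild a training-loss curve from trainer_state.json; the final summary
# entry carries "train_loss" rather than "loss", so the filter skips it.
with open("trainer_state.json") as f:
    state = json.load(f)

steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss_reproduced.png")  # hypothetical output name
```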
vocab.json ADDED
The diff for this file is too large to render. See raw diff
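vocab.json is one of the tokenizer files uploaded in this commit (alongside tokenizer.json and added_tokens.json) and is normally consumed through the tokenizer loader rather than read directly. A hedged sketch, assuming the repo has been checked out to the current directory (the "." path is an assumption, not stated in the commit):

```python
from transformers import AutoTokenizer

# Load the tokenizer shipped with this commit from a local checkout;
# this picks up vocab.json, tokenizer.json, and added_tokens.json.
tokenizer = AutoTokenizer.from_pretrained(".")
ids = tokenizer("Hello, Qwen2!")["input_ids"]
print(ids, tokenizer.decode(ids))
```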