agentlans commited on
Commit
9e0d8e1
·
verified ·
1 Parent(s): 5ab8820

Upload 11 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9015177065767285,
4
+ "eval_loss": 0.2585154175758362,
5
+ "eval_runtime": 1.5991,
6
+ "eval_samples": 2965,
7
+ "eval_samples_per_second": 1854.212,
8
+ "eval_steps_per_second": 232.011,
9
+ "num_input_tokens_seen": 15175680,
10
+ "total_flos": 1952467720519680.0,
11
+ "train_loss": 0.1272777229185529,
12
+ "train_runtime": 445.3252,
13
+ "train_samples": 11856,
14
+ "train_samples_per_second": 266.232,
15
+ "train_steps_per_second": 33.279
16
+ }
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "finetuning_task": "text-classification",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 384,
14
+ "id2label": {
15
+ "0": "False",
16
+ "1": "True"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 1536,
20
+ "label2id": {
21
+ "False": 0,
22
+ "True": 1
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 1,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "tokenizer_class": "XLMRobertaTokenizer",
33
+ "transformers_version": "5.0.0.dev0",
34
+ "type_vocab_size": 2,
35
+ "use_cache": false,
36
+ "vocab_size": 250037
37
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9015177065767285,
4
+ "eval_loss": 0.2585154175758362,
5
+ "eval_runtime": 1.5991,
6
+ "eval_samples": 2965,
7
+ "eval_samples_per_second": 1854.212,
8
+ "eval_steps_per_second": 232.011,
9
+ "num_input_tokens_seen": 15175680
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4058cab6a49e456434b0ed21f33d32bde158795e7e8c36347ffb94bd3c7c06dd
3
+ size 470641664
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e2c4647474659095b757711e8aef0583d58dbb50e3349958ebc460a9cf4977
3
+ size 17083065
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "max_length": 512,
51
+ "model_max_length": 512,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "<pad>",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "</s>",
57
+ "sp_model_kwargs": {},
58
+ "stride": 0,
59
+ "tokenizer_class": "XLMRobertaTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "<unk>"
63
+ }
train_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "num_input_tokens_seen": 15175680,
4
+ "total_flos": 1952467720519680.0,
5
+ "train_loss": 0.1272777229185529,
6
+ "train_runtime": 445.3252,
7
+ "train_samples": 11856,
8
+ "train_samples_per_second": 266.232,
9
+ "train_steps_per_second": 33.279
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1482,
3
+ "best_metric": 0.2585154175758362,
4
+ "best_model_checkpoint": "multilingual-e5-small-aligned-v2-pii-detector/checkpoint-1482",
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 14820,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.33738191632928477,
14
+ "grad_norm": 2.627490997314453,
15
+ "learning_rate": 4.8316464237516873e-05,
16
+ "loss": 0.3028,
17
+ "num_input_tokens_seen": 512000,
18
+ "step": 500,
19
+ "train_runtime": 13.9553,
20
+ "train_tokens_per_second": 36688.687
21
+ },
22
+ {
23
+ "epoch": 0.6747638326585695,
24
+ "grad_norm": 0.09764809906482697,
25
+ "learning_rate": 4.662955465587045e-05,
26
+ "loss": 0.238,
27
+ "num_input_tokens_seen": 1024000,
28
+ "step": 1000,
29
+ "train_runtime": 27.6858,
30
+ "train_tokens_per_second": 36986.469
31
+ },
32
+ {
33
+ "epoch": 1.0,
34
+ "eval_accuracy": 0.9015177065767285,
35
+ "eval_loss": 0.2585154175758362,
36
+ "eval_runtime": 1.648,
37
+ "eval_samples_per_second": 1799.16,
38
+ "eval_steps_per_second": 225.122,
39
+ "num_input_tokens_seen": 1517568,
40
+ "step": 1482
41
+ },
42
+ {
43
+ "epoch": 1.0121457489878543,
44
+ "grad_norm": 2.041290521621704,
45
+ "learning_rate": 4.494264507422402e-05,
46
+ "loss": 0.2022,
47
+ "num_input_tokens_seen": 1536000,
48
+ "step": 1500,
49
+ "train_runtime": 43.9883,
50
+ "train_tokens_per_second": 34918.384
51
+ },
52
+ {
53
+ "epoch": 1.349527665317139,
54
+ "grad_norm": 0.10939698666334152,
55
+ "learning_rate": 4.32557354925776e-05,
56
+ "loss": 0.1781,
57
+ "num_input_tokens_seen": 2048000,
58
+ "step": 2000,
59
+ "train_runtime": 57.4658,
60
+ "train_tokens_per_second": 35638.593
61
+ },
62
+ {
63
+ "epoch": 1.686909581646424,
64
+ "grad_norm": 5.614178657531738,
65
+ "learning_rate": 4.156882591093118e-05,
66
+ "loss": 0.1922,
67
+ "num_input_tokens_seen": 2560000,
68
+ "step": 2500,
69
+ "train_runtime": 71.0513,
70
+ "train_tokens_per_second": 36030.282
71
+ },
72
+ {
73
+ "epoch": 2.0,
74
+ "eval_accuracy": 0.9173693086003373,
75
+ "eval_loss": 0.3090151250362396,
76
+ "eval_runtime": 1.6796,
77
+ "eval_samples_per_second": 1765.251,
78
+ "eval_steps_per_second": 220.88,
79
+ "num_input_tokens_seen": 3035136,
80
+ "step": 2964
81
+ },
82
+ {
83
+ "epoch": 2.0242914979757085,
84
+ "grad_norm": 0.06700567156076431,
85
+ "learning_rate": 3.9881916329284755e-05,
86
+ "loss": 0.1585,
87
+ "num_input_tokens_seen": 3072000,
88
+ "step": 3000,
89
+ "train_runtime": 87.6915,
90
+ "train_tokens_per_second": 35031.905
91
+ },
92
+ {
93
+ "epoch": 2.361673414304993,
94
+ "grad_norm": 0.5916054844856262,
95
+ "learning_rate": 3.8195006747638326e-05,
96
+ "loss": 0.1549,
97
+ "num_input_tokens_seen": 3584000,
98
+ "step": 3500,
99
+ "train_runtime": 101.7739,
100
+ "train_tokens_per_second": 35215.309
101
+ },
102
+ {
103
+ "epoch": 2.699055330634278,
104
+ "grad_norm": 0.6893392205238342,
105
+ "learning_rate": 3.6508097165991904e-05,
106
+ "loss": 0.1517,
107
+ "num_input_tokens_seen": 4096000,
108
+ "step": 4000,
109
+ "train_runtime": 116.1299,
110
+ "train_tokens_per_second": 35270.842
111
+ },
112
+ {
113
+ "epoch": 3.0,
114
+ "eval_accuracy": 0.9079258010118044,
115
+ "eval_loss": 0.28507259488105774,
116
+ "eval_runtime": 1.6356,
117
+ "eval_samples_per_second": 1812.841,
118
+ "eval_steps_per_second": 226.834,
119
+ "num_input_tokens_seen": 4552704,
120
+ "step": 4446
121
+ },
122
+ {
123
+ "epoch": 3.0364372469635628,
124
+ "grad_norm": 6.807932376861572,
125
+ "learning_rate": 3.482118758434548e-05,
126
+ "loss": 0.156,
127
+ "num_input_tokens_seen": 4608000,
128
+ "step": 4500,
129
+ "train_runtime": 133.056,
130
+ "train_tokens_per_second": 34632.022
131
+ },
132
+ {
133
+ "epoch": 3.3738191632928474,
134
+ "grad_norm": 52.58000946044922,
135
+ "learning_rate": 3.313427800269906e-05,
136
+ "loss": 0.1233,
137
+ "num_input_tokens_seen": 5120000,
138
+ "step": 5000,
139
+ "train_runtime": 147.2098,
140
+ "train_tokens_per_second": 34780.299
141
+ },
142
+ {
143
+ "epoch": 3.7112010796221324,
144
+ "grad_norm": 0.09153091162443161,
145
+ "learning_rate": 3.144736842105263e-05,
146
+ "loss": 0.1412,
147
+ "num_input_tokens_seen": 5632000,
148
+ "step": 5500,
149
+ "train_runtime": 161.9414,
150
+ "train_tokens_per_second": 34778.016
151
+ },
152
+ {
153
+ "epoch": 4.0,
154
+ "eval_accuracy": 0.9059021922428331,
155
+ "eval_loss": 0.31255096197128296,
156
+ "eval_runtime": 1.8022,
157
+ "eval_samples_per_second": 1645.244,
158
+ "eval_steps_per_second": 205.864,
159
+ "num_input_tokens_seen": 6070272,
160
+ "step": 5928
161
+ },
162
+ {
163
+ "epoch": 4.048582995951417,
164
+ "grad_norm": 0.014544670470058918,
165
+ "learning_rate": 2.9760458839406208e-05,
166
+ "loss": 0.1248,
167
+ "num_input_tokens_seen": 6144000,
168
+ "step": 6000,
169
+ "train_runtime": 179.8529,
170
+ "train_tokens_per_second": 34161.242
171
+ },
172
+ {
173
+ "epoch": 4.385964912280702,
174
+ "grad_norm": 0.022761313244700432,
175
+ "learning_rate": 2.8073549257759785e-05,
176
+ "loss": 0.0975,
177
+ "num_input_tokens_seen": 6656000,
178
+ "step": 6500,
179
+ "train_runtime": 194.2334,
180
+ "train_tokens_per_second": 34268.042
181
+ },
182
+ {
183
+ "epoch": 4.723346828609986,
184
+ "grad_norm": 0.27537310123443604,
185
+ "learning_rate": 2.6386639676113363e-05,
186
+ "loss": 0.1271,
187
+ "num_input_tokens_seen": 7168000,
188
+ "step": 7000,
189
+ "train_runtime": 207.8817,
190
+ "train_tokens_per_second": 34481.154
191
+ },
192
+ {
193
+ "epoch": 5.0,
194
+ "eval_accuracy": 0.9018549747048904,
195
+ "eval_loss": 0.34343627095222473,
196
+ "eval_runtime": 1.8728,
197
+ "eval_samples_per_second": 1583.159,
198
+ "eval_steps_per_second": 198.095,
199
+ "num_input_tokens_seen": 7587840,
200
+ "step": 7410
201
+ },
202
+ {
203
+ "epoch": 5.060728744939271,
204
+ "grad_norm": 0.3377642333507538,
205
+ "learning_rate": 2.4699730094466937e-05,
206
+ "loss": 0.1278,
207
+ "num_input_tokens_seen": 7680000,
208
+ "step": 7500,
209
+ "train_runtime": 224.822,
210
+ "train_tokens_per_second": 34160.358
211
+ },
212
+ {
213
+ "epoch": 5.398110661268556,
214
+ "grad_norm": 0.011958185583353043,
215
+ "learning_rate": 2.301282051282051e-05,
216
+ "loss": 0.0988,
217
+ "num_input_tokens_seen": 8192000,
218
+ "step": 8000,
219
+ "train_runtime": 238.5719,
220
+ "train_tokens_per_second": 34337.661
221
+ },
222
+ {
223
+ "epoch": 5.7354925775978405,
224
+ "grad_norm": 0.01739046536386013,
225
+ "learning_rate": 2.132591093117409e-05,
226
+ "loss": 0.1018,
227
+ "num_input_tokens_seen": 8704000,
228
+ "step": 8500,
229
+ "train_runtime": 253.495,
230
+ "train_tokens_per_second": 34335.983
231
+ },
232
+ {
233
+ "epoch": 6.0,
234
+ "eval_accuracy": 0.8930860033726813,
235
+ "eval_loss": 0.38996028900146484,
236
+ "eval_runtime": 1.6539,
237
+ "eval_samples_per_second": 1792.721,
238
+ "eval_steps_per_second": 224.317,
239
+ "num_input_tokens_seen": 9105408,
240
+ "step": 8892
241
+ },
242
+ {
243
+ "epoch": 6.0728744939271255,
244
+ "grad_norm": 0.3100438714027405,
245
+ "learning_rate": 1.9639001349527667e-05,
246
+ "loss": 0.1168,
247
+ "num_input_tokens_seen": 9216000,
248
+ "step": 9000,
249
+ "train_runtime": 271.7013,
250
+ "train_tokens_per_second": 33919.602
251
+ },
252
+ {
253
+ "epoch": 6.410256410256411,
254
+ "grad_norm": 0.43619751930236816,
255
+ "learning_rate": 1.7952091767881245e-05,
256
+ "loss": 0.0869,
257
+ "num_input_tokens_seen": 9728000,
258
+ "step": 9500,
259
+ "train_runtime": 285.8275,
260
+ "train_tokens_per_second": 34034.516
261
+ },
262
+ {
263
+ "epoch": 6.747638326585695,
264
+ "grad_norm": 0.006818657275289297,
265
+ "learning_rate": 1.626518218623482e-05,
266
+ "loss": 0.1037,
267
+ "num_input_tokens_seen": 10240000,
268
+ "step": 10000,
269
+ "train_runtime": 299.932,
270
+ "train_tokens_per_second": 34141.069
271
+ },
272
+ {
273
+ "epoch": 7.0,
274
+ "eval_accuracy": 0.8944350758853289,
275
+ "eval_loss": 0.39529648423194885,
276
+ "eval_runtime": 1.7201,
277
+ "eval_samples_per_second": 1723.784,
278
+ "eval_steps_per_second": 215.691,
279
+ "num_input_tokens_seen": 10622976,
280
+ "step": 10374
281
+ },
282
+ {
283
+ "epoch": 7.08502024291498,
284
+ "grad_norm": 0.07480119913816452,
285
+ "learning_rate": 1.4578272604588395e-05,
286
+ "loss": 0.1063,
287
+ "num_input_tokens_seen": 10752000,
288
+ "step": 10500,
289
+ "train_runtime": 316.9384,
290
+ "train_tokens_per_second": 33924.575
291
+ },
292
+ {
293
+ "epoch": 7.422402159244265,
294
+ "grad_norm": 0.008870264515280724,
295
+ "learning_rate": 1.289136302294197e-05,
296
+ "loss": 0.0908,
297
+ "num_input_tokens_seen": 11264000,
298
+ "step": 11000,
299
+ "train_runtime": 331.4588,
300
+ "train_tokens_per_second": 33983.108
301
+ },
302
+ {
303
+ "epoch": 7.759784075573549,
304
+ "grad_norm": 0.33139288425445557,
305
+ "learning_rate": 1.1204453441295547e-05,
306
+ "loss": 0.0902,
307
+ "num_input_tokens_seen": 11776000,
308
+ "step": 11500,
309
+ "train_runtime": 345.9573,
310
+ "train_tokens_per_second": 34038.879
311
+ },
312
+ {
313
+ "epoch": 8.0,
314
+ "eval_accuracy": 0.894097807757167,
315
+ "eval_loss": 0.5339534282684326,
316
+ "eval_runtime": 1.8369,
317
+ "eval_samples_per_second": 1614.124,
318
+ "eval_steps_per_second": 201.97,
319
+ "num_input_tokens_seen": 12140544,
320
+ "step": 11856
321
+ },
322
+ {
323
+ "epoch": 8.097165991902834,
324
+ "grad_norm": 1.103989839553833,
325
+ "learning_rate": 9.517543859649124e-06,
326
+ "loss": 0.0792,
327
+ "num_input_tokens_seen": 12288000,
328
+ "step": 12000,
329
+ "train_runtime": 363.0247,
330
+ "train_tokens_per_second": 33848.936
331
+ },
332
+ {
333
+ "epoch": 8.434547908232119,
334
+ "grad_norm": 0.19695305824279785,
335
+ "learning_rate": 7.830634278002699e-06,
336
+ "loss": 0.0771,
337
+ "num_input_tokens_seen": 12800000,
338
+ "step": 12500,
339
+ "train_runtime": 376.8777,
340
+ "train_tokens_per_second": 33963.276
341
+ },
342
+ {
343
+ "epoch": 8.771929824561404,
344
+ "grad_norm": 3.2966551780700684,
345
+ "learning_rate": 6.1437246963562756e-06,
346
+ "loss": 0.077,
347
+ "num_input_tokens_seen": 13312000,
348
+ "step": 13000,
349
+ "train_runtime": 390.4455,
350
+ "train_tokens_per_second": 34094.384
351
+ },
352
+ {
353
+ "epoch": 9.0,
354
+ "eval_accuracy": 0.8900505902192243,
355
+ "eval_loss": 0.6329491138458252,
356
+ "eval_runtime": 1.6301,
357
+ "eval_samples_per_second": 1818.909,
358
+ "eval_steps_per_second": 227.594,
359
+ "num_input_tokens_seen": 13658112,
360
+ "step": 13338
361
+ },
362
+ {
363
+ "epoch": 9.109311740890687,
364
+ "grad_norm": 2.050884485244751,
365
+ "learning_rate": 4.4568151147098515e-06,
366
+ "loss": 0.0783,
367
+ "num_input_tokens_seen": 13824000,
368
+ "step": 13500,
369
+ "train_runtime": 406.9294,
370
+ "train_tokens_per_second": 33971.492
371
+ },
372
+ {
373
+ "epoch": 9.446693657219972,
374
+ "grad_norm": 0.005987819749861956,
375
+ "learning_rate": 2.769905533063428e-06,
376
+ "loss": 0.0717,
377
+ "num_input_tokens_seen": 14336000,
378
+ "step": 14000,
379
+ "train_runtime": 420.4707,
380
+ "train_tokens_per_second": 34095.126
381
+ },
382
+ {
383
+ "epoch": 9.784075573549257,
384
+ "grad_norm": 0.0041707539930939674,
385
+ "learning_rate": 1.0829959514170041e-06,
386
+ "loss": 0.0685,
387
+ "num_input_tokens_seen": 14848000,
388
+ "step": 14500,
389
+ "train_runtime": 433.8559,
390
+ "train_tokens_per_second": 34223.346
391
+ },
392
+ {
393
+ "epoch": 10.0,
394
+ "eval_accuracy": 0.8920741989881956,
395
+ "eval_loss": 0.6290408968925476,
396
+ "eval_runtime": 1.6333,
397
+ "eval_samples_per_second": 1815.288,
398
+ "eval_steps_per_second": 227.141,
399
+ "num_input_tokens_seen": 15175680,
400
+ "step": 14820
401
+ },
402
+ {
403
+ "epoch": 10.0,
404
+ "num_input_tokens_seen": 15175680,
405
+ "step": 14820,
406
+ "total_flos": 1952467720519680.0,
407
+ "train_loss": 0.1272777229185529,
408
+ "train_runtime": 445.3252,
409
+ "train_samples_per_second": 266.232,
410
+ "train_steps_per_second": 33.279
411
+ }
412
+ ],
413
+ "logging_steps": 500,
414
+ "max_steps": 14820,
415
+ "num_input_tokens_seen": 15175680,
416
+ "num_train_epochs": 10,
417
+ "save_steps": 500,
418
+ "stateful_callbacks": {
419
+ "TrainerControl": {
420
+ "args": {
421
+ "should_epoch_stop": false,
422
+ "should_evaluate": false,
423
+ "should_log": false,
424
+ "should_save": true,
425
+ "should_training_stop": true
426
+ },
427
+ "attributes": {}
428
+ }
429
+ },
430
+ "total_flos": 1952467720519680.0,
431
+ "train_batch_size": 8,
432
+ "trial_name": null,
433
+ "trial_params": null
434
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32ce1e246e8da9cb1f7bf84495bb07f5e6a549f44f69a69266d1dec339ecb772
3
+ size 5201