Gabe-Thomp commited on
Commit
54c8ec8
·
verified ·
1 Parent(s): 06bdb7b

Model save

Browse files
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 231659938701312.0,
3
+ "train_loss": 0.5313233767376572,
4
+ "train_runtime": 12361.3768,
5
+ "train_samples": 15500,
6
+ "train_samples_per_second": 3.762,
7
+ "train_steps_per_second": 0.03
8
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "cache_implementation": "hybrid",
5
+ "eos_token_id": 1,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.54.0"
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 231659938701312.0,
3
+ "train_loss": 0.5313233767376572,
4
+ "train_runtime": 12361.3768,
5
+ "train_samples": 15500,
6
+ "train_samples_per_second": 3.762,
7
+ "train_steps_per_second": 0.03
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 366,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0412796697626419,
14
+ "grad_norm": 15.624099418508212,
15
+ "learning_rate": 2.1621621621621622e-07,
16
+ "loss": 1.0211,
17
+ "mean_token_accuracy": 0.7495194703340531,
18
+ "num_tokens": 753861.0,
19
+ "step": 5
20
+ },
21
+ {
22
+ "epoch": 0.0825593395252838,
23
+ "grad_norm": 14.089666887893312,
24
+ "learning_rate": 4.864864864864865e-07,
25
+ "loss": 0.9982,
26
+ "mean_token_accuracy": 0.7503360390663147,
27
+ "num_tokens": 1499146.0,
28
+ "step": 10
29
+ },
30
+ {
31
+ "epoch": 0.1238390092879257,
32
+ "grad_norm": 8.165011263693703,
33
+ "learning_rate": 7.567567567567568e-07,
34
+ "loss": 0.8403,
35
+ "mean_token_accuracy": 0.774555218219757,
36
+ "num_tokens": 2234191.0,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.1651186790505676,
41
+ "grad_norm": 2.4829258847685955,
42
+ "learning_rate": 1.0270270270270269e-06,
43
+ "loss": 0.7115,
44
+ "mean_token_accuracy": 0.7917023435235023,
45
+ "num_tokens": 2982701.0,
46
+ "step": 20
47
+ },
48
+ {
49
+ "epoch": 0.20639834881320948,
50
+ "grad_norm": 1.4992039643671118,
51
+ "learning_rate": 1.2972972972972972e-06,
52
+ "loss": 0.6584,
53
+ "mean_token_accuracy": 0.7975756898522377,
54
+ "num_tokens": 3733149.0,
55
+ "step": 25
56
+ },
57
+ {
58
+ "epoch": 0.2476780185758514,
59
+ "grad_norm": 1.3240435233707502,
60
+ "learning_rate": 1.5675675675675676e-06,
61
+ "loss": 0.6196,
62
+ "mean_token_accuracy": 0.8073552951216698,
63
+ "num_tokens": 4474136.0,
64
+ "step": 30
65
+ },
66
+ {
67
+ "epoch": 0.2889576883384933,
68
+ "grad_norm": 1.7540112925667055,
69
+ "learning_rate": 1.837837837837838e-06,
70
+ "loss": 0.6287,
71
+ "mean_token_accuracy": 0.8032846122980117,
72
+ "num_tokens": 5210845.0,
73
+ "step": 35
74
+ },
75
+ {
76
+ "epoch": 0.3302373581011352,
77
+ "grad_norm": 0.981995931842241,
78
+ "learning_rate": 1.9998176420316e-06,
79
+ "loss": 0.6227,
80
+ "mean_token_accuracy": 0.8027533680200577,
81
+ "num_tokens": 5964637.0,
82
+ "step": 40
83
+ },
84
+ {
85
+ "epoch": 0.3715170278637771,
86
+ "grad_norm": 0.990647528055256,
87
+ "learning_rate": 1.9977668786231533e-06,
88
+ "loss": 0.6315,
89
+ "mean_token_accuracy": 0.8004137575626373,
90
+ "num_tokens": 6720240.0,
91
+ "step": 45
92
+ },
93
+ {
94
+ "epoch": 0.41279669762641896,
95
+ "grad_norm": 0.8897636043459279,
96
+ "learning_rate": 1.993442093851331e-06,
97
+ "loss": 0.5916,
98
+ "mean_token_accuracy": 0.811573649942875,
99
+ "num_tokens": 7470701.0,
100
+ "step": 50
101
+ },
102
+ {
103
+ "epoch": 0.4540763673890609,
104
+ "grad_norm": 0.9169927457005795,
105
+ "learning_rate": 1.986853144380224e-06,
106
+ "loss": 0.5955,
107
+ "mean_token_accuracy": 0.811116699874401,
108
+ "num_tokens": 8221962.0,
109
+ "step": 55
110
+ },
111
+ {
112
+ "epoch": 0.4953560371517028,
113
+ "grad_norm": 0.9021550704152176,
114
+ "learning_rate": 1.9780150471563555e-06,
115
+ "loss": 0.6043,
116
+ "mean_token_accuracy": 0.8079309955239296,
117
+ "num_tokens": 8984378.0,
118
+ "step": 60
119
+ },
120
+ {
121
+ "epoch": 0.5366357069143447,
122
+ "grad_norm": 0.8518882228941898,
123
+ "learning_rate": 1.9669479451833974e-06,
124
+ "loss": 0.604,
125
+ "mean_token_accuracy": 0.8058954104781151,
126
+ "num_tokens": 9729682.0,
127
+ "step": 65
128
+ },
129
+ {
130
+ "epoch": 0.5779153766769866,
131
+ "grad_norm": 0.869489796529064,
132
+ "learning_rate": 1.9536770616140275e-06,
133
+ "loss": 0.5939,
134
+ "mean_token_accuracy": 0.8116138935089111,
135
+ "num_tokens": 10470679.0,
136
+ "step": 70
137
+ },
138
+ {
139
+ "epoch": 0.6191950464396285,
140
+ "grad_norm": 0.860002637647343,
141
+ "learning_rate": 1.9382326422635704e-06,
142
+ "loss": 0.5832,
143
+ "mean_token_accuracy": 0.8130927443504333,
144
+ "num_tokens": 11233125.0,
145
+ "step": 75
146
+ },
147
+ {
148
+ "epoch": 0.6604747162022704,
149
+ "grad_norm": 0.8461444317665581,
150
+ "learning_rate": 1.920649886676429e-06,
151
+ "loss": 0.5589,
152
+ "mean_token_accuracy": 0.8205906450748444,
153
+ "num_tokens": 11965589.0,
154
+ "step": 80
155
+ },
156
+ {
157
+ "epoch": 0.7017543859649122,
158
+ "grad_norm": 0.9136979277004551,
159
+ "learning_rate": 1.9009688679024189e-06,
160
+ "loss": 0.5925,
161
+ "mean_token_accuracy": 0.8109533503651619,
162
+ "num_tokens": 12717735.0,
163
+ "step": 85
164
+ },
165
+ {
166
+ "epoch": 0.7430340557275542,
167
+ "grad_norm": 0.8541232534987969,
168
+ "learning_rate": 1.8792344411658468e-06,
169
+ "loss": 0.5724,
170
+ "mean_token_accuracy": 0.815299516916275,
171
+ "num_tokens": 13470614.0,
172
+ "step": 90
173
+ },
174
+ {
175
+ "epoch": 0.7843137254901961,
176
+ "grad_norm": 0.8780037746703969,
177
+ "learning_rate": 1.8554961416354758e-06,
178
+ "loss": 0.587,
179
+ "mean_token_accuracy": 0.8129138767719268,
180
+ "num_tokens": 14199610.0,
181
+ "step": 95
182
+ },
183
+ {
184
+ "epoch": 0.8255933952528379,
185
+ "grad_norm": 0.9183336104957267,
186
+ "learning_rate": 1.8298080715283857e-06,
187
+ "loss": 0.5732,
188
+ "mean_token_accuracy": 0.8151614516973495,
189
+ "num_tokens": 14950919.0,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.8668730650154799,
194
+ "grad_norm": 0.8600833044234086,
195
+ "learning_rate": 1.80222877680502e-06,
196
+ "loss": 0.5913,
197
+ "mean_token_accuracy": 0.8102210983633995,
198
+ "num_tokens": 15707369.0,
199
+ "step": 105
200
+ },
201
+ {
202
+ "epoch": 0.9081527347781218,
203
+ "grad_norm": 0.901112261036662,
204
+ "learning_rate": 1.7728211137364486e-06,
205
+ "loss": 0.5735,
206
+ "mean_token_accuracy": 0.8148060619831086,
207
+ "num_tokens": 16455219.0,
208
+ "step": 110
209
+ },
210
+ {
211
+ "epoch": 0.9494324045407637,
212
+ "grad_norm": 0.8398945948990572,
213
+ "learning_rate": 1.7416521056479575e-06,
214
+ "loss": 0.5892,
215
+ "mean_token_accuracy": 0.811500047147274,
216
+ "num_tokens": 17211297.0,
217
+ "step": 115
218
+ },
219
+ {
220
+ "epoch": 0.9907120743034056,
221
+ "grad_norm": 0.8890507447299595,
222
+ "learning_rate": 1.7087927901654556e-06,
223
+ "loss": 0.5693,
224
+ "mean_token_accuracy": 0.8162169650197029,
225
+ "num_tokens": 17945495.0,
226
+ "step": 120
227
+ },
228
+ {
229
+ "epoch": 1.0247678018575852,
230
+ "grad_norm": 0.9298798379000134,
231
+ "learning_rate": 1.6743180573128493e-06,
232
+ "loss": 0.5766,
233
+ "mean_token_accuracy": 0.8257526899829055,
234
+ "num_tokens": 18569799.0,
235
+ "step": 125
236
+ },
237
+ {
238
+ "epoch": 1.066047471620227,
239
+ "grad_norm": 0.9389781614014343,
240
+ "learning_rate": 1.6383064788293728e-06,
241
+ "loss": 0.5057,
242
+ "mean_token_accuracy": 0.8331148475408554,
243
+ "num_tokens": 19311720.0,
244
+ "step": 130
245
+ },
246
+ {
247
+ "epoch": 1.107327141382869,
248
+ "grad_norm": 0.9165276144771588,
249
+ "learning_rate": 1.6008401290958805e-06,
250
+ "loss": 0.5192,
251
+ "mean_token_accuracy": 0.8288257464766502,
252
+ "num_tokens": 20053638.0,
253
+ "step": 135
254
+ },
255
+ {
256
+ "epoch": 1.1486068111455108,
257
+ "grad_norm": 0.9180564284822471,
258
+ "learning_rate": 1.5620043980782325e-06,
259
+ "loss": 0.5062,
260
+ "mean_token_accuracy": 0.8319134443998337,
261
+ "num_tokens": 20821118.0,
262
+ "step": 140
263
+ },
264
+ {
265
+ "epoch": 1.1898864809081529,
266
+ "grad_norm": 0.9467359343070767,
267
+ "learning_rate": 1.521887796714092e-06,
268
+ "loss": 0.5052,
269
+ "mean_token_accuracy": 0.8318841397762299,
270
+ "num_tokens": 21558684.0,
271
+ "step": 145
272
+ },
273
+ {
274
+ "epoch": 1.2311661506707947,
275
+ "grad_norm": 0.9416451507331786,
276
+ "learning_rate": 1.4805817551866838e-06,
277
+ "loss": 0.5069,
278
+ "mean_token_accuracy": 0.8323728099465371,
279
+ "num_tokens": 22320277.0,
280
+ "step": 150
281
+ },
282
+ {
283
+ "epoch": 1.2724458204334366,
284
+ "grad_norm": 0.9170674897620965,
285
+ "learning_rate": 1.438180414545267e-06,
286
+ "loss": 0.4836,
287
+ "mean_token_accuracy": 0.839121425151825,
288
+ "num_tokens": 23061357.0,
289
+ "step": 155
290
+ },
291
+ {
292
+ "epoch": 1.3137254901960784,
293
+ "grad_norm": 0.890883059084858,
294
+ "learning_rate": 1.394780412147245e-06,
295
+ "loss": 0.4891,
296
+ "mean_token_accuracy": 0.837252251803875,
297
+ "num_tokens": 23817765.0,
298
+ "step": 160
299
+ },
300
+ {
301
+ "epoch": 1.3550051599587203,
302
+ "grad_norm": 0.9261908491580952,
303
+ "learning_rate": 1.3504806614109097e-06,
304
+ "loss": 0.4847,
305
+ "mean_token_accuracy": 0.8408183559775353,
306
+ "num_tokens": 24558776.0,
307
+ "step": 165
308
+ },
309
+ {
310
+ "epoch": 1.3962848297213624,
311
+ "grad_norm": 0.8928610102614272,
312
+ "learning_rate": 1.3053821263807945e-06,
313
+ "loss": 0.5086,
314
+ "mean_token_accuracy": 0.8326794102787971,
315
+ "num_tokens": 25308397.0,
316
+ "step": 170
317
+ },
318
+ {
319
+ "epoch": 1.437564499484004,
320
+ "grad_norm": 0.9511213183140893,
321
+ "learning_rate": 1.2595875916194184e-06,
322
+ "loss": 0.485,
323
+ "mean_token_accuracy": 0.83836370408535,
324
+ "num_tokens": 26060513.0,
325
+ "step": 175
326
+ },
327
+ {
328
+ "epoch": 1.478844169246646,
329
+ "grad_norm": 0.9622471592321337,
330
+ "learning_rate": 1.21320142794987e-06,
331
+ "loss": 0.4972,
332
+ "mean_token_accuracy": 0.8352236717939376,
333
+ "num_tokens": 26828045.0,
334
+ "step": 180
335
+ },
336
+ {
337
+ "epoch": 1.520123839009288,
338
+ "grad_norm": 0.9274039425644313,
339
+ "learning_rate": 1.16632935458313e-06,
340
+ "loss": 0.5004,
341
+ "mean_token_accuracy": 0.8354085251688957,
342
+ "num_tokens": 27572442.0,
343
+ "step": 185
344
+ },
345
+ {
346
+ "epoch": 1.5614035087719298,
347
+ "grad_norm": 0.9308885942067677,
348
+ "learning_rate": 1.119078198172262e-06,
349
+ "loss": 0.5058,
350
+ "mean_token_accuracy": 0.8333476513624192,
351
+ "num_tokens": 28328964.0,
352
+ "step": 190
353
+ },
354
+ {
355
+ "epoch": 1.6026831785345719,
356
+ "grad_norm": 0.8757908786448626,
357
+ "learning_rate": 1.071555649342626e-06,
358
+ "loss": 0.5047,
359
+ "mean_token_accuracy": 0.8343596026301384,
360
+ "num_tokens": 29081735.0,
361
+ "step": 195
362
+ },
363
+ {
364
+ "epoch": 1.6439628482972135,
365
+ "grad_norm": 0.9354324187980655,
366
+ "learning_rate": 1.0238700172530007e-06,
367
+ "loss": 0.4975,
368
+ "mean_token_accuracy": 0.834177765250206,
369
+ "num_tokens": 29830443.0,
370
+ "step": 200
371
+ },
372
+ {
373
+ "epoch": 1.6852425180598556,
374
+ "grad_norm": 0.9383174369871288,
375
+ "learning_rate": 9.761299827469992e-07,
376
+ "loss": 0.5167,
377
+ "mean_token_accuracy": 0.8294686481356621,
378
+ "num_tokens": 30564371.0,
379
+ "step": 205
380
+ },
381
+ {
382
+ "epoch": 1.7265221878224974,
383
+ "grad_norm": 0.9485743322852883,
384
+ "learning_rate": 9.284443506573739e-07,
385
+ "loss": 0.4961,
386
+ "mean_token_accuracy": 0.8358075320720673,
387
+ "num_tokens": 31307876.0,
388
+ "step": 210
389
+ },
390
+ {
391
+ "epoch": 1.7678018575851393,
392
+ "grad_norm": 0.8524322530089569,
393
+ "learning_rate": 8.809218018277377e-07,
394
+ "loss": 0.4856,
395
+ "mean_token_accuracy": 0.8387641996145249,
396
+ "num_tokens": 32058278.0,
397
+ "step": 215
398
+ },
399
+ {
400
+ "epoch": 1.8090815273477814,
401
+ "grad_norm": 1.1251297373275735,
402
+ "learning_rate": 8.336706454168699e-07,
403
+ "loss": 0.5009,
404
+ "mean_token_accuracy": 0.8337702050805091,
405
+ "num_tokens": 32785582.0,
406
+ "step": 220
407
+ },
408
+ {
409
+ "epoch": 1.850361197110423,
410
+ "grad_norm": 0.9150995192385856,
411
+ "learning_rate": 7.8679857205013e-07,
412
+ "loss": 0.5002,
413
+ "mean_token_accuracy": 0.8346363499760627,
414
+ "num_tokens": 33526490.0,
415
+ "step": 225
416
+ },
417
+ {
418
+ "epoch": 1.891640866873065,
419
+ "grad_norm": 0.9121811917199866,
420
+ "learning_rate": 7.404124083805818e-07,
421
+ "loss": 0.4951,
422
+ "mean_token_accuracy": 0.8354502245783806,
423
+ "num_tokens": 34276923.0,
424
+ "step": 230
425
+ },
426
+ {
427
+ "epoch": 1.932920536635707,
428
+ "grad_norm": 0.9500668445733738,
429
+ "learning_rate": 6.946178736192052e-07,
430
+ "loss": 0.5093,
431
+ "mean_token_accuracy": 0.8325110420584678,
432
+ "num_tokens": 35026509.0,
433
+ "step": 235
434
+ },
435
+ {
436
+ "epoch": 1.9742002063983488,
437
+ "grad_norm": 0.936384210591819,
438
+ "learning_rate": 6.495193385890901e-07,
439
+ "loss": 0.4915,
440
+ "mean_token_accuracy": 0.8370130330324173,
441
+ "num_tokens": 35765801.0,
442
+ "step": 240
443
+ },
444
+ {
445
+ "epoch": 2.0082559339525283,
446
+ "grad_norm": 1.081443933500798,
447
+ "learning_rate": 6.052195878527549e-07,
448
+ "loss": 0.534,
449
+ "mean_token_accuracy": 0.8311992432131912,
450
+ "num_tokens": 36399496.0,
451
+ "step": 245
452
+ },
453
+ {
454
+ "epoch": 2.0495356037151704,
455
+ "grad_norm": 0.9264320640204967,
456
+ "learning_rate": 5.618195854547332e-07,
457
+ "loss": 0.4596,
458
+ "mean_token_accuracy": 0.8468097746372223,
459
+ "num_tokens": 37157466.0,
460
+ "step": 250
461
+ },
462
+ {
463
+ "epoch": 2.090815273477812,
464
+ "grad_norm": 0.9743487807211495,
465
+ "learning_rate": 5.194182448133162e-07,
466
+ "loss": 0.4507,
467
+ "mean_token_accuracy": 0.8484956413507462,
468
+ "num_tokens": 37921600.0,
469
+ "step": 255
470
+ },
471
+ {
472
+ "epoch": 2.132094943240454,
473
+ "grad_norm": 0.9384842383306781,
474
+ "learning_rate": 4.781122032859079e-07,
475
+ "loss": 0.4422,
476
+ "mean_token_accuracy": 0.850089156627655,
477
+ "num_tokens": 38671144.0,
478
+ "step": 260
479
+ },
480
+ {
481
+ "epoch": 2.173374613003096,
482
+ "grad_norm": 0.923069296908276,
483
+ "learning_rate": 4.379956019217674e-07,
484
+ "loss": 0.4354,
485
+ "mean_token_accuracy": 0.8527857303619385,
486
+ "num_tokens": 39418140.0,
487
+ "step": 265
488
+ },
489
+ {
490
+ "epoch": 2.214654282765738,
491
+ "grad_norm": 0.8887917642443601,
492
+ "learning_rate": 3.991598709041195e-07,
493
+ "loss": 0.4579,
494
+ "mean_token_accuracy": 0.8460401177406311,
495
+ "num_tokens": 40184827.0,
496
+ "step": 270
497
+ },
498
+ {
499
+ "epoch": 2.25593395252838,
500
+ "grad_norm": 0.9820660095735302,
501
+ "learning_rate": 3.6169352117062745e-07,
502
+ "loss": 0.4518,
503
+ "mean_token_accuracy": 0.8484508559107781,
504
+ "num_tokens": 40943017.0,
505
+ "step": 275
506
+ },
507
+ {
508
+ "epoch": 2.2972136222910216,
509
+ "grad_norm": 0.8880277746538214,
510
+ "learning_rate": 3.2568194268715065e-07,
511
+ "loss": 0.4247,
512
+ "mean_token_accuracy": 0.8553372338414192,
513
+ "num_tokens": 41675482.0,
514
+ "step": 280
515
+ },
516
+ {
517
+ "epoch": 2.3384932920536636,
518
+ "grad_norm": 0.9393539096552885,
519
+ "learning_rate": 2.912072098345446e-07,
520
+ "loss": 0.4279,
521
+ "mean_token_accuracy": 0.8559224531054497,
522
+ "num_tokens": 42415673.0,
523
+ "step": 285
524
+ },
525
+ {
526
+ "epoch": 2.3797729618163057,
527
+ "grad_norm": 0.9303823491332491,
528
+ "learning_rate": 2.583478943520424e-07,
529
+ "loss": 0.4423,
530
+ "mean_token_accuracy": 0.8504590332508087,
531
+ "num_tokens": 43152305.0,
532
+ "step": 290
533
+ },
534
+ {
535
+ "epoch": 2.4210526315789473,
536
+ "grad_norm": 0.971127221838176,
537
+ "learning_rate": 2.271788862635513e-07,
538
+ "loss": 0.4323,
539
+ "mean_token_accuracy": 0.8537680730223656,
540
+ "num_tokens": 43874492.0,
541
+ "step": 295
542
+ },
543
+ {
544
+ "epoch": 2.4623323013415894,
545
+ "grad_norm": 0.9189912575444394,
546
+ "learning_rate": 1.9777122319497986e-07,
547
+ "loss": 0.467,
548
+ "mean_token_accuracy": 0.8447170093655586,
549
+ "num_tokens": 44628327.0,
550
+ "step": 300
551
+ },
552
+ {
553
+ "epoch": 2.503611971104231,
554
+ "grad_norm": 0.9428059815426227,
555
+ "learning_rate": 1.7019192847161423e-07,
556
+ "loss": 0.429,
557
+ "mean_token_accuracy": 0.8544344991445542,
558
+ "num_tokens": 45369336.0,
559
+ "step": 305
560
+ },
561
+ {
562
+ "epoch": 2.544891640866873,
563
+ "grad_norm": 0.9101357299119612,
564
+ "learning_rate": 1.4450385836452428e-07,
565
+ "loss": 0.4555,
566
+ "mean_token_accuracy": 0.8474543124437333,
567
+ "num_tokens": 46119769.0,
568
+ "step": 310
569
+ },
570
+ {
571
+ "epoch": 2.586171310629515,
572
+ "grad_norm": 0.919668232631187,
573
+ "learning_rate": 1.207655588341534e-07,
574
+ "loss": 0.4445,
575
+ "mean_token_accuracy": 0.8487650215625763,
576
+ "num_tokens": 46867680.0,
577
+ "step": 315
578
+ },
579
+ {
580
+ "epoch": 2.627450980392157,
581
+ "grad_norm": 0.8805176832592541,
582
+ "learning_rate": 9.903113209758096e-08,
583
+ "loss": 0.4274,
584
+ "mean_token_accuracy": 0.85523192435503,
585
+ "num_tokens": 47623269.0,
586
+ "step": 320
587
+ },
588
+ {
589
+ "epoch": 2.6687306501547985,
590
+ "grad_norm": 1.0165924861588809,
591
+ "learning_rate": 7.93501133235711e-08,
592
+ "loss": 0.4583,
593
+ "mean_token_accuracy": 0.8462967693805694,
594
+ "num_tokens": 48351505.0,
595
+ "step": 325
596
+ },
597
+ {
598
+ "epoch": 2.7100103199174406,
599
+ "grad_norm": 0.9019149565338729,
600
+ "learning_rate": 6.17673577364296e-08,
601
+ "loss": 0.438,
602
+ "mean_token_accuracy": 0.8522011756896972,
603
+ "num_tokens": 49095045.0,
604
+ "step": 330
605
+ },
606
+ {
607
+ "epoch": 2.7512899896800826,
608
+ "grad_norm": 0.8969543904734346,
609
+ "learning_rate": 4.632293838597246e-08,
610
+ "loss": 0.433,
611
+ "mean_token_accuracy": 0.8527301624417305,
612
+ "num_tokens": 49857090.0,
613
+ "step": 335
614
+ },
615
+ {
616
+ "epoch": 2.7925696594427247,
617
+ "grad_norm": 0.9016846496482673,
618
+ "learning_rate": 3.305205481660245e-08,
619
+ "loss": 0.448,
620
+ "mean_token_accuracy": 0.8483750134706497,
621
+ "num_tokens": 50619517.0,
622
+ "step": 340
623
+ },
624
+ {
625
+ "epoch": 2.8338493292053664,
626
+ "grad_norm": 0.9202038224797638,
627
+ "learning_rate": 2.19849528436441e-08,
628
+ "loss": 0.4477,
629
+ "mean_token_accuracy": 0.850049777328968,
630
+ "num_tokens": 51364012.0,
631
+ "step": 345
632
+ },
633
+ {
634
+ "epoch": 2.875128998968008,
635
+ "grad_norm": 1.3192572217707328,
636
+ "learning_rate": 1.3146855619776132e-08,
637
+ "loss": 0.4181,
638
+ "mean_token_accuracy": 0.8579459518194199,
639
+ "num_tokens": 52098120.0,
640
+ "step": 350
641
+ },
642
+ {
643
+ "epoch": 2.91640866873065,
644
+ "grad_norm": 0.908172601322105,
645
+ "learning_rate": 6.557906148669023e-09,
646
+ "loss": 0.4465,
647
+ "mean_token_accuracy": 0.8493014112114906,
648
+ "num_tokens": 52845956.0,
649
+ "step": 355
650
+ },
651
+ {
652
+ "epoch": 2.957688338493292,
653
+ "grad_norm": 0.9137493586477308,
654
+ "learning_rate": 2.233121376846836e-09,
655
+ "loss": 0.4569,
656
+ "mean_token_accuracy": 0.8459745928645134,
657
+ "num_tokens": 53593774.0,
658
+ "step": 360
659
+ },
660
+ {
661
+ "epoch": 2.998968008255934,
662
+ "grad_norm": 0.9681544351701314,
663
+ "learning_rate": 1.8235796839982664e-10,
664
+ "loss": 0.4458,
665
+ "mean_token_accuracy": 0.8503826469182968,
666
+ "num_tokens": 54348421.0,
667
+ "step": 365
668
+ },
669
+ {
670
+ "epoch": 3.0,
671
+ "mean_token_accuracy": 0.8451418280601501,
672
+ "num_tokens": 54368147.0,
673
+ "step": 366,
674
+ "total_flos": 231659938701312.0,
675
+ "train_loss": 0.5313233767376572,
676
+ "train_runtime": 12361.3768,
677
+ "train_samples_per_second": 3.762,
678
+ "train_steps_per_second": 0.03
679
+ }
680
+ ],
681
+ "logging_steps": 5,
682
+ "max_steps": 366,
683
+ "num_input_tokens_seen": 0,
684
+ "num_train_epochs": 3,
685
+ "save_steps": 500,
686
+ "stateful_callbacks": {
687
+ "TrainerControl": {
688
+ "args": {
689
+ "should_epoch_stop": false,
690
+ "should_evaluate": false,
691
+ "should_log": false,
692
+ "should_save": true,
693
+ "should_training_stop": true
694
+ },
695
+ "attributes": {}
696
+ }
697
+ },
698
+ "total_flos": 231659938701312.0,
699
+ "train_batch_size": 2,
700
+ "trial_name": null,
701
+ "trial_params": null
702
+ }