EleanorZzz commited on
Commit
0c8a750
·
verified ·
1 Parent(s): 570b8c3

End of training

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +106 -106
  5. training_loss.png +0 -0
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # gsm8k_training_synthetic_positive_vs_multi
18
 
19
- This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on an unknown dataset.
20
 
21
  ## Model description
22
 
 
16
 
17
  # gsm8k_training_synthetic_positive_vs_multi
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the gsm8k_training_synthetic_positive_vs_multi dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "total_flos": 5441966439399424.0,
4
- "train_loss": 0.20355752086639403,
5
- "train_runtime": 387.1143,
6
- "train_samples_per_second": 10.333,
7
- "train_steps_per_second": 1.292
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "total_flos": 5490451427622912.0,
4
+ "train_loss": 0.19940226113796233,
5
+ "train_runtime": 384.5053,
6
+ "train_samples_per_second": 10.403,
7
+ "train_steps_per_second": 1.3
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "total_flos": 5441966439399424.0,
4
- "train_loss": 0.20355752086639403,
5
- "train_runtime": 387.1143,
6
- "train_samples_per_second": 10.333,
7
- "train_steps_per_second": 1.292
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "total_flos": 5490451427622912.0,
4
+ "train_loss": 0.19940226113796233,
5
+ "train_runtime": 384.5053,
6
+ "train_samples_per_second": 10.403,
7
+ "train_steps_per_second": 1.3
8
  }
trainer_state.json CHANGED
@@ -11,362 +11,362 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.08,
14
- "grad_norm": 0.6101526021957397,
15
  "learning_rate": 0.00017999999999999998,
16
- "loss": 0.5321,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.16,
21
- "grad_norm": 0.614645779132843,
22
  "learning_rate": 0.00038,
23
- "loss": 0.3472,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.24,
28
- "grad_norm": 0.5607407093048096,
29
  "learning_rate": 0.00058,
30
- "loss": 0.3515,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.32,
35
- "grad_norm": 0.6382182240486145,
36
  "learning_rate": 0.0007800000000000001,
37
- "loss": 0.309,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.4,
42
- "grad_norm": 0.5207857489585876,
43
  "learning_rate": 0.00098,
44
- "loss": 0.2925,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.48,
49
- "grad_norm": 0.7279995083808899,
50
  "learning_rate": 0.0009990133642141358,
51
- "loss": 0.328,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.56,
56
- "grad_norm": 0.803231954574585,
57
  "learning_rate": 0.0009956077701257708,
58
- "loss": 0.3606,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.64,
63
- "grad_norm": 0.9615626335144043,
64
  "learning_rate": 0.000989787624799672,
65
- "loss": 0.3146,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.72,
70
- "grad_norm": 0.7829203605651855,
71
  "learning_rate": 0.0009815812833988292,
72
- "loss": 0.3517,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.8,
77
- "grad_norm": 0.8386848568916321,
78
  "learning_rate": 0.0009710287263936483,
79
- "loss": 0.2999,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.88,
84
- "grad_norm": 0.555416464805603,
85
  "learning_rate": 0.0009581813647811198,
86
- "loss": 0.2798,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.96,
91
- "grad_norm": 0.6741239428520203,
92
  "learning_rate": 0.0009431017896156073,
93
- "loss": 0.302,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 1.04,
98
- "grad_norm": 0.6036772727966309,
99
  "learning_rate": 0.0009258634670715238,
100
- "loss": 0.2673,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 1.12,
105
- "grad_norm": 0.7313661575317383,
106
  "learning_rate": 0.0009065503805235138,
107
- "loss": 0.2474,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 1.2,
112
- "grad_norm": 0.7425872087478638,
113
  "learning_rate": 0.0008852566213878947,
114
- "loss": 0.2647,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 1.28,
119
- "grad_norm": 0.8528808951377869,
120
  "learning_rate": 0.0008620859307187339,
121
- "loss": 0.2409,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.3599999999999999,
126
- "grad_norm": 0.7592480778694153,
127
  "learning_rate": 0.0008371511937918616,
128
- "loss": 0.2586,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.44,
133
- "grad_norm": 0.7716279625892639,
134
  "learning_rate": 0.0008105738901391552,
135
- "loss": 0.2575,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 1.52,
140
- "grad_norm": 0.6334287524223328,
141
  "learning_rate": 0.0007824835017124689,
142
- "loss": 0.2431,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 1.6,
147
- "grad_norm": 0.5061123967170715,
148
  "learning_rate": 0.0007530168820605818,
149
- "loss": 0.2585,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 1.6800000000000002,
154
- "grad_norm": 0.5677999258041382,
155
  "learning_rate": 0.0007223175895924637,
156
- "loss": 0.2591,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 1.76,
161
- "grad_norm": 0.5746018290519714,
162
  "learning_rate": 0.0006905351881751372,
163
- "loss": 0.2447,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 1.8399999999999999,
168
- "grad_norm": 0.6062314510345459,
169
  "learning_rate": 0.0006578245184735513,
170
- "loss": 0.2273,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 1.92,
175
- "grad_norm": 0.6199404001235962,
176
  "learning_rate": 0.0006243449435824276,
177
- "loss": 0.2309,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 2.0,
182
- "grad_norm": 0.5176653265953064,
183
  "learning_rate": 0.0005902595726252801,
184
- "loss": 0.2211,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 2.08,
189
- "grad_norm": 0.4597887098789215,
190
  "learning_rate": 0.0005557344661031627,
191
- "loss": 0.1596,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 2.16,
196
- "grad_norm": 0.5063103437423706,
197
  "learning_rate": 0.0005209378268645998,
198
- "loss": 0.1607,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 2.24,
203
- "grad_norm": 0.6462648510932922,
204
  "learning_rate": 0.00048603918063821566,
205
- "loss": 0.1581,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 2.32,
210
- "grad_norm": 0.5296216607093811,
211
  "learning_rate": 0.0004512085501204253,
212
- "loss": 0.1525,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 2.4,
217
- "grad_norm": 0.8066571950912476,
218
  "learning_rate": 0.0004166156266419489,
219
- "loss": 0.1595,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 2.48,
224
- "grad_norm": 0.596865177154541,
225
  "learning_rate": 0.000382428943448705,
226
- "loss": 0.15,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 2.56,
231
- "grad_norm": 0.5388411283493042,
232
  "learning_rate": 0.00034881505462477783,
233
- "loss": 0.1376,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 2.64,
238
- "grad_norm": 0.44305840134620667,
239
  "learning_rate": 0.00031593772365766105,
240
- "loss": 0.1467,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 2.7199999999999998,
245
- "grad_norm": 0.4659726917743683,
246
  "learning_rate": 0.0002839571255990088,
247
- "loss": 0.1435,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 2.8,
252
- "grad_norm": 0.4379129111766815,
253
  "learning_rate": 0.0002530290667078846,
254
- "loss": 0.1428,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 2.88,
259
- "grad_norm": 0.4827200174331665,
260
  "learning_rate": 0.000223304225378328,
261
- "loss": 0.1468,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 2.96,
266
- "grad_norm": 0.3618449568748474,
267
  "learning_rate": 0.00019492741804936621,
268
- "loss": 0.1462,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 3.04,
273
- "grad_norm": 0.34718525409698486,
274
  "learning_rate": 0.0001680368936738792,
275
- "loss": 0.124,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 3.12,
280
- "grad_norm": 0.3362080156803131,
281
  "learning_rate": 0.00014276366018359842,
282
- "loss": 0.0822,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 3.2,
287
- "grad_norm": 0.4773559868335724,
288
  "learning_rate": 0.00011923084623163172,
289
- "loss": 0.0831,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 3.2800000000000002,
294
- "grad_norm": 0.43467625975608826,
295
  "learning_rate": 9.755310132204298e-05,
296
- "loss": 0.0868,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 3.36,
301
- "grad_norm": 0.5573152899742126,
302
  "learning_rate": 7.783603724899258e-05,
303
- "loss": 0.0841,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 3.44,
308
- "grad_norm": 0.33357012271881104,
309
  "learning_rate": 6.0175713566691824e-05,
310
- "loss": 0.0776,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 3.52,
315
- "grad_norm": 0.3988214135169983,
316
  "learning_rate": 4.465816959691149e-05,
317
- "loss": 0.0797,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 3.6,
322
- "grad_norm": 0.3149195611476898,
323
  "learning_rate": 3.1359005254054274e-05,
324
- "loss": 0.0794,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 3.68,
329
- "grad_norm": 0.5255022048950195,
330
  "learning_rate": 2.0343012729971243e-05,
331
- "loss": 0.0781,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 3.76,
336
- "grad_norm": 0.4124266803264618,
337
  "learning_rate": 1.166386083291604e-05,
338
- "loss": 0.0826,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 3.84,
343
- "grad_norm": 0.35789692401885986,
344
  "learning_rate": 5.363833518505834e-06,
345
- "loss": 0.0762,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 3.92,
350
- "grad_norm": 0.5189103484153748,
351
  "learning_rate": 1.4736238865398766e-06,
352
- "loss": 0.0737,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 4.0,
357
- "grad_norm": 0.39071929454803467,
358
  "learning_rate": 1.2184647302626584e-08,
359
- "loss": 0.0763,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 4.0,
364
  "step": 500,
365
- "total_flos": 5441966439399424.0,
366
- "train_loss": 0.20355752086639403,
367
- "train_runtime": 387.1143,
368
- "train_samples_per_second": 10.333,
369
- "train_steps_per_second": 1.292
370
  }
371
  ],
372
  "logging_steps": 10,
@@ -386,7 +386,7 @@
386
  "attributes": {}
387
  }
388
  },
389
- "total_flos": 5441966439399424.0,
390
  "train_batch_size": 1,
391
  "trial_name": null,
392
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.08,
14
+ "grad_norm": 0.6058010458946228,
15
  "learning_rate": 0.00017999999999999998,
16
+ "loss": 0.4696,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.16,
21
+ "grad_norm": 0.7855823040008545,
22
  "learning_rate": 0.00038,
23
+ "loss": 0.3417,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.24,
28
+ "grad_norm": 0.5843826532363892,
29
  "learning_rate": 0.00058,
30
+ "loss": 0.2864,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.32,
35
+ "grad_norm": 0.6763360500335693,
36
  "learning_rate": 0.0007800000000000001,
37
+ "loss": 0.3075,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.4,
42
+ "grad_norm": 0.7557706832885742,
43
  "learning_rate": 0.00098,
44
+ "loss": 0.3017,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.48,
49
+ "grad_norm": 0.6372764110565186,
50
  "learning_rate": 0.0009990133642141358,
51
+ "loss": 0.347,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.56,
56
+ "grad_norm": 0.6951160430908203,
57
  "learning_rate": 0.0009956077701257708,
58
+ "loss": 0.2951,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.64,
63
+ "grad_norm": 0.8329572677612305,
64
  "learning_rate": 0.000989787624799672,
65
+ "loss": 0.3089,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.72,
70
+ "grad_norm": 0.8762990236282349,
71
  "learning_rate": 0.0009815812833988292,
72
+ "loss": 0.3095,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.8,
77
+ "grad_norm": 0.6908068060874939,
78
  "learning_rate": 0.0009710287263936483,
79
+ "loss": 0.2886,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.88,
84
+ "grad_norm": 0.8244242668151855,
85
  "learning_rate": 0.0009581813647811198,
86
+ "loss": 0.322,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.96,
91
+ "grad_norm": 0.7944660782814026,
92
  "learning_rate": 0.0009431017896156073,
93
+ "loss": 0.3048,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 1.04,
98
+ "grad_norm": 0.6548622250556946,
99
  "learning_rate": 0.0009258634670715238,
100
+ "loss": 0.2989,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 1.12,
105
+ "grad_norm": 0.6823218464851379,
106
  "learning_rate": 0.0009065503805235138,
107
+ "loss": 0.2466,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 1.2,
112
+ "grad_norm": 0.8037425875663757,
113
  "learning_rate": 0.0008852566213878947,
114
+ "loss": 0.2485,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 1.28,
119
+ "grad_norm": 0.8479170799255371,
120
  "learning_rate": 0.0008620859307187339,
121
+ "loss": 0.2401,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.3599999999999999,
126
+ "grad_norm": 0.5974708199501038,
127
  "learning_rate": 0.0008371511937918616,
128
+ "loss": 0.2637,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.44,
133
+ "grad_norm": 0.5917499661445618,
134
  "learning_rate": 0.0008105738901391552,
135
+ "loss": 0.2456,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 1.52,
140
+ "grad_norm": 0.8342835903167725,
141
  "learning_rate": 0.0007824835017124689,
142
+ "loss": 0.2471,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 1.6,
147
+ "grad_norm": 0.6186763048171997,
148
  "learning_rate": 0.0007530168820605818,
149
+ "loss": 0.2342,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 1.6800000000000002,
154
+ "grad_norm": 0.7886436581611633,
155
  "learning_rate": 0.0007223175895924637,
156
+ "loss": 0.2495,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 1.76,
161
+ "grad_norm": 0.5301917195320129,
162
  "learning_rate": 0.0006905351881751372,
163
+ "loss": 0.2291,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 1.8399999999999999,
168
+ "grad_norm": 0.5933790802955627,
169
  "learning_rate": 0.0006578245184735513,
170
+ "loss": 0.2139,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 1.92,
175
+ "grad_norm": 0.6522702574729919,
176
  "learning_rate": 0.0006243449435824276,
177
+ "loss": 0.2206,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 2.0,
182
+ "grad_norm": 0.47575750946998596,
183
  "learning_rate": 0.0005902595726252801,
184
+ "loss": 0.2289,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 2.08,
189
+ "grad_norm": 0.4569144546985626,
190
  "learning_rate": 0.0005557344661031627,
191
+ "loss": 0.1546,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 2.16,
196
+ "grad_norm": 0.4253688454627991,
197
  "learning_rate": 0.0005209378268645998,
198
+ "loss": 0.1582,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 2.24,
203
+ "grad_norm": 0.4019651412963867,
204
  "learning_rate": 0.00048603918063821566,
205
+ "loss": 0.1461,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 2.32,
210
+ "grad_norm": 1.082315444946289,
211
  "learning_rate": 0.0004512085501204253,
212
+ "loss": 0.1669,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 2.4,
217
+ "grad_norm": 0.4050568640232086,
218
  "learning_rate": 0.0004166156266419489,
219
+ "loss": 0.1461,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 2.48,
224
+ "grad_norm": 0.6206286549568176,
225
  "learning_rate": 0.000382428943448705,
226
+ "loss": 0.1564,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 2.56,
231
+ "grad_norm": 0.49500882625579834,
232
  "learning_rate": 0.00034881505462477783,
233
+ "loss": 0.1505,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 2.64,
238
+ "grad_norm": 0.5360985398292542,
239
  "learning_rate": 0.00031593772365766105,
240
+ "loss": 0.1432,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 2.7199999999999998,
245
+ "grad_norm": 0.4015409052371979,
246
  "learning_rate": 0.0002839571255990088,
247
+ "loss": 0.136,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 2.8,
252
+ "grad_norm": 0.5249755382537842,
253
  "learning_rate": 0.0002530290667078846,
254
+ "loss": 0.1605,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 2.88,
259
+ "grad_norm": 0.3920833468437195,
260
  "learning_rate": 0.000223304225378328,
261
+ "loss": 0.1511,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 2.96,
266
+ "grad_norm": 0.36147066950798035,
267
  "learning_rate": 0.00019492741804936621,
268
+ "loss": 0.1403,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 3.04,
273
+ "grad_norm": 0.28068122267723083,
274
  "learning_rate": 0.0001680368936738792,
275
+ "loss": 0.1229,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 3.12,
280
+ "grad_norm": 0.37893545627593994,
281
  "learning_rate": 0.00014276366018359842,
282
+ "loss": 0.0914,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 3.2,
287
+ "grad_norm": 0.42108216881752014,
288
  "learning_rate": 0.00011923084623163172,
289
+ "loss": 0.0823,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 3.2800000000000002,
294
+ "grad_norm": 0.400537371635437,
295
  "learning_rate": 9.755310132204298e-05,
296
+ "loss": 0.08,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 3.36,
301
+ "grad_norm": 0.42910516262054443,
302
  "learning_rate": 7.783603724899258e-05,
303
+ "loss": 0.0801,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 3.44,
308
+ "grad_norm": 0.46107789874076843,
309
  "learning_rate": 6.0175713566691824e-05,
310
+ "loss": 0.0797,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 3.52,
315
+ "grad_norm": 0.42276251316070557,
316
  "learning_rate": 4.465816959691149e-05,
317
+ "loss": 0.085,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 3.6,
322
+ "grad_norm": 0.3870842456817627,
323
  "learning_rate": 3.1359005254054274e-05,
324
+ "loss": 0.0866,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 3.68,
329
+ "grad_norm": 0.4392528831958771,
330
  "learning_rate": 2.0343012729971243e-05,
331
+ "loss": 0.0759,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 3.76,
336
+ "grad_norm": 0.3069448173046112,
337
  "learning_rate": 1.166386083291604e-05,
338
+ "loss": 0.079,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 3.84,
343
+ "grad_norm": 0.39865902066230774,
344
  "learning_rate": 5.363833518505834e-06,
345
+ "loss": 0.0927,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 3.92,
350
+ "grad_norm": 0.3342863917350769,
351
  "learning_rate": 1.4736238865398766e-06,
352
+ "loss": 0.0767,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 4.0,
357
+ "grad_norm": 0.34820282459259033,
358
  "learning_rate": 1.2184647302626584e-08,
359
+ "loss": 0.078,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 4.0,
364
  "step": 500,
365
+ "total_flos": 5490451427622912.0,
366
+ "train_loss": 0.19940226113796233,
367
+ "train_runtime": 384.5053,
368
+ "train_samples_per_second": 10.403,
369
+ "train_steps_per_second": 1.3
370
  }
371
  ],
372
  "logging_steps": 10,
 
386
  "attributes": {}
387
  }
388
  },
389
+ "total_flos": 5490451427622912.0,
390
  "train_batch_size": 1,
391
  "trial_name": null,
392
  "trial_params": null
training_loss.png CHANGED