ggbetz committed on
Commit
f05f27e
·
verified ·
1 Parent(s): 46f77b6

Model save

Browse files
README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/mfr9vzm9)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/8afyfsbz)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.9873417721518987,
3
  "total_flos": 0.0,
4
- "train_loss": 0.3332489388329642,
5
- "train_runtime": 1764.0917,
6
- "train_samples": 1578,
7
- "train_samples_per_second": 1.789,
8
  "train_steps_per_second": 0.056
9
  }
 
1
  {
2
+ "epoch": 1.9820224719101125,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.340618884563446,
5
+ "train_runtime": 1955.8801,
6
+ "train_samples": 1779,
7
+ "train_samples_per_second": 1.819,
8
  "train_steps_per_second": 0.056
9
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0194ae5449bcff49cacf9b7a32bdf40f0c0b9b97d6e6f08c5c0a337f3c316482
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872f675707c09411dbb21f0c56ce21a75a907a172ce62eab35fbc9ac386d87d3
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f76427d1ed7882fa2e1d45aab92abf3c0b7493118cf2e62d2b3d5cd5a36f292
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9bba22ea8a58699ae221ee9488c2cffe45b95c03ff0f7d585b957bceadb231
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3db0121dcf6b93ce60e6957cecdaa7f36181c5f0877e209970bb7a416797a0e
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8d81c8cb4860860e1e8ebc9f7920a5497b153e08513ac9b6af2d4656427d313
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db2af0685928fdee52f9e5fef1e81f3d9c595770472ad366bf881f352c775300
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bacab750cca6f653341a857fdbe0f2616bae9c79632f48a574760daf38647180
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.9873417721518987,
3
  "total_flos": 0.0,
4
- "train_loss": 0.3332489388329642,
5
- "train_runtime": 1764.0917,
6
- "train_samples": 1578,
7
- "train_samples_per_second": 1.789,
8
  "train_steps_per_second": 0.056
9
  }
 
1
  {
2
+ "epoch": 1.9820224719101125,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.340618884563446,
5
+ "train_runtime": 1955.8801,
6
+ "train_samples": 1779,
7
+ "train_samples_per_second": 1.819,
8
  "train_steps_per_second": 0.056
9
  }
trainer_state.json CHANGED
@@ -1,310 +1,355 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9873417721518987,
5
  "eval_steps": 500,
6
- "global_step": 98,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.10126582278481013,
13
- "grad_norm": 164.88829092196647,
14
- "learning_rate": 5.1020408163265303e-08,
15
- "logits/chosen": -1.1785156726837158,
16
- "logits/rejected": -1.236425757408142,
17
- "logps/chosen": -227.52499389648438,
18
- "logps/rejected": -236.91250610351562,
19
- "loss": 0.3665,
20
- "rewards/accuracies": 0.8187500238418579,
21
- "rewards/chosen": -1.726477026939392,
22
- "rewards/margins": 2.431835889816284,
23
- "rewards/rejected": -4.161084175109863,
24
  "step": 5
25
  },
26
  {
27
- "epoch": 0.20253164556962025,
28
- "grad_norm": 196.04682481931437,
29
- "learning_rate": 1.0204081632653061e-07,
30
- "logits/chosen": -1.272363305091858,
31
- "logits/rejected": -1.274511694908142,
32
- "logps/chosen": -220.14999389648438,
33
- "logps/rejected": -245.1999969482422,
34
- "loss": 0.3605,
35
- "rewards/accuracies": 0.84375,
36
- "rewards/chosen": -2.18218994140625,
37
- "rewards/margins": 2.397656202316284,
38
- "rewards/rejected": -4.5774078369140625,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.3037974683544304,
43
- "grad_norm": 181.94134870297944,
44
- "learning_rate": 1.5306122448979592e-07,
45
- "logits/chosen": -1.19287109375,
46
- "logits/rejected": -1.20751953125,
47
- "logps/chosen": -198.3125,
48
- "logps/rejected": -232.83749389648438,
49
- "loss": 0.4412,
50
- "rewards/accuracies": 0.8187500238418579,
51
- "rewards/chosen": -0.9437316656112671,
52
- "rewards/margins": 2.420117139816284,
53
- "rewards/rejected": -3.36419677734375,
54
  "step": 15
55
  },
56
  {
57
- "epoch": 0.4050632911392405,
58
- "grad_norm": 176.48931870428808,
59
- "learning_rate": 2.0408163265306121e-07,
60
- "logits/chosen": -1.23291015625,
61
- "logits/rejected": -1.23095703125,
62
- "logps/chosen": -216.875,
63
- "logps/rejected": -227.72500610351562,
64
- "loss": 0.4458,
65
- "rewards/accuracies": 0.8125,
66
- "rewards/chosen": -1.646325707435608,
67
- "rewards/margins": 2.341747999191284,
68
- "rewards/rejected": -3.9908447265625,
69
  "step": 20
70
  },
71
  {
72
- "epoch": 0.5063291139240507,
73
- "grad_norm": 152.41353229457945,
74
- "learning_rate": 2.551020408163265e-07,
75
- "logits/chosen": -1.217187523841858,
76
- "logits/rejected": -1.1962890625,
77
- "logps/chosen": -198.1999969482422,
78
- "logps/rejected": -234.0749969482422,
79
- "loss": 0.3668,
80
- "rewards/accuracies": 0.824999988079071,
81
- "rewards/chosen": -1.236718773841858,
82
- "rewards/margins": 2.2503418922424316,
83
- "rewards/rejected": -3.4869627952575684,
84
  "step": 25
85
  },
86
  {
87
- "epoch": 0.6075949367088608,
88
- "grad_norm": 238.15664690468444,
89
- "learning_rate": 3.0612244897959183e-07,
90
- "logits/chosen": -1.1808593273162842,
91
- "logits/rejected": -1.198828101158142,
92
- "logps/chosen": -203.9875030517578,
93
- "logps/rejected": -218.33749389648438,
94
- "loss": 0.3598,
95
- "rewards/accuracies": 0.84375,
96
- "rewards/chosen": -1.8857848644256592,
97
- "rewards/margins": 2.5849852561950684,
98
- "rewards/rejected": -4.472509860992432,
99
  "step": 30
100
  },
101
  {
102
- "epoch": 0.7088607594936709,
103
- "grad_norm": 173.56185951357247,
104
- "learning_rate": 3.5714285714285716e-07,
105
- "logits/chosen": -1.2887694835662842,
106
- "logits/rejected": -1.2497069835662842,
107
- "logps/chosen": -227.25,
108
- "logps/rejected": -272.48748779296875,
109
- "loss": 0.3668,
110
- "rewards/accuracies": 0.8687499761581421,
111
- "rewards/chosen": -1.4921875,
112
- "rewards/margins": 2.6267333030700684,
113
- "rewards/rejected": -4.119189262390137,
114
  "step": 35
115
  },
116
  {
117
- "epoch": 0.810126582278481,
118
- "grad_norm": 178.120520590497,
119
- "learning_rate": 4.0816326530612243e-07,
120
- "logits/chosen": -1.236230492591858,
121
- "logits/rejected": -1.26806640625,
122
- "logps/chosen": -230.9499969482422,
123
- "logps/rejected": -250.83749389648438,
124
- "loss": 0.3654,
125
  "rewards/accuracies": 0.8500000238418579,
126
- "rewards/chosen": -1.5751953125,
127
- "rewards/margins": 2.5509276390075684,
128
- "rewards/rejected": -4.125390529632568,
129
  "step": 40
130
  },
131
  {
132
- "epoch": 0.9113924050632911,
133
- "grad_norm": 305.3433425219198,
134
- "learning_rate": 4.5918367346938775e-07,
135
- "logits/chosen": -1.2873046398162842,
136
- "logits/rejected": -1.2756836414337158,
137
- "logps/chosen": -216.85000610351562,
138
- "logps/rejected": -247.8000030517578,
139
- "loss": 0.3923,
140
- "rewards/accuracies": 0.78125,
141
- "rewards/chosen": -2.557360887527466,
142
- "rewards/margins": 2.396679639816284,
143
- "rewards/rejected": -4.952929496765137,
144
  "step": 45
145
  },
146
  {
147
- "epoch": 1.0151898734177216,
148
- "grad_norm": 175.7849952088377,
149
- "learning_rate": 4.897959183673469e-07,
150
- "logits/chosen": -1.28271484375,
151
- "logits/rejected": -1.275292992591858,
152
- "logps/chosen": -196.35000610351562,
153
- "logps/rejected": -230.125,
154
- "loss": 0.3045,
155
- "rewards/accuracies": 0.8687499761581421,
156
- "rewards/chosen": -1.607446312904358,
157
- "rewards/margins": 2.945483446121216,
158
- "rewards/rejected": -4.556542873382568,
159
  "step": 50
160
  },
161
  {
162
- "epoch": 1.1164556962025316,
163
- "grad_norm": 93.12529608401417,
164
- "learning_rate": 4.387755102040816e-07,
165
- "logits/chosen": -1.211328148841858,
166
- "logits/rejected": -1.239160180091858,
167
- "logps/chosen": -234.8125,
168
- "logps/rejected": -263.42498779296875,
169
- "loss": 0.3077,
170
- "rewards/accuracies": 0.856249988079071,
171
- "rewards/chosen": -1.0397613048553467,
172
- "rewards/margins": 2.655688524246216,
173
- "rewards/rejected": -3.6968016624450684,
174
  "step": 55
175
  },
176
  {
177
- "epoch": 1.2177215189873418,
178
- "grad_norm": 128.88871435709157,
179
- "learning_rate": 3.877551020408163e-07,
180
- "logits/chosen": -1.2453124523162842,
181
- "logits/rejected": -1.257714867591858,
182
- "logps/chosen": -217.125,
183
- "logps/rejected": -236.60000610351562,
184
- "loss": 0.2759,
185
- "rewards/accuracies": 0.90625,
186
- "rewards/chosen": -1.4923827648162842,
187
- "rewards/margins": 2.579882860183716,
188
- "rewards/rejected": -4.072167873382568,
189
  "step": 60
190
  },
191
  {
192
- "epoch": 1.3189873417721518,
193
- "grad_norm": 100.50051637621726,
194
- "learning_rate": 3.3673469387755096e-07,
195
- "logits/chosen": -1.2649414539337158,
196
- "logits/rejected": -1.2864258289337158,
197
- "logps/chosen": -195.9250030517578,
198
- "logps/rejected": -230.8874969482422,
199
- "loss": 0.318,
200
- "rewards/accuracies": 0.8812500238418579,
201
- "rewards/chosen": -2.084057569503784,
202
- "rewards/margins": 2.5894532203674316,
203
- "rewards/rejected": -4.67529296875,
204
  "step": 65
205
  },
206
  {
207
- "epoch": 1.420253164556962,
208
- "grad_norm": 116.04677407046486,
209
- "learning_rate": 2.857142857142857e-07,
210
- "logits/chosen": -1.25537109375,
211
- "logits/rejected": -1.2003905773162842,
212
- "logps/chosen": -211.52499389648438,
213
- "logps/rejected": -223.4499969482422,
214
- "loss": 0.303,
215
- "rewards/accuracies": 0.84375,
216
- "rewards/chosen": -1.956323266029358,
217
- "rewards/margins": 2.680859327316284,
218
- "rewards/rejected": -4.637743949890137,
219
  "step": 70
220
  },
221
  {
222
- "epoch": 1.5215189873417723,
223
- "grad_norm": 165.72279289188168,
224
- "learning_rate": 2.346938775510204e-07,
225
- "logits/chosen": -1.200585961341858,
226
- "logits/rejected": -1.214257836341858,
227
- "logps/chosen": -216.3249969482422,
228
- "logps/rejected": -247.9875030517578,
229
- "loss": 0.2451,
230
  "rewards/accuracies": 0.8687499761581421,
231
- "rewards/chosen": -1.793310523033142,
232
- "rewards/margins": 2.6591796875,
233
- "rewards/rejected": -4.44970703125,
234
  "step": 75
235
  },
236
  {
237
- "epoch": 1.6227848101265823,
238
- "grad_norm": 72.70953479370881,
239
- "learning_rate": 1.836734693877551e-07,
240
- "logits/chosen": -1.2444336414337158,
241
- "logits/rejected": -1.2509765625,
242
- "logps/chosen": -198.1374969482422,
243
- "logps/rejected": -238.6750030517578,
244
- "loss": 0.2367,
245
- "rewards/accuracies": 0.8999999761581421,
246
- "rewards/chosen": -1.5260741710662842,
247
- "rewards/margins": 2.770703077316284,
248
- "rewards/rejected": -4.296618461608887,
249
  "step": 80
250
  },
251
  {
252
- "epoch": 1.7240506329113923,
253
- "grad_norm": 102.46666266044015,
254
- "learning_rate": 1.326530612244898e-07,
255
- "logits/chosen": -1.174902319908142,
256
- "logits/rejected": -1.1663086414337158,
257
- "logps/chosen": -213.0749969482422,
258
- "logps/rejected": -241.3000030517578,
259
- "loss": 0.2643,
260
- "rewards/accuracies": 0.887499988079071,
261
- "rewards/chosen": -1.648535132408142,
262
- "rewards/margins": 2.9351563453674316,
263
- "rewards/rejected": -4.583788871765137,
264
  "step": 85
265
  },
266
  {
267
- "epoch": 1.8253164556962025,
268
- "grad_norm": 96.17442050283816,
269
- "learning_rate": 8.163265306122448e-08,
270
- "logits/chosen": -1.3068358898162842,
271
- "logits/rejected": -1.2685546875,
272
- "logps/chosen": -197.96249389648438,
273
- "logps/rejected": -240.0749969482422,
274
- "loss": 0.2913,
275
- "rewards/accuracies": 0.8812500238418579,
276
- "rewards/chosen": -1.8278076648712158,
277
- "rewards/margins": 2.79736328125,
278
- "rewards/rejected": -4.623583793640137,
279
  "step": 90
280
  },
281
  {
282
- "epoch": 1.9265822784810127,
283
- "grad_norm": 177.81461241439493,
284
- "learning_rate": 3.0612244897959183e-08,
285
- "logits/chosen": -1.195214867591858,
286
- "logits/rejected": -1.21337890625,
287
- "logps/chosen": -233.21249389648438,
288
- "logps/rejected": -256.3999938964844,
289
- "loss": 0.2971,
290
- "rewards/accuracies": 0.90625,
291
- "rewards/chosen": -1.703759789466858,
292
- "rewards/margins": 3.0367188453674316,
293
- "rewards/rejected": -4.742871284484863,
294
  "step": 95
295
  },
296
  {
297
- "epoch": 1.9873417721518987,
298
- "step": 98,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  "total_flos": 0.0,
300
- "train_loss": 0.3332489388329642,
301
- "train_runtime": 1764.0917,
302
- "train_samples_per_second": 1.789,
303
  "train_steps_per_second": 0.056
304
  }
305
  ],
306
  "logging_steps": 5,
307
- "max_steps": 98,
308
  "num_input_tokens_seen": 0,
309
  "num_train_epochs": 2,
310
  "save_steps": 50,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9820224719101125,
5
  "eval_steps": 500,
6
+ "global_step": 110,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0898876404494382,
13
+ "grad_norm": 250.8028214771956,
14
+ "learning_rate": 4.545454545454545e-08,
15
+ "logits/chosen": -1.31005859375,
16
+ "logits/rejected": -1.302734375,
17
+ "logps/chosen": -219.6999969482422,
18
+ "logps/rejected": -277.82501220703125,
19
+ "loss": 0.3525,
20
+ "rewards/accuracies": 0.84375,
21
+ "rewards/chosen": -2.008740186691284,
22
+ "rewards/margins": 2.612988233566284,
23
+ "rewards/rejected": -4.619116306304932,
24
  "step": 5
25
  },
26
  {
27
+ "epoch": 0.1797752808988764,
28
+ "grad_norm": 170.2256638749618,
29
+ "learning_rate": 9.09090909090909e-08,
30
+ "logits/chosen": -1.200585961341858,
31
+ "logits/rejected": -1.272363305091858,
32
+ "logps/chosen": -205.9250030517578,
33
+ "logps/rejected": -230.5124969482422,
34
+ "loss": 0.3659,
35
+ "rewards/accuracies": 0.824999988079071,
36
+ "rewards/chosen": -1.0198485851287842,
37
+ "rewards/margins": 2.195556640625,
38
+ "rewards/rejected": -3.2144532203674316,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.2696629213483146,
43
+ "grad_norm": 295.1115992445591,
44
+ "learning_rate": 1.3636363636363635e-07,
45
+ "logits/chosen": -1.2916991710662842,
46
+ "logits/rejected": -1.23388671875,
47
+ "logps/chosen": -180.6999969482422,
48
+ "logps/rejected": -212.14999389648438,
49
+ "loss": 0.437,
50
+ "rewards/accuracies": 0.8374999761581421,
51
+ "rewards/chosen": -2.502392530441284,
52
+ "rewards/margins": 2.3900389671325684,
53
+ "rewards/rejected": -4.890429496765137,
54
  "step": 15
55
  },
56
  {
57
+ "epoch": 0.3595505617977528,
58
+ "grad_norm": 277.7225526041515,
59
+ "learning_rate": 1.818181818181818e-07,
60
+ "logits/chosen": -1.259765625,
61
+ "logits/rejected": -1.2590820789337158,
62
+ "logps/chosen": -197.78750610351562,
63
+ "logps/rejected": -218.91250610351562,
64
+ "loss": 0.3354,
65
+ "rewards/accuracies": 0.8374999761581421,
66
+ "rewards/chosen": -2.3748536109924316,
67
+ "rewards/margins": 2.6907958984375,
68
+ "rewards/rejected": -5.071179389953613,
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.449438202247191,
73
+ "grad_norm": 115.01279948814654,
74
+ "learning_rate": 2.2727272727272726e-07,
75
+ "logits/chosen": -1.180078148841858,
76
+ "logits/rejected": -1.196386694908142,
77
+ "logps/chosen": -206.75,
78
+ "logps/rejected": -224.52499389648438,
79
+ "loss": 0.4124,
80
+ "rewards/accuracies": 0.8125,
81
+ "rewards/chosen": -2.122851610183716,
82
+ "rewards/margins": 1.923669457435608,
83
+ "rewards/rejected": -4.044787406921387,
84
  "step": 25
85
  },
86
  {
87
+ "epoch": 0.5393258426966292,
88
+ "grad_norm": 72.00062004716999,
89
+ "learning_rate": 2.727272727272727e-07,
90
+ "logits/chosen": -1.245996117591858,
91
+ "logits/rejected": -1.2501952648162842,
92
+ "logps/chosen": -208.91250610351562,
93
+ "logps/rejected": -217.8125,
94
+ "loss": 0.3735,
95
+ "rewards/accuracies": 0.8187500238418579,
96
+ "rewards/chosen": -1.261322021484375,
97
+ "rewards/margins": 2.326733350753784,
98
+ "rewards/rejected": -3.585009813308716,
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.6292134831460674,
103
+ "grad_norm": 159.67880662723658,
104
+ "learning_rate": 3.1818181818181815e-07,
105
+ "logits/chosen": -1.2498047351837158,
106
+ "logits/rejected": -1.212011694908142,
107
+ "logps/chosen": -200.75,
108
+ "logps/rejected": -203.1750030517578,
109
+ "loss": 0.3959,
110
+ "rewards/accuracies": 0.831250011920929,
111
+ "rewards/chosen": -2.0471434593200684,
112
+ "rewards/margins": 2.1449217796325684,
113
+ "rewards/rejected": -4.191888332366943,
114
  "step": 35
115
  },
116
  {
117
+ "epoch": 0.7191011235955056,
118
+ "grad_norm": 159.3228331743839,
119
+ "learning_rate": 3.636363636363636e-07,
120
+ "logits/chosen": -1.304296851158142,
121
+ "logits/rejected": -1.2903320789337158,
122
+ "logps/chosen": -211.1750030517578,
123
+ "logps/rejected": -218.71249389648438,
124
+ "loss": 0.3872,
125
  "rewards/accuracies": 0.8500000238418579,
126
+ "rewards/chosen": -1.924902319908142,
127
+ "rewards/margins": 2.205249071121216,
128
+ "rewards/rejected": -4.130883693695068,
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.8089887640449438,
133
+ "grad_norm": 209.54928027137586,
134
+ "learning_rate": 4.090909090909091e-07,
135
+ "logits/chosen": -1.2551758289337158,
136
+ "logits/rejected": -1.2458007335662842,
137
+ "logps/chosen": -222.3249969482422,
138
+ "logps/rejected": -237.64999389648438,
139
+ "loss": 0.4076,
140
+ "rewards/accuracies": 0.8125,
141
+ "rewards/chosen": -1.8654601573944092,
142
+ "rewards/margins": 2.3793578147888184,
143
+ "rewards/rejected": -4.247656345367432,
144
  "step": 45
145
  },
146
  {
147
+ "epoch": 0.898876404494382,
148
+ "grad_norm": 134.6125923552392,
149
+ "learning_rate": 4.545454545454545e-07,
150
+ "logits/chosen": -1.2243163585662842,
151
+ "logits/rejected": -1.208593726158142,
152
+ "logps/chosen": -203.58749389648438,
153
+ "logps/rejected": -223.9875030517578,
154
+ "loss": 0.4013,
155
+ "rewards/accuracies": 0.793749988079071,
156
+ "rewards/chosen": -2.0265870094299316,
157
+ "rewards/margins": 2.5132813453674316,
158
+ "rewards/rejected": -4.545800685882568,
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.9887640449438202,
163
+ "grad_norm": 189.31983158924976,
164
+ "learning_rate": 5e-07,
165
+ "logits/chosen": -1.2102539539337158,
166
+ "logits/rejected": -1.205078125,
167
+ "logps/chosen": -204.6750030517578,
168
+ "logps/rejected": -235.71249389648438,
169
+ "loss": 0.4268,
170
+ "rewards/accuracies": 0.78125,
171
+ "rewards/chosen": -1.649023413658142,
172
+ "rewards/margins": 2.216552734375,
173
+ "rewards/rejected": -3.8661131858825684,
174
  "step": 55
175
  },
176
  {
177
+ "epoch": 1.0831460674157303,
178
+ "grad_norm": 125.21413207415151,
179
+ "learning_rate": 4.545454545454545e-07,
180
+ "logits/chosen": -1.2853515148162842,
181
+ "logits/rejected": -1.318261742591858,
182
+ "logps/chosen": -202.3000030517578,
183
+ "logps/rejected": -216.0124969482422,
184
+ "loss": 0.3312,
185
+ "rewards/accuracies": 0.8812500238418579,
186
+ "rewards/chosen": -1.6000487804412842,
187
+ "rewards/margins": 2.2996582984924316,
188
+ "rewards/rejected": -3.900146484375,
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 1.1730337078651685,
193
+ "grad_norm": 164.73999131891642,
194
+ "learning_rate": 4.090909090909091e-07,
195
+ "logits/chosen": -1.240136742591858,
196
+ "logits/rejected": -1.2451171875,
197
+ "logps/chosen": -229.66250610351562,
198
+ "logps/rejected": -257.3999938964844,
199
+ "loss": 0.2984,
200
+ "rewards/accuracies": 0.84375,
201
+ "rewards/chosen": -1.916986107826233,
202
+ "rewards/margins": 2.175122022628784,
203
+ "rewards/rejected": -4.092577934265137,
204
  "step": 65
205
  },
206
  {
207
+ "epoch": 1.2629213483146067,
208
+ "grad_norm": 99.20074290357307,
209
+ "learning_rate": 3.636363636363636e-07,
210
+ "logits/chosen": -1.254492163658142,
211
+ "logits/rejected": -1.2688477039337158,
212
+ "logps/chosen": -222.0625,
213
+ "logps/rejected": -236.8625030517578,
214
+ "loss": 0.3024,
215
+ "rewards/accuracies": 0.862500011920929,
216
+ "rewards/chosen": -1.540551781654358,
217
+ "rewards/margins": 2.271484375,
218
+ "rewards/rejected": -3.810473680496216,
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 1.3528089887640449,
223
+ "grad_norm": 126.64899063848813,
224
+ "learning_rate": 3.1818181818181815e-07,
225
+ "logits/chosen": -1.2423827648162842,
226
+ "logits/rejected": -1.212158203125,
227
+ "logps/chosen": -208.27499389648438,
228
+ "logps/rejected": -230.75,
229
+ "loss": 0.2873,
230
  "rewards/accuracies": 0.8687499761581421,
231
+ "rewards/chosen": -1.522576928138733,
232
+ "rewards/margins": 2.6622557640075684,
233
+ "rewards/rejected": -4.184668064117432,
234
  "step": 75
235
  },
236
  {
237
+ "epoch": 1.442696629213483,
238
+ "grad_norm": 117.37315757338318,
239
+ "learning_rate": 2.727272727272727e-07,
240
+ "logits/chosen": -1.244140625,
241
+ "logits/rejected": -1.20654296875,
242
+ "logps/chosen": -203.5812530517578,
243
+ "logps/rejected": -249.3249969482422,
244
+ "loss": 0.3876,
245
+ "rewards/accuracies": 0.793749988079071,
246
+ "rewards/chosen": -1.797338843345642,
247
+ "rewards/margins": 2.134106397628784,
248
+ "rewards/rejected": -3.930224657058716,
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 1.5325842696629213,
253
+ "grad_norm": 133.10246641657412,
254
+ "learning_rate": 2.2727272727272726e-07,
255
+ "logits/chosen": -1.252050757408142,
256
+ "logits/rejected": -1.2130858898162842,
257
+ "logps/chosen": -202.4499969482422,
258
+ "logps/rejected": -219.47500610351562,
259
+ "loss": 0.3168,
260
+ "rewards/accuracies": 0.84375,
261
+ "rewards/chosen": -1.588891625404358,
262
+ "rewards/margins": 2.5704102516174316,
263
+ "rewards/rejected": -4.15966796875,
264
  "step": 85
265
  },
266
  {
267
+ "epoch": 1.6224719101123597,
268
+ "grad_norm": 129.44212708304207,
269
+ "learning_rate": 1.818181818181818e-07,
270
+ "logits/chosen": -1.2580077648162842,
271
+ "logits/rejected": -1.262597680091858,
272
+ "logps/chosen": -165.0124969482422,
273
+ "logps/rejected": -193.21249389648438,
274
+ "loss": 0.2857,
275
+ "rewards/accuracies": 0.893750011920929,
276
+ "rewards/chosen": -1.4154541492462158,
277
+ "rewards/margins": 2.55810546875,
278
+ "rewards/rejected": -3.975329637527466,
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 1.7123595505617977,
283
+ "grad_norm": 133.23298622428365,
284
+ "learning_rate": 1.3636363636363635e-07,
285
+ "logits/chosen": -1.26171875,
286
+ "logits/rejected": -1.2370116710662842,
287
+ "logps/chosen": -191.8874969482422,
288
+ "logps/rejected": -232.5124969482422,
289
+ "loss": 0.2636,
290
+ "rewards/accuracies": 0.862500011920929,
291
+ "rewards/chosen": -1.913476586341858,
292
+ "rewards/margins": 3.128124952316284,
293
+ "rewards/rejected": -5.037988185882568,
294
  "step": 95
295
  },
296
  {
297
+ "epoch": 1.802247191011236,
298
+ "grad_norm": 121.7166467503501,
299
+ "learning_rate": 9.09090909090909e-08,
300
+ "logits/chosen": -1.2141602039337158,
301
+ "logits/rejected": -1.228906273841858,
302
+ "logps/chosen": -199.4375,
303
+ "logps/rejected": -229.64999389648438,
304
+ "loss": 0.2476,
305
+ "rewards/accuracies": 0.8999999761581421,
306
+ "rewards/chosen": -1.579687476158142,
307
+ "rewards/margins": 2.887890577316284,
308
+ "rewards/rejected": -4.467577934265137,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 1.892134831460674,
313
+ "grad_norm": 124.25297719784706,
314
+ "learning_rate": 4.545454545454545e-08,
315
+ "logits/chosen": -1.291894555091858,
316
+ "logits/rejected": -1.346093773841858,
317
+ "logps/chosen": -214.0625,
318
+ "logps/rejected": -233.0749969482422,
319
+ "loss": 0.2455,
320
+ "rewards/accuracies": 0.8999999761581421,
321
+ "rewards/chosen": -2.6200804710388184,
322
+ "rewards/margins": 2.843066453933716,
323
+ "rewards/rejected": -5.462133884429932,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 1.9820224719101125,
328
+ "grad_norm": 113.18883753786767,
329
+ "learning_rate": 0.0,
330
+ "logits/chosen": -1.2438476085662842,
331
+ "logits/rejected": -1.2707030773162842,
332
+ "logps/chosen": -215.46249389648438,
333
+ "logps/rejected": -218.7375030517578,
334
+ "loss": 0.2322,
335
+ "rewards/accuracies": 0.8999999761581421,
336
+ "rewards/chosen": -2.182504177093506,
337
+ "rewards/margins": 2.7933106422424316,
338
+ "rewards/rejected": -4.975976467132568,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 1.9820224719101125,
343
+ "step": 110,
344
  "total_flos": 0.0,
345
+ "train_loss": 0.340618884563446,
346
+ "train_runtime": 1955.8801,
347
+ "train_samples_per_second": 1.819,
348
  "train_steps_per_second": 0.056
349
  }
350
  ],
351
  "logging_steps": 5,
352
+ "max_steps": 110,
353
  "num_input_tokens_seen": 0,
354
  "num_train_epochs": 2,
355
  "save_steps": 50,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19a4d7ba1f5921cec23798ad2026f81f5e42e812d37f484757273cf4aea3a061
3
  size 7672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03d967e1eb02dcc35b71618607ff5d6e5a726deda0443e58f9972946a8d826b4
3
  size 7672