{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5025, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009950248756218905, "grad_norm": 37.25846862792969, "learning_rate": 2.45e-07, "logits/chosen": 5.909375190734863, "logits/rejected": 6.022812366485596, "logps/chosen": -153.30499267578125, "logps/rejected": -145.34500122070312, "loss": 23.1951, "rewards/accuracies": 0.5337499976158142, "rewards/chosen": 86.37000274658203, "rewards/margins": 8.612265586853027, "rewards/rejected": 77.75499725341797, "step": 50 }, { "epoch": 0.01990049751243781, "grad_norm": 46.854312896728516, "learning_rate": 4.95e-07, "logits/chosen": 5.64968729019165, "logits/rejected": 5.914999961853027, "logps/chosen": -149.85000610351562, "logps/rejected": -148.31500244140625, "loss": 23.2079, "rewards/accuracies": 0.5256249904632568, "rewards/chosen": 84.31500244140625, "rewards/margins": 5.061445236206055, "rewards/rejected": 79.23249816894531, "step": 100 }, { "epoch": 0.029850746268656716, "grad_norm": 53.076805114746094, "learning_rate": 4.998778891959453e-07, "logits/chosen": 5.153749942779541, "logits/rejected": 5.412187576293945, "logps/chosen": -150.7550048828125, "logps/rejected": -148.91000366210938, "loss": 23.4129, "rewards/accuracies": 0.5212500095367432, "rewards/chosen": 82.07499694824219, "rewards/margins": 5.790234565734863, "rewards/rejected": 76.28500366210938, "step": 150 }, { "epoch": 0.03980099502487562, "grad_norm": 76.89788818359375, "learning_rate": 4.99501662760924e-07, "logits/chosen": 4.105234146118164, "logits/rejected": 4.434531211853027, "logps/chosen": -165.125, "logps/rejected": -161.82749938964844, "loss": 22.0375, "rewards/accuracies": 0.5193750262260437, "rewards/chosen": 80.40499877929688, "rewards/margins": 5.854726791381836, "rewards/rejected": 74.58000183105469, "step": 200 }, { "epoch": 0.04975124378109453, "grad_norm": 106.3976821899414, "learning_rate": 4.988716525160205e-07, "logits/chosen": 2.590937614440918, "logits/rejected": 2.960390567779541, "logps/chosen": -181.55999755859375, "logps/rejected": -182.18499755859375, "loss": 22.0983, "rewards/accuracies": 0.5206249952316284, "rewards/chosen": 79.50749969482422, "rewards/margins": 5.624882698059082, "rewards/rejected": 73.90750122070312, "step": 250 }, { "epoch": 0.05970149253731343, "grad_norm": 154.4518585205078, "learning_rate": 4.979884992842194e-07, "logits/chosen": 1.157080054283142, "logits/rejected": 1.4771264791488647, "logps/chosen": -199.74000549316406, "logps/rejected": -208.74000549316406, "loss": 23.0821, "rewards/accuracies": 0.4806250035762787, "rewards/chosen": 75.35250091552734, "rewards/margins": 1.8104979991912842, "rewards/rejected": 73.51750183105469, "step": 300 }, { "epoch": 0.06965174129353234, "grad_norm": 159.42955017089844, "learning_rate": 4.968531013761348e-07, "logits/chosen": -0.5976855754852295, "logits/rejected": -0.2811816334724426, "logps/chosen": -253.47000122070312, "logps/rejected": -241.38999938964844, "loss": 19.8918, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 79.19000244140625, "rewards/margins": 8.498632431030273, "rewards/rejected": 70.72000122070312, "step": 350 }, { "epoch": 0.07960199004975124, "grad_norm": 113.71151733398438, "learning_rate": 4.954666136762819e-07, "logits/chosen": -2.210566520690918, "logits/rejected": -1.936132788658142, "logps/chosen": -294.9599914550781, "logps/rejected": -279.9599914550781, "loss": 18.7674, "rewards/accuracies": 0.5331249833106995, "rewards/chosen": 72.56375122070312, "rewards/margins": 8.443652153015137, "rewards/rejected": 64.11750030517578, "step": 400 }, { "epoch": 0.08955223880597014, "grad_norm": 420.630859375, "learning_rate": 4.938304464683715e-07, "logits/chosen": -3.8620312213897705, "logits/rejected": -3.5835156440734863, "logps/chosen": -352.239990234375, "logps/rejected": -348.32000732421875, "loss": 18.0366, "rewards/accuracies": 0.5162500143051147, "rewards/chosen": 59.02375030517578, "rewards/margins": 2.2487499713897705, "rewards/rejected": 56.75749969482422, "step": 450 }, { "epoch": 0.09950248756218906, "grad_norm": 1939.998046875, "learning_rate": 4.91946264000822e-07, "logits/chosen": -4.11453104019165, "logits/rejected": -4.022890567779541, "logps/chosen": -524.02001953125, "logps/rejected": -501.67999267578125, "loss": 13.3907, "rewards/accuracies": 0.5350000262260437, "rewards/chosen": 50.61375045776367, "rewards/margins": 6.952011585235596, "rewards/rejected": 43.663124084472656, "step": 500 }, { "epoch": 0.10945273631840796, "grad_norm": 1124.046142578125, "learning_rate": 4.898159827939476e-07, "logits/chosen": -4.222187519073486, "logits/rejected": -4.111406326293945, "logps/chosen": -715.0800170898438, "logps/rejected": -685.260009765625, "loss": 10.9502, "rewards/accuracies": 0.5181249976158142, "rewards/chosen": 25.342500686645508, "rewards/margins": 0.6623925566673279, "rewards/rejected": 24.693124771118164, "step": 550 }, { "epoch": 0.11940298507462686, "grad_norm": 1316.1282958984375, "learning_rate": 4.874417696905456e-07, "logits/chosen": -4.313593864440918, "logits/rejected": -4.233281135559082, "logps/chosen": -738.239990234375, "logps/rejected": -716.5800170898438, "loss": 8.9983, "rewards/accuracies": 0.5450000166893005, "rewards/chosen": 25.038436889648438, "rewards/margins": 3.051743268966675, "rewards/rejected": 21.988750457763672, "step": 600 }, { "epoch": 0.12935323383084577, "grad_norm": 1739.2020263671875, "learning_rate": 4.848260396518637e-07, "logits/chosen": -4.20578145980835, "logits/rejected": -4.150312423706055, "logps/chosen": -768.8599853515625, "logps/rejected": -742.8200073242188, "loss": 9.0056, "rewards/accuracies": 0.546875, "rewards/chosen": 24.521249771118164, "rewards/margins": 3.205258846282959, "rewards/rejected": 21.316171646118164, "step": 650 }, { "epoch": 0.13930348258706468, "grad_norm": 1328.8701171875, "learning_rate": 4.819714533011918e-07, "logits/chosen": -4.449375152587891, "logits/rejected": -4.36984395980835, "logps/chosen": -743.5399780273438, "logps/rejected": -737.8800048828125, "loss": 7.7838, "rewards/accuracies": 0.5268750190734863, "rewards/chosen": 19.766250610351562, "rewards/margins": 2.2691991329193115, "rewards/rejected": 17.490938186645508, "step": 700 }, { "epoch": 0.14925373134328357, "grad_norm": 1090.959716796875, "learning_rate": 4.788809142175751e-07, "logits/chosen": -4.560468673706055, "logits/rejected": -4.488906383514404, "logps/chosen": -800.739990234375, "logps/rejected": -747.9600219726562, "loss": 7.3371, "rewards/accuracies": 0.5674999952316284, "rewards/chosen": 18.475936889648438, "rewards/margins": 3.2167186737060547, "rewards/rejected": 15.255346298217773, "step": 750 }, { "epoch": 0.15920398009950248, "grad_norm": 3159.435302734375, "learning_rate": 4.755575659824014e-07, "logits/chosen": -4.382031440734863, "logits/rejected": -4.326250076293945, "logps/chosen": -796.97998046875, "logps/rejected": -792.5800170898438, "loss": 8.1944, "rewards/accuracies": 0.5537499785423279, "rewards/chosen": 16.225391387939453, "rewards/margins": 1.8960351943969727, "rewards/rejected": 14.33435344696045, "step": 800 }, { "epoch": 0.1691542288557214, "grad_norm": 2522.934326171875, "learning_rate": 4.7200478898186656e-07, "logits/chosen": -4.401249885559082, "logits/rejected": -4.393125057220459, "logps/chosen": -801.2000122070312, "logps/rejected": -761.7000122070312, "loss": 7.0774, "rewards/accuracies": 0.5487499833106995, "rewards/chosen": 15.95101547241211, "rewards/margins": 1.5917773246765137, "rewards/rejected": 14.3623046875, "step": 850 }, { "epoch": 0.1791044776119403, "grad_norm": 1642.070068359375, "learning_rate": 4.68226196968572e-07, "logits/chosen": -3.7705469131469727, "logits/rejected": -3.7598438262939453, "logps/chosen": -800.5999755859375, "logps/rejected": -739.3800048828125, "loss": 7.2798, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 17.5234375, "rewards/margins": 1.022646427154541, "rewards/rejected": 16.5008602142334, "step": 900 }, { "epoch": 0.1890547263681592, "grad_norm": 1916.802001953125, "learning_rate": 4.642256333857497e-07, "logits/chosen": -3.6234374046325684, "logits/rejected": -3.5637500286102295, "logps/chosen": -764.1599731445312, "logps/rejected": -759.6799926757812, "loss": 6.8376, "rewards/accuracies": 0.5568749904632568, "rewards/chosen": 17.314218521118164, "rewards/margins": 2.4442381858825684, "rewards/rejected": 14.867304801940918, "step": 950 }, { "epoch": 0.19900497512437812, "grad_norm": 2218.57373046875, "learning_rate": 4.600071674578551e-07, "logits/chosen": -4.034062385559082, "logits/rejected": -4.010156154632568, "logps/chosen": -813.4600219726562, "logps/rejected": -766.9000244140625, "loss": 6.9447, "rewards/accuracies": 0.5418750047683716, "rewards/chosen": 16.6539249420166, "rewards/margins": 2.051767587661743, "rewards/rejected": 14.60546875, "step": 1000 }, { "epoch": 0.208955223880597, "grad_norm": 4437.78369140625, "learning_rate": 4.555750900515026e-07, "logits/chosen": -4.153124809265137, "logits/rejected": -4.062812328338623, "logps/chosen": -789.47998046875, "logps/rejected": -759.8200073242188, "loss": 6.6345, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 15.9857816696167, "rewards/margins": 1.8259130716323853, "rewards/rejected": 14.157539367675781, "step": 1050 }, { "epoch": 0.21890547263681592, "grad_norm": 1672.605224609375, "learning_rate": 4.5093390931095656e-07, "logits/chosen": -4.221562385559082, "logits/rejected": -4.203437328338623, "logps/chosen": -795.239990234375, "logps/rejected": -751.6400146484375, "loss": 6.3275, "rewards/accuracies": 0.5362499952316284, "rewards/chosen": 16.673358917236328, "rewards/margins": 1.631040096282959, "rewards/rejected": 15.046093940734863, "step": 1100 }, { "epoch": 0.22885572139303484, "grad_norm": 845.5272216796875, "learning_rate": 4.4608834607261394e-07, "logits/chosen": -4.139531135559082, "logits/rejected": -4.091875076293945, "logps/chosen": -826.239990234375, "logps/rejected": -776.7999877929688, "loss": 5.9411, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 15.388359069824219, "rewards/margins": 2.7884082794189453, "rewards/rejected": 12.600312232971191, "step": 1150 }, { "epoch": 0.23880597014925373, "grad_norm": 2127.64697265625, "learning_rate": 4.4104332906314545e-07, "logits/chosen": -4.543749809265137, "logits/rejected": -4.498437404632568, "logps/chosen": -763.5999755859375, "logps/rejected": -782.280029296875, "loss": 6.0186, "rewards/accuracies": 0.5381249785423279, "rewards/chosen": 16.952342987060547, "rewards/margins": 2.063539981842041, "rewards/rejected": 14.888437271118164, "step": 1200 }, { "epoch": 0.24875621890547264, "grad_norm": 3592.465576171875, "learning_rate": 4.358039898861784e-07, "logits/chosen": -3.616874933242798, "logits/rejected": -3.5975780487060547, "logps/chosen": -805.9600219726562, "logps/rejected": -754.1599731445312, "loss": 7.1137, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 16.315702438354492, "rewards/margins": 1.9189550876617432, "rewards/rejected": 14.396132469177246, "step": 1250 }, { "epoch": 0.25870646766169153, "grad_norm": 2655.087646484375, "learning_rate": 4.303756578026196e-07, "logits/chosen": -4.052499771118164, "logits/rejected": -3.9873437881469727, "logps/chosen": -822.5599975585938, "logps/rejected": -769.0599975585938, "loss": 6.3728, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 13.059394836425781, "rewards/margins": 1.4543017148971558, "rewards/rejected": 11.605507850646973, "step": 1300 }, { "epoch": 0.26865671641791045, "grad_norm": 3106.407470703125, "learning_rate": 4.247638543099302e-07, "logits/chosen": -4.597812652587891, "logits/rejected": -4.58078145980835, "logps/chosen": -821.6799926757812, "logps/rejected": -774.0399780273438, "loss": 5.3592, "rewards/accuracies": 0.5612499713897705, "rewards/chosen": 15.785625457763672, "rewards/margins": 2.451181650161743, "rewards/rejected": 13.333086013793945, "step": 1350 }, { "epoch": 0.27860696517412936, "grad_norm": 3253.3349609375, "learning_rate": 4.189742875258636e-07, "logits/chosen": -4.145625114440918, "logits/rejected": -4.125, "logps/chosen": -818.6799926757812, "logps/rejected": -775.5999755859375, "loss": 5.955, "rewards/accuracies": 0.5543749928474426, "rewards/chosen": 14.581796646118164, "rewards/margins": 0.9402441382408142, "rewards/rejected": 13.644218444824219, "step": 1400 }, { "epoch": 0.2885572139303483, "grad_norm": 2794.8505859375, "learning_rate": 4.1301284638238023e-07, "logits/chosen": -4.417500019073486, "logits/rejected": -4.430781364440918, "logps/chosen": -873.97998046875, "logps/rejected": -809.7999877929688, "loss": 5.9541, "rewards/accuracies": 0.5625, "rewards/chosen": 16.624374389648438, "rewards/margins": 2.9456982612609863, "rewards/rejected": 13.676972389221191, "step": 1450 }, { "epoch": 0.29850746268656714, "grad_norm": 3549.135009765625, "learning_rate": 4.068855946356451e-07, "logits/chosen": -4.357968807220459, "logits/rejected": -4.272500038146973, "logps/chosen": -815.0999755859375, "logps/rejected": -799.0999755859375, "loss": 7.1658, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 12.853320121765137, "rewards/margins": -0.4855078160762787, "rewards/rejected": 13.337441444396973, "step": 1500 }, { "epoch": 0.30845771144278605, "grad_norm": 2740.423095703125, "learning_rate": 4.005987646982011e-07, "logits/chosen": -4.377812385559082, "logits/rejected": -4.360000133514404, "logps/chosen": -861.0, "logps/rejected": -819.9000244140625, "loss": 6.0543, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 13.93810749053955, "rewards/margins": 1.2174170017242432, "rewards/rejected": 12.72454833984375, "step": 1550 }, { "epoch": 0.31840796019900497, "grad_norm": 2517.54248046875, "learning_rate": 3.9415875129958994e-07, "logits/chosen": -4.250625133514404, "logits/rejected": -4.236562728881836, "logps/chosen": -870.3599853515625, "logps/rejected": -832.8400268554688, "loss": 6.4818, "rewards/accuracies": 0.5443750023841858, "rewards/chosen": 12.261445045471191, "rewards/margins": 0.11241699010133743, "rewards/rejected": 12.149633407592773, "step": 1600 }, { "epoch": 0.3283582089552239, "grad_norm": 4087.48828125, "learning_rate": 3.875721049818718e-07, "logits/chosen": -4.099062442779541, "logits/rejected": -4.049062728881836, "logps/chosen": -868.47998046875, "logps/rejected": -826.6400146484375, "loss": 5.7788, "rewards/accuracies": 0.5299999713897705, "rewards/chosen": 12.918557167053223, "rewards/margins": 0.6001172065734863, "rewards/rejected": 12.3140230178833, "step": 1650 }, { "epoch": 0.3383084577114428, "grad_norm": 3558.606689453125, "learning_rate": 3.808455254366574e-07, "logits/chosen": -3.7817187309265137, "logits/rejected": -3.768437385559082, "logps/chosen": -857.0599975585938, "logps/rejected": -832.239990234375, "loss": 6.1453, "rewards/accuracies": 0.5162500143051147, "rewards/chosen": 13.769579887390137, "rewards/margins": 1.5330761671066284, "rewards/rejected": 12.234726905822754, "step": 1700 }, { "epoch": 0.3482587064676617, "grad_norm": 3828.70068359375, "learning_rate": 3.739858546904308e-07, "logits/chosen": -4.390937328338623, "logits/rejected": -4.3046875, "logps/chosen": -837.1799926757812, "logps/rejected": -835.3599853515625, "loss": 6.6707, "rewards/accuracies": 0.5400000214576721, "rewards/chosen": 11.527656555175781, "rewards/margins": 0.20983397960662842, "rewards/rejected": 11.319659233093262, "step": 1750 }, { "epoch": 0.3582089552238806, "grad_norm": 4480.52392578125, "learning_rate": 3.6700007014509514e-07, "logits/chosen": -4.233593940734863, "logits/rejected": -4.196249961853027, "logps/chosen": -868.5, "logps/rejected": -844.9000244140625, "loss": 5.0152, "rewards/accuracies": 0.5418750047683716, "rewards/chosen": 11.055917739868164, "rewards/margins": 1.0635205507278442, "rewards/rejected": 9.988080978393555, "step": 1800 }, { "epoch": 0.3681592039800995, "grad_norm": 2393.2080078125, "learning_rate": 3.5989527748081805e-07, "logits/chosen": -4.220937728881836, "logits/rejected": -4.229062557220459, "logps/chosen": -885.8800048828125, "logps/rejected": -871.0800170898438, "loss": 5.564, "rewards/accuracies": 0.5206249952316284, "rewards/chosen": 10.777030944824219, "rewards/margins": 0.5477758646011353, "rewards/rejected": 10.22183609008789, "step": 1850 }, { "epoch": 0.3781094527363184, "grad_norm": 2142.15185546875, "learning_rate": 3.52678703428399e-07, "logits/chosen": -3.959531307220459, "logits/rejected": -3.898750066757202, "logps/chosen": -828.0800170898438, "logps/rejected": -837.0, "loss": 4.9398, "rewards/accuracies": 0.5493749976158142, "rewards/chosen": 10.465898513793945, "rewards/margins": 1.3795897960662842, "rewards/rejected": 9.086328506469727, "step": 1900 }, { "epoch": 0.3880597014925373, "grad_norm": 5701.14794921875, "learning_rate": 3.45357688418507e-07, "logits/chosen": -3.465625047683716, "logits/rejected": -3.4301562309265137, "logps/chosen": -863.3200073242188, "logps/rejected": -833.1799926757812, "loss": 4.9191, "rewards/accuracies": 0.5206249952316284, "rewards/chosen": 11.544062614440918, "rewards/margins": 1.2966210842132568, "rewards/rejected": 10.246211051940918, "step": 1950 }, { "epoch": 0.39800995024875624, "grad_norm": 2166.156005859375, "learning_rate": 3.3793967911526797e-07, "logits/chosen": -4.175624847412109, "logits/rejected": -4.157968521118164, "logps/chosen": -864.739990234375, "logps/rejected": -818.3200073242188, "loss": 4.9636, "rewards/accuracies": 0.5587499737739563, "rewards/chosen": 10.2691011428833, "rewards/margins": 1.404970645904541, "rewards/rejected": 8.860605239868164, "step": 2000 }, { "epoch": 0.4079601990049751, "grad_norm": 1748.567626953125, "learning_rate": 3.3043222084179477e-07, "logits/chosen": -4.447812557220459, "logits/rejected": -4.435625076293945, "logps/chosen": -864.1799926757812, "logps/rejected": -806.9400024414062, "loss": 4.3649, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 11.546367645263672, "rewards/margins": 1.7490723133087158, "rewards/rejected": 9.790781021118164, "step": 2050 }, { "epoch": 0.417910447761194, "grad_norm": 4250.59228515625, "learning_rate": 3.228429499053651e-07, "logits/chosen": -3.7835936546325684, "logits/rejected": -3.764218807220459, "logps/chosen": -896.280029296875, "logps/rejected": -840.52001953125, "loss": 5.556, "rewards/accuracies": 0.53125, "rewards/chosen": 12.199726104736328, "rewards/margins": 0.49269530177116394, "rewards/rejected": 11.707152366638184, "step": 2100 }, { "epoch": 0.42786069651741293, "grad_norm": 2269.677734375, "learning_rate": 3.151795858300542e-07, "logits/chosen": -4.282343864440918, "logits/rejected": -4.28640604019165, "logps/chosen": -864.52001953125, "logps/rejected": -833.02001953125, "loss": 4.4658, "rewards/accuracies": 0.5256249904632568, "rewards/chosen": 10.688271522521973, "rewards/margins": 0.9122143387794495, "rewards/rejected": 9.776113510131836, "step": 2150 }, { "epoch": 0.43781094527363185, "grad_norm": 2995.49267578125, "learning_rate": 3.0744992350472184e-07, "logits/chosen": -4.102499961853027, "logits/rejected": -4.065000057220459, "logps/chosen": -891.3599853515625, "logps/rejected": -809.739990234375, "loss": 3.9283, "rewards/accuracies": 0.5575000047683716, "rewards/chosen": 10.750624656677246, "rewards/margins": 1.9478063583374023, "rewards/rejected": 8.801519393920898, "step": 2200 }, { "epoch": 0.44776119402985076, "grad_norm": 2685.581298828125, "learning_rate": 2.9966182525434136e-07, "logits/chosen": -4.429843902587891, "logits/rejected": -4.410468578338623, "logps/chosen": -917.9000244140625, "logps/rejected": -884.8599853515625, "loss": 4.9653, "rewards/accuracies": 0.5181249976158142, "rewards/chosen": 8.635839462280273, "rewards/margins": 0.5861572027206421, "rewards/rejected": 8.053730010986328, "step": 2250 }, { "epoch": 0.4577114427860697, "grad_norm": 1810.1524658203125, "learning_rate": 2.9182321284273524e-07, "logits/chosen": -4.380312442779541, "logits/rejected": -4.308281421661377, "logps/chosen": -892.0999755859375, "logps/rejected": -817.1799926757812, "loss": 4.4186, "rewards/accuracies": 0.5537499785423279, "rewards/chosen": 8.945687294006348, "rewards/margins": 1.2103466987609863, "rewards/rejected": 7.736120223999023, "step": 2300 }, { "epoch": 0.46766169154228854, "grad_norm": 1638.470458984375, "learning_rate": 2.839420594148518e-07, "logits/chosen": -4.286562442779541, "logits/rejected": -4.321406364440918, "logps/chosen": -856.7000122070312, "logps/rejected": -857.2999877929688, "loss": 4.0532, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 9.412128448486328, "rewards/margins": 1.397641658782959, "rewards/rejected": 8.009862899780273, "step": 2350 }, { "epoch": 0.47761194029850745, "grad_norm": 4396.177734375, "learning_rate": 2.7602638138677834e-07, "logits/chosen": -4.463749885559082, "logits/rejected": -4.425156116485596, "logps/chosen": -903.4199829101562, "logps/rejected": -882.0800170898438, "loss": 4.5126, "rewards/accuracies": 0.5487499833106995, "rewards/chosen": 7.66628885269165, "rewards/margins": 1.1259644031524658, "rewards/rejected": 6.541113376617432, "step": 2400 }, { "epoch": 0.48756218905472637, "grad_norm": 4004.349609375, "learning_rate": 2.6808423029174143e-07, "logits/chosen": -4.335468769073486, "logits/rejected": -4.302812576293945, "logps/chosen": -876.6799926757812, "logps/rejected": -830.239990234375, "loss": 4.9104, "rewards/accuracies": 0.5393750071525574, "rewards/chosen": 10.196874618530273, "rewards/margins": 0.7247558832168579, "rewards/rejected": 9.472156524658203, "step": 2450 }, { "epoch": 0.4975124378109453, "grad_norm": 2645.156494140625, "learning_rate": 2.6012368459038625e-07, "logits/chosen": -4.241718769073486, "logits/rejected": -4.247031211853027, "logps/chosen": -940.6400146484375, "logps/rejected": -873.52001953125, "loss": 4.4678, "rewards/accuracies": 0.5193750262260437, "rewards/chosen": 7.638674259185791, "rewards/margins": 0.7611132860183716, "rewards/rejected": 6.874751091003418, "step": 2500 }, { "epoch": 0.5074626865671642, "grad_norm": 4464.42724609375, "learning_rate": 2.5215284145366754e-07, "logits/chosen": -4.28781270980835, "logits/rejected": -4.303593635559082, "logps/chosen": -898.02001953125, "logps/rejected": -856.219970703125, "loss": 4.8918, "rewards/accuracies": 0.5393750071525574, "rewards/chosen": 9.058222770690918, "rewards/margins": 0.16966308653354645, "rewards/rejected": 8.891836166381836, "step": 2550 }, { "epoch": 0.5174129353233831, "grad_norm": 1550.342041015625, "learning_rate": 2.4417980852670795e-07, "logits/chosen": -4.276875019073486, "logits/rejected": -4.283124923706055, "logps/chosen": -908.8800048828125, "logps/rejected": -837.219970703125, "loss": 3.7542, "rewards/accuracies": 0.5493749976158142, "rewards/chosen": 8.9493408203125, "rewards/margins": 1.7262645959854126, "rewards/rejected": 7.21969747543335, "step": 2600 }, { "epoch": 0.527363184079602, "grad_norm": 1139.1016845703125, "learning_rate": 2.3621269568200348e-07, "logits/chosen": -4.569843769073486, "logits/rejected": -4.572031021118164, "logps/chosen": -863.0399780273438, "logps/rejected": -832.1199951171875, "loss": 4.4595, "rewards/accuracies": 0.5274999737739563, "rewards/chosen": 9.005471229553223, "rewards/margins": 0.9330615401268005, "rewards/rejected": 8.067304611206055, "step": 2650 }, { "epoch": 0.5373134328358209, "grad_norm": 3259.9931640625, "learning_rate": 2.2825960677036263e-07, "logits/chosen": -5.025000095367432, "logits/rejected": -5.025312423706055, "logps/chosen": -900.02001953125, "logps/rejected": -855.4600219726562, "loss": 3.8295, "rewards/accuracies": 0.5162500143051147, "rewards/chosen": 7.501829624176025, "rewards/margins": 0.9494946002960205, "rewards/rejected": 6.554053783416748, "step": 2700 }, { "epoch": 0.5472636815920398, "grad_norm": 1576.5633544921875, "learning_rate": 2.2032863137797098e-07, "logits/chosen": -4.931250095367432, "logits/rejected": -4.935312271118164, "logps/chosen": -888.0, "logps/rejected": -890.8200073242188, "loss": 3.6169, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 6.200995922088623, "rewards/margins": 2.0712647438049316, "rewards/rejected": 4.132159233093262, "step": 2750 }, { "epoch": 0.5572139303482587, "grad_norm": 2286.4580078125, "learning_rate": 2.1242783659796472e-07, "logits/chosen": -5.111249923706055, "logits/rejected": -5.120625019073486, "logps/chosen": -899.4000244140625, "logps/rejected": -861.1199951171875, "loss": 4.0467, "rewards/accuracies": 0.5299999713897705, "rewards/chosen": 6.854379653930664, "rewards/margins": 0.9951757788658142, "rewards/rejected": 5.860227108001709, "step": 2800 }, { "epoch": 0.5671641791044776, "grad_norm": 1399.0577392578125, "learning_rate": 2.0456525882488414e-07, "logits/chosen": -5.425624847412109, "logits/rejected": -5.343437671661377, "logps/chosen": -907.280029296875, "logps/rejected": -847.0599975585938, "loss": 4.3571, "rewards/accuracies": 0.5137500166893005, "rewards/chosen": 7.121167182922363, "rewards/margins": 0.05256347730755806, "rewards/rejected": 7.0656046867370605, "step": 2850 }, { "epoch": 0.5771144278606966, "grad_norm": 1395.7008056640625, "learning_rate": 1.967488955803515e-07, "logits/chosen": -5.565000057220459, "logits/rejected": -5.533124923706055, "logps/chosen": -921.8800048828125, "logps/rejected": -867.5599975585938, "loss": 3.6962, "rewards/accuracies": 0.5237500071525574, "rewards/chosen": 8.013593673706055, "rewards/margins": 1.2249804735183716, "rewards/rejected": 6.789748668670654, "step": 2900 }, { "epoch": 0.5870646766169154, "grad_norm": 1817.9039306640625, "learning_rate": 1.8898669737829009e-07, "logits/chosen": -5.284999847412109, "logits/rejected": -5.328750133514404, "logps/chosen": -901.7999877929688, "logps/rejected": -848.260009765625, "loss": 3.4712, "rewards/accuracies": 0.5325000286102295, "rewards/chosen": 8.152949333190918, "rewards/margins": 1.7032690048217773, "rewards/rejected": 6.451176643371582, "step": 2950 }, { "epoch": 0.5970149253731343, "grad_norm": 2175.98291015625, "learning_rate": 1.8128655963795654e-07, "logits/chosen": -5.226718902587891, "logits/rejected": -5.177187442779541, "logps/chosen": -896.52001953125, "logps/rejected": -842.5, "loss": 4.3828, "rewards/accuracies": 0.5337499976158142, "rewards/chosen": 6.346333026885986, "rewards/margins": -0.23146240413188934, "rewards/rejected": 6.57891845703125, "step": 3000 }, { "epoch": 0.6069651741293532, "grad_norm": 2423.90869140625, "learning_rate": 1.736563146530148e-07, "logits/chosen": -5.147812366485596, "logits/rejected": -5.121250152587891, "logps/chosen": -906.780029296875, "logps/rejected": -842.0599975585938, "loss": 3.618, "rewards/accuracies": 0.5256249904632568, "rewards/chosen": 7.217099666595459, "rewards/margins": 1.0019750595092773, "rewards/rejected": 6.212661266326904, "step": 3050 }, { "epoch": 0.6169154228855721, "grad_norm": 1796.0404052734375, "learning_rate": 1.6610372362481795e-07, "logits/chosen": -5.517499923706055, "logits/rejected": -5.500937461853027, "logps/chosen": -888.3800048828125, "logps/rejected": -870.8200073242188, "loss": 3.9941, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 6.678945541381836, "rewards/margins": 0.21024902164936066, "rewards/rejected": 6.4722514152526855, "step": 3100 }, { "epoch": 0.6268656716417911, "grad_norm": 3228.182373046875, "learning_rate": 1.5863646876800294e-07, "logits/chosen": -5.522500038146973, "logits/rejected": -5.519062519073486, "logps/chosen": -917.4600219726562, "logps/rejected": -894.3400268554688, "loss": 3.9995, "rewards/accuracies": 0.5049999952316284, "rewards/chosen": 6.917697906494141, "rewards/margins": 0.9786840677261353, "rewards/rejected": 5.941035270690918, "step": 3150 }, { "epoch": 0.6368159203980099, "grad_norm": 1673.2705078125, "learning_rate": 1.512621454964278e-07, "logits/chosen": -5.52468729019165, "logits/rejected": -5.51687479019165, "logps/chosen": -920.6400146484375, "logps/rejected": -874.0399780273438, "loss": 3.5676, "rewards/accuracies": 0.5256249904632568, "rewards/chosen": 6.860390663146973, "rewards/margins": 0.9590514898300171, "rewards/rejected": 5.902841567993164, "step": 3200 }, { "epoch": 0.6467661691542289, "grad_norm": 3450.835693359375, "learning_rate": 1.439882546973991e-07, "logits/chosen": -5.425468921661377, "logits/rejected": -5.380000114440918, "logps/chosen": -896.6799926757812, "logps/rejected": -860.0, "loss": 3.9909, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 6.930351734161377, "rewards/margins": 0.6515478491783142, "rewards/rejected": 6.279133319854736, "step": 3250 }, { "epoch": 0.6567164179104478, "grad_norm": 2366.854736328125, "learning_rate": 1.3682219510204828e-07, "logits/chosen": -5.55343770980835, "logits/rejected": -5.555312633514404, "logps/chosen": -918.0800170898438, "logps/rejected": -867.8400268554688, "loss": 3.9592, "rewards/accuracies": 0.5318750143051147, "rewards/chosen": 6.138139724731445, "rewards/margins": 0.7850878834724426, "rewards/rejected": 5.353430271148682, "step": 3300 }, { "epoch": 0.6666666666666666, "grad_norm": 2788.40771484375, "learning_rate": 1.2977125575961799e-07, "logits/chosen": -5.831562519073486, "logits/rejected": -5.809062480926514, "logps/chosen": -918.0999755859375, "logps/rejected": -877.780029296875, "loss": 3.6818, "rewards/accuracies": 0.5325000286102295, "rewards/chosen": 5.774457931518555, "rewards/margins": 0.3942529261112213, "rewards/rejected": 5.381279468536377, "step": 3350 }, { "epoch": 0.6766169154228856, "grad_norm": 2038.4530029296875, "learning_rate": 1.2284260862331184e-07, "logits/chosen": -5.459531307220459, "logits/rejected": -5.452187538146973, "logps/chosen": -868.5599975585938, "logps/rejected": -831.5, "loss": 4.1592, "rewards/accuracies": 0.5149999856948853, "rewards/chosen": 8.009712219238281, "rewards/margins": 0.3110009729862213, "rewards/rejected": 7.698652267456055, "step": 3400 }, { "epoch": 0.6865671641791045, "grad_norm": 2557.6064453125, "learning_rate": 1.1604330125525078e-07, "logits/chosen": -5.480000019073486, "logits/rejected": -5.461249828338623, "logps/chosen": -930.9000244140625, "logps/rejected": -879.5800170898438, "loss": 3.8611, "rewards/accuracies": 0.5181249976158142, "rewards/chosen": 6.370607852935791, "rewards/margins": 0.5133349895477295, "rewards/rejected": 5.857964038848877, "step": 3450 }, { "epoch": 0.6965174129353234, "grad_norm": 2376.09716796875, "learning_rate": 1.0938024965795506e-07, "logits/chosen": -5.44406270980835, "logits/rejected": -5.42312479019165, "logps/chosen": -893.1400146484375, "logps/rejected": -862.0, "loss": 3.7593, "rewards/accuracies": 0.5168750286102295, "rewards/chosen": 7.158564567565918, "rewards/margins": 0.6185815334320068, "rewards/rejected": 6.541041851043701, "step": 3500 }, { "epoch": 0.7064676616915423, "grad_norm": 3130.968017578125, "learning_rate": 1.0286023123964326e-07, "logits/chosen": -5.474531173706055, "logits/rejected": -5.479062557220459, "logps/chosen": -889.4000244140625, "logps/rejected": -889.760009765625, "loss": 3.6067, "rewards/accuracies": 0.5049999952316284, "rewards/chosen": 4.780278205871582, "rewards/margins": 0.41691163182258606, "rewards/rejected": 4.363155364990234, "step": 3550 }, { "epoch": 0.7164179104477612, "grad_norm": 2820.47412109375, "learning_rate": 9.64898779205055e-08, "logits/chosen": -5.624687671661377, "logits/rejected": -5.600468635559082, "logps/chosen": -903.4400024414062, "logps/rejected": -850.5, "loss": 3.9096, "rewards/accuracies": 0.5049999952316284, "rewards/chosen": 6.33207893371582, "rewards/margins": 0.01932373084127903, "rewards/rejected": 6.315381050109863, "step": 3600 }, { "epoch": 0.7263681592039801, "grad_norm": 3085.365478515625, "learning_rate": 9.027566938696051e-08, "logits/chosen": -5.869375228881836, "logits/rejected": -5.823437690734863, "logps/chosen": -913.4400024414062, "logps/rejected": -874.3599853515625, "loss": 4.0442, "rewards/accuracies": 0.5274999737739563, "rewards/chosen": 5.950512886047363, "rewards/margins": 0.31416991353034973, "rewards/rejected": 5.635661602020264, "step": 3650 }, { "epoch": 0.736318407960199, "grad_norm": 2496.829833984375, "learning_rate": 8.42239265007595e-08, "logits/chosen": -5.743750095367432, "logits/rejected": -5.717812538146973, "logps/chosen": -893.8800048828125, "logps/rejected": -847.2999877929688, "loss": 3.7924, "rewards/accuracies": 0.5131250023841858, "rewards/chosen": 7.561201095581055, "rewards/margins": 0.5582299828529358, "rewards/rejected": 6.997402191162109, "step": 3700 }, { "epoch": 0.746268656716418, "grad_norm": 3231.818115234375, "learning_rate": 7.834080486964115e-08, "logits/chosen": -5.849374771118164, "logits/rejected": -5.823437690734863, "logps/chosen": -919.1199951171875, "logps/rejected": -881.9600219726562, "loss": 4.0168, "rewards/accuracies": 0.4975000023841858, "rewards/chosen": 6.423149585723877, "rewards/margins": -0.03854003921151161, "rewards/rejected": 6.461066722869873, "step": 3750 }, { "epoch": 0.7562189054726368, "grad_norm": 2785.632080078125, "learning_rate": 7.263228858607615e-08, "logits/chosen": -6.016250133514404, "logits/rejected": -5.974062442779541, "logps/chosen": -896.739990234375, "logps/rejected": -847.219970703125, "loss": 3.5495, "rewards/accuracies": 0.5462499856948853, "rewards/chosen": 6.750986099243164, "rewards/margins": 0.5915331840515137, "rewards/rejected": 6.161344051361084, "step": 3800 }, { "epoch": 0.7661691542288557, "grad_norm": 1095.79541015625, "learning_rate": 6.7104184140471e-08, "logits/chosen": -5.880312442779541, "logits/rejected": -5.940000057220459, "logps/chosen": -925.3200073242188, "logps/rejected": -870.6199951171875, "loss": 3.5409, "rewards/accuracies": 0.5456249713897705, "rewards/chosen": 8.115625381469727, "rewards/margins": 1.7515722513198853, "rewards/rejected": 6.366718769073486, "step": 3850 }, { "epoch": 0.7761194029850746, "grad_norm": 1527.818115234375, "learning_rate": 6.176211451502181e-08, "logits/chosen": -5.776249885559082, "logits/rejected": -5.784687519073486, "logps/chosen": -915.5800170898438, "logps/rejected": -894.719970703125, "loss": 3.6188, "rewards/accuracies": 0.5287500023841858, "rewards/chosen": 5.876552581787109, "rewards/margins": 1.2620117664337158, "rewards/rejected": 4.617353439331055, "step": 3900 }, { "epoch": 0.7860696517412935, "grad_norm": 2342.368896484375, "learning_rate": 5.66115134642263e-08, "logits/chosen": -5.654062271118164, "logits/rejected": -5.647812366485596, "logps/chosen": -949.5, "logps/rejected": -899.1599731445312, "loss": 3.6019, "rewards/accuracies": 0.5174999833106995, "rewards/chosen": 5.2860107421875, "rewards/margins": 0.6428442597389221, "rewards/rejected": 4.6402587890625, "step": 3950 }, { "epoch": 0.7960199004975125, "grad_norm": 1863.121337890625, "learning_rate": 5.1657619987870657e-08, "logits/chosen": -5.65500020980835, "logits/rejected": -5.638437271118164, "logps/chosen": -906.2000122070312, "logps/rejected": -842.8800048828125, "loss": 4.1956, "rewards/accuracies": 0.5256249904632568, "rewards/chosen": 5.409960746765137, "rewards/margins": 0.055903319269418716, "rewards/rejected": 5.3602614402771, "step": 4000 }, { "epoch": 0.8059701492537313, "grad_norm": 3470.02197265625, "learning_rate": 4.690547300211392e-08, "logits/chosen": -5.610937595367432, "logits/rejected": -5.582656383514404, "logps/chosen": -865.780029296875, "logps/rejected": -821.719970703125, "loss": 3.7337, "rewards/accuracies": 0.5174999833106995, "rewards/chosen": 6.724204063415527, "rewards/margins": 0.2500012218952179, "rewards/rejected": 6.473584175109863, "step": 4050 }, { "epoch": 0.8159203980099502, "grad_norm": 1288.328369140625, "learning_rate": 4.235990621408972e-08, "logits/chosen": -5.644999980926514, "logits/rejected": -5.604687690734863, "logps/chosen": -901.4000244140625, "logps/rejected": -848.6799926757812, "loss": 3.7236, "rewards/accuracies": 0.5337499976158142, "rewards/chosen": 5.766840934753418, "rewards/margins": 0.43273621797561646, "rewards/rejected": 5.334909439086914, "step": 4100 }, { "epoch": 0.8258706467661692, "grad_norm": 2981.522216796875, "learning_rate": 3.802554320523949e-08, "logits/chosen": -5.59375, "logits/rejected": -5.619062423706055, "logps/chosen": -928.6199951171875, "logps/rejected": -869.6599731445312, "loss": 3.1704, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 8.217485427856445, "rewards/margins": 1.8676855564117432, "rewards/rejected": 6.345156192779541, "step": 4150 }, { "epoch": 0.835820895522388, "grad_norm": 1575.174072265625, "learning_rate": 3.390679272837724e-08, "logits/chosen": -5.644999980926514, "logits/rejected": -5.65625, "logps/chosen": -920.8800048828125, "logps/rejected": -862.9199829101562, "loss": 3.3543, "rewards/accuracies": 0.5425000190734863, "rewards/chosen": 6.809421539306641, "rewards/margins": 1.4754736423492432, "rewards/rejected": 5.333471775054932, "step": 4200 }, { "epoch": 0.845771144278607, "grad_norm": 1714.1629638671875, "learning_rate": 3.00078442232703e-08, "logits/chosen": -5.693437576293945, "logits/rejected": -5.644374847412109, "logps/chosen": -917.4000244140625, "logps/rejected": -891.1199951171875, "loss": 4.002, "rewards/accuracies": 0.5237500071525574, "rewards/chosen": 5.626728534698486, "rewards/margins": 0.3019775450229645, "rewards/rejected": 5.322280406951904, "step": 4250 }, { "epoch": 0.8557213930348259, "grad_norm": 4939.02490234375, "learning_rate": 2.633266355529684e-08, "logits/chosen": -5.634375095367432, "logits/rejected": -5.644999980926514, "logps/chosen": -928.219970703125, "logps/rejected": -868.4199829101562, "loss": 3.297, "rewards/accuracies": 0.5481250286102295, "rewards/chosen": 8.091529846191406, "rewards/margins": 2.134963274002075, "rewards/rejected": 5.956567287445068, "step": 4300 }, { "epoch": 0.8656716417910447, "grad_norm": 3157.2939453125, "learning_rate": 2.2884988981515447e-08, "logits/chosen": -5.770625114440918, "logits/rejected": -5.713437557220459, "logps/chosen": -948.8400268554688, "logps/rejected": -911.5800170898438, "loss": 4.5557, "rewards/accuracies": 0.5231249928474426, "rewards/chosen": 5.515078067779541, "rewards/margins": -0.4518188536167145, "rewards/rejected": 5.966367244720459, "step": 4350 }, { "epoch": 0.8756218905472637, "grad_norm": 1111.7291259765625, "learning_rate": 1.9668327348248857e-08, "logits/chosen": -5.699999809265137, "logits/rejected": -5.701562404632568, "logps/chosen": -895.1400146484375, "logps/rejected": -893.3200073242188, "loss": 3.5643, "rewards/accuracies": 0.5493749976158142, "rewards/chosen": 5.846921443939209, "rewards/margins": 0.8055566549301147, "rewards/rejected": 5.04319953918457, "step": 4400 }, { "epoch": 0.8855721393034826, "grad_norm": 4429.13330078125, "learning_rate": 1.6685950524050307e-08, "logits/chosen": -5.749062538146973, "logits/rejected": -5.731562614440918, "logps/chosen": -940.0399780273438, "logps/rejected": -897.780029296875, "loss": 3.4933, "rewards/accuracies": 0.53125, "rewards/chosen": 5.280101299285889, "rewards/margins": 0.7753466963768005, "rewards/rejected": 4.501115798950195, "step": 4450 }, { "epoch": 0.8955223880597015, "grad_norm": 1798.3892822265625, "learning_rate": 1.3940892071680837e-08, "logits/chosen": -5.7578125, "logits/rejected": -5.719531059265137, "logps/chosen": -913.3400268554688, "logps/rejected": -870.8599853515625, "loss": 3.8212, "rewards/accuracies": 0.5056250095367432, "rewards/chosen": 6.39865255355835, "rewards/margins": 0.44740965962409973, "rewards/rejected": 5.9497971534729, "step": 4500 }, { "epoch": 0.9054726368159204, "grad_norm": 1578.768798828125, "learning_rate": 1.1435944162481808e-08, "logits/chosen": -5.743750095367432, "logits/rejected": -5.692031383514404, "logps/chosen": -936.0999755859375, "logps/rejected": -878.8800048828125, "loss": 3.6106, "rewards/accuracies": 0.5268750190734863, "rewards/chosen": 6.700493335723877, "rewards/margins": 0.9369800090789795, "rewards/rejected": 5.758784294128418, "step": 4550 }, { "epoch": 0.9154228855721394, "grad_norm": 2657.033935546875, "learning_rate": 9.17365473628226e-09, "logits/chosen": -5.730000019073486, "logits/rejected": -5.676718711853027, "logps/chosen": -936.4600219726562, "logps/rejected": -913.5800170898438, "loss": 4.1793, "rewards/accuracies": 0.5262500047683716, "rewards/chosen": 6.953037261962891, "rewards/margins": 0.0015722656389698386, "rewards/rejected": 6.950415134429932, "step": 4600 }, { "epoch": 0.9253731343283582, "grad_norm": 5578.38916015625, "learning_rate": 7.1563249097292e-09, "logits/chosen": -5.808750152587891, "logits/rejected": -5.783437728881836, "logps/chosen": -908.780029296875, "logps/rejected": -889.7000122070312, "loss": 3.3363, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 5.7548828125, "rewards/margins": 1.054022192955017, "rewards/rejected": 4.701448917388916, "step": 4650 }, { "epoch": 0.9353233830845771, "grad_norm": 2391.306640625, "learning_rate": 5.38600663567737e-09, "logits/chosen": -5.85281229019165, "logits/rejected": -5.862187385559082, "logps/chosen": -873.1599731445312, "logps/rejected": -841.780029296875, "loss": 3.168, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": 6.133432388305664, "rewards/margins": 0.7415136694908142, "rewards/rejected": 5.392402172088623, "step": 4700 }, { "epoch": 0.945273631840796, "grad_norm": 1352.6234130859375, "learning_rate": 3.864500616019228e-09, "logits/chosen": -5.815937519073486, "logits/rejected": -5.816874980926514, "logps/chosen": -930.3800048828125, "logps/rejected": -871.1799926757812, "loss": 3.6411, "rewards/accuracies": 0.5568749904632568, "rewards/chosen": 6.584997653961182, "rewards/margins": 1.1969140768051147, "rewards/rejected": 5.385488510131836, "step": 4750 }, { "epoch": 0.9552238805970149, "grad_norm": 1797.4112548828125, "learning_rate": 2.593354470077802e-09, "logits/chosen": -5.836249828338623, "logits/rejected": -5.78249979019165, "logps/chosen": -942.0, "logps/rejected": -871.239990234375, "loss": 3.9736, "rewards/accuracies": 0.5318750143051147, "rewards/chosen": 5.091171741485596, "rewards/margins": 0.1407189965248108, "rewards/rejected": 4.95010232925415, "step": 4800 }, { "epoch": 0.9651741293532339, "grad_norm": 1682.3397216796875, "learning_rate": 1.5738611604260433e-09, "logits/chosen": -5.798749923706055, "logits/rejected": -5.760000228881836, "logps/chosen": -901.0, "logps/rejected": -837.4600219726562, "loss": 3.8623, "rewards/accuracies": 0.5318750143051147, "rewards/chosen": 6.400158882141113, "rewards/margins": 0.044941406697034836, "rewards/rejected": 6.351467132568359, "step": 4850 }, { "epoch": 0.9751243781094527, "grad_norm": 1607.166259765625, "learning_rate": 8.070576777333138e-10, "logits/chosen": -5.75390625, "logits/rejected": -5.801562309265137, "logps/chosen": -863.0599975585938, "logps/rejected": -844.0399780273438, "loss": 3.1538, "rewards/accuracies": 0.5356249809265137, "rewards/chosen": 7.545395374298096, "rewards/margins": 1.907900333404541, "rewards/rejected": 5.632500171661377, "step": 4900 }, { "epoch": 0.9850746268656716, "grad_norm": 1843.007568359375, "learning_rate": 2.937239859770735e-10, "logits/chosen": -5.721562385559082, "logits/rejected": -5.678124904632568, "logps/chosen": -901.3200073242188, "logps/rejected": -831.760009765625, "loss": 4.2105, "rewards/accuracies": 0.5493749976158142, "rewards/chosen": 8.320673942565918, "rewards/margins": 0.23246826231479645, "rewards/rejected": 8.0889253616333, "step": 4950 }, { "epoch": 0.9950248756218906, "grad_norm": 2304.98828125, "learning_rate": 3.4382229092522196e-11, "logits/chosen": -5.78781270980835, "logits/rejected": -5.800312519073486, "logps/chosen": -910.6599731445312, "logps/rejected": -895.8599853515625, "loss": 3.3647, "rewards/accuracies": 0.5425000190734863, "rewards/chosen": 7.996386528015137, "rewards/margins": 1.4546045064926147, "rewards/rejected": 6.5366530418396, "step": 5000 } ], "logging_steps": 50, "max_steps": 5025, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }