smollm3-Custom-DPO / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5025,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009950248756218905,
"grad_norm": 37.25846862792969,
"learning_rate": 2.45e-07,
"logits/chosen": 5.909375190734863,
"logits/rejected": 6.022812366485596,
"logps/chosen": -153.30499267578125,
"logps/rejected": -145.34500122070312,
"loss": 23.1951,
"rewards/accuracies": 0.5337499976158142,
"rewards/chosen": 86.37000274658203,
"rewards/margins": 8.612265586853027,
"rewards/rejected": 77.75499725341797,
"step": 50
},
{
"epoch": 0.01990049751243781,
"grad_norm": 46.854312896728516,
"learning_rate": 4.95e-07,
"logits/chosen": 5.64968729019165,
"logits/rejected": 5.914999961853027,
"logps/chosen": -149.85000610351562,
"logps/rejected": -148.31500244140625,
"loss": 23.2079,
"rewards/accuracies": 0.5256249904632568,
"rewards/chosen": 84.31500244140625,
"rewards/margins": 5.061445236206055,
"rewards/rejected": 79.23249816894531,
"step": 100
},
{
"epoch": 0.029850746268656716,
"grad_norm": 53.076805114746094,
"learning_rate": 4.998778891959453e-07,
"logits/chosen": 5.153749942779541,
"logits/rejected": 5.412187576293945,
"logps/chosen": -150.7550048828125,
"logps/rejected": -148.91000366210938,
"loss": 23.4129,
"rewards/accuracies": 0.5212500095367432,
"rewards/chosen": 82.07499694824219,
"rewards/margins": 5.790234565734863,
"rewards/rejected": 76.28500366210938,
"step": 150
},
{
"epoch": 0.03980099502487562,
"grad_norm": 76.89788818359375,
"learning_rate": 4.99501662760924e-07,
"logits/chosen": 4.105234146118164,
"logits/rejected": 4.434531211853027,
"logps/chosen": -165.125,
"logps/rejected": -161.82749938964844,
"loss": 22.0375,
"rewards/accuracies": 0.5193750262260437,
"rewards/chosen": 80.40499877929688,
"rewards/margins": 5.854726791381836,
"rewards/rejected": 74.58000183105469,
"step": 200
},
{
"epoch": 0.04975124378109453,
"grad_norm": 106.3976821899414,
"learning_rate": 4.988716525160205e-07,
"logits/chosen": 2.590937614440918,
"logits/rejected": 2.960390567779541,
"logps/chosen": -181.55999755859375,
"logps/rejected": -182.18499755859375,
"loss": 22.0983,
"rewards/accuracies": 0.5206249952316284,
"rewards/chosen": 79.50749969482422,
"rewards/margins": 5.624882698059082,
"rewards/rejected": 73.90750122070312,
"step": 250
},
{
"epoch": 0.05970149253731343,
"grad_norm": 154.4518585205078,
"learning_rate": 4.979884992842194e-07,
"logits/chosen": 1.157080054283142,
"logits/rejected": 1.4771264791488647,
"logps/chosen": -199.74000549316406,
"logps/rejected": -208.74000549316406,
"loss": 23.0821,
"rewards/accuracies": 0.4806250035762787,
"rewards/chosen": 75.35250091552734,
"rewards/margins": 1.8104979991912842,
"rewards/rejected": 73.51750183105469,
"step": 300
},
{
"epoch": 0.06965174129353234,
"grad_norm": 159.42955017089844,
"learning_rate": 4.968531013761348e-07,
"logits/chosen": -0.5976855754852295,
"logits/rejected": -0.2811816334724426,
"logps/chosen": -253.47000122070312,
"logps/rejected": -241.38999938964844,
"loss": 19.8918,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 79.19000244140625,
"rewards/margins": 8.498632431030273,
"rewards/rejected": 70.72000122070312,
"step": 350
},
{
"epoch": 0.07960199004975124,
"grad_norm": 113.71151733398438,
"learning_rate": 4.954666136762819e-07,
"logits/chosen": -2.210566520690918,
"logits/rejected": -1.936132788658142,
"logps/chosen": -294.9599914550781,
"logps/rejected": -279.9599914550781,
"loss": 18.7674,
"rewards/accuracies": 0.5331249833106995,
"rewards/chosen": 72.56375122070312,
"rewards/margins": 8.443652153015137,
"rewards/rejected": 64.11750030517578,
"step": 400
},
{
"epoch": 0.08955223880597014,
"grad_norm": 420.630859375,
"learning_rate": 4.938304464683715e-07,
"logits/chosen": -3.8620312213897705,
"logits/rejected": -3.5835156440734863,
"logps/chosen": -352.239990234375,
"logps/rejected": -348.32000732421875,
"loss": 18.0366,
"rewards/accuracies": 0.5162500143051147,
"rewards/chosen": 59.02375030517578,
"rewards/margins": 2.2487499713897705,
"rewards/rejected": 56.75749969482422,
"step": 450
},
{
"epoch": 0.09950248756218906,
"grad_norm": 1939.998046875,
"learning_rate": 4.91946264000822e-07,
"logits/chosen": -4.11453104019165,
"logits/rejected": -4.022890567779541,
"logps/chosen": -524.02001953125,
"logps/rejected": -501.67999267578125,
"loss": 13.3907,
"rewards/accuracies": 0.5350000262260437,
"rewards/chosen": 50.61375045776367,
"rewards/margins": 6.952011585235596,
"rewards/rejected": 43.663124084472656,
"step": 500
},
{
"epoch": 0.10945273631840796,
"grad_norm": 1124.046142578125,
"learning_rate": 4.898159827939476e-07,
"logits/chosen": -4.222187519073486,
"logits/rejected": -4.111406326293945,
"logps/chosen": -715.0800170898438,
"logps/rejected": -685.260009765625,
"loss": 10.9502,
"rewards/accuracies": 0.5181249976158142,
"rewards/chosen": 25.342500686645508,
"rewards/margins": 0.6623925566673279,
"rewards/rejected": 24.693124771118164,
"step": 550
},
{
"epoch": 0.11940298507462686,
"grad_norm": 1316.1282958984375,
"learning_rate": 4.874417696905456e-07,
"logits/chosen": -4.313593864440918,
"logits/rejected": -4.233281135559082,
"logps/chosen": -738.239990234375,
"logps/rejected": -716.5800170898438,
"loss": 8.9983,
"rewards/accuracies": 0.5450000166893005,
"rewards/chosen": 25.038436889648438,
"rewards/margins": 3.051743268966675,
"rewards/rejected": 21.988750457763672,
"step": 600
},
{
"epoch": 0.12935323383084577,
"grad_norm": 1739.2020263671875,
"learning_rate": 4.848260396518637e-07,
"logits/chosen": -4.20578145980835,
"logits/rejected": -4.150312423706055,
"logps/chosen": -768.8599853515625,
"logps/rejected": -742.8200073242188,
"loss": 9.0056,
"rewards/accuracies": 0.546875,
"rewards/chosen": 24.521249771118164,
"rewards/margins": 3.205258846282959,
"rewards/rejected": 21.316171646118164,
"step": 650
},
{
"epoch": 0.13930348258706468,
"grad_norm": 1328.8701171875,
"learning_rate": 4.819714533011918e-07,
"logits/chosen": -4.449375152587891,
"logits/rejected": -4.36984395980835,
"logps/chosen": -743.5399780273438,
"logps/rejected": -737.8800048828125,
"loss": 7.7838,
"rewards/accuracies": 0.5268750190734863,
"rewards/chosen": 19.766250610351562,
"rewards/margins": 2.2691991329193115,
"rewards/rejected": 17.490938186645508,
"step": 700
},
{
"epoch": 0.14925373134328357,
"grad_norm": 1090.959716796875,
"learning_rate": 4.788809142175751e-07,
"logits/chosen": -4.560468673706055,
"logits/rejected": -4.488906383514404,
"logps/chosen": -800.739990234375,
"logps/rejected": -747.9600219726562,
"loss": 7.3371,
"rewards/accuracies": 0.5674999952316284,
"rewards/chosen": 18.475936889648438,
"rewards/margins": 3.2167186737060547,
"rewards/rejected": 15.255346298217773,
"step": 750
},
{
"epoch": 0.15920398009950248,
"grad_norm": 3159.435302734375,
"learning_rate": 4.755575659824014e-07,
"logits/chosen": -4.382031440734863,
"logits/rejected": -4.326250076293945,
"logps/chosen": -796.97998046875,
"logps/rejected": -792.5800170898438,
"loss": 8.1944,
"rewards/accuracies": 0.5537499785423279,
"rewards/chosen": 16.225391387939453,
"rewards/margins": 1.8960351943969727,
"rewards/rejected": 14.33435344696045,
"step": 800
},
{
"epoch": 0.1691542288557214,
"grad_norm": 2522.934326171875,
"learning_rate": 4.7200478898186656e-07,
"logits/chosen": -4.401249885559082,
"logits/rejected": -4.393125057220459,
"logps/chosen": -801.2000122070312,
"logps/rejected": -761.7000122070312,
"loss": 7.0774,
"rewards/accuracies": 0.5487499833106995,
"rewards/chosen": 15.95101547241211,
"rewards/margins": 1.5917773246765137,
"rewards/rejected": 14.3623046875,
"step": 850
},
{
"epoch": 0.1791044776119403,
"grad_norm": 1642.070068359375,
"learning_rate": 4.68226196968572e-07,
"logits/chosen": -3.7705469131469727,
"logits/rejected": -3.7598438262939453,
"logps/chosen": -800.5999755859375,
"logps/rejected": -739.3800048828125,
"loss": 7.2798,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 17.5234375,
"rewards/margins": 1.022646427154541,
"rewards/rejected": 16.5008602142334,
"step": 900
},
{
"epoch": 0.1890547263681592,
"grad_norm": 1916.802001953125,
"learning_rate": 4.642256333857497e-07,
"logits/chosen": -3.6234374046325684,
"logits/rejected": -3.5637500286102295,
"logps/chosen": -764.1599731445312,
"logps/rejected": -759.6799926757812,
"loss": 6.8376,
"rewards/accuracies": 0.5568749904632568,
"rewards/chosen": 17.314218521118164,
"rewards/margins": 2.4442381858825684,
"rewards/rejected": 14.867304801940918,
"step": 950
},
{
"epoch": 0.19900497512437812,
"grad_norm": 2218.57373046875,
"learning_rate": 4.600071674578551e-07,
"logits/chosen": -4.034062385559082,
"logits/rejected": -4.010156154632568,
"logps/chosen": -813.4600219726562,
"logps/rejected": -766.9000244140625,
"loss": 6.9447,
"rewards/accuracies": 0.5418750047683716,
"rewards/chosen": 16.6539249420166,
"rewards/margins": 2.051767587661743,
"rewards/rejected": 14.60546875,
"step": 1000
},
{
"epoch": 0.208955223880597,
"grad_norm": 4437.78369140625,
"learning_rate": 4.555750900515026e-07,
"logits/chosen": -4.153124809265137,
"logits/rejected": -4.062812328338623,
"logps/chosen": -789.47998046875,
"logps/rejected": -759.8200073242188,
"loss": 6.6345,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 15.9857816696167,
"rewards/margins": 1.8259130716323853,
"rewards/rejected": 14.157539367675781,
"step": 1050
},
{
"epoch": 0.21890547263681592,
"grad_norm": 1672.605224609375,
"learning_rate": 4.5093390931095656e-07,
"logits/chosen": -4.221562385559082,
"logits/rejected": -4.203437328338623,
"logps/chosen": -795.239990234375,
"logps/rejected": -751.6400146484375,
"loss": 6.3275,
"rewards/accuracies": 0.5362499952316284,
"rewards/chosen": 16.673358917236328,
"rewards/margins": 1.631040096282959,
"rewards/rejected": 15.046093940734863,
"step": 1100
},
{
"epoch": 0.22885572139303484,
"grad_norm": 845.5272216796875,
"learning_rate": 4.4608834607261394e-07,
"logits/chosen": -4.139531135559082,
"logits/rejected": -4.091875076293945,
"logps/chosen": -826.239990234375,
"logps/rejected": -776.7999877929688,
"loss": 5.9411,
"rewards/accuracies": 0.5531250238418579,
"rewards/chosen": 15.388359069824219,
"rewards/margins": 2.7884082794189453,
"rewards/rejected": 12.600312232971191,
"step": 1150
},
{
"epoch": 0.23880597014925373,
"grad_norm": 2127.64697265625,
"learning_rate": 4.4104332906314545e-07,
"logits/chosen": -4.543749809265137,
"logits/rejected": -4.498437404632568,
"logps/chosen": -763.5999755859375,
"logps/rejected": -782.280029296875,
"loss": 6.0186,
"rewards/accuracies": 0.5381249785423279,
"rewards/chosen": 16.952342987060547,
"rewards/margins": 2.063539981842041,
"rewards/rejected": 14.888437271118164,
"step": 1200
},
{
"epoch": 0.24875621890547264,
"grad_norm": 3592.465576171875,
"learning_rate": 4.358039898861784e-07,
"logits/chosen": -3.616874933242798,
"logits/rejected": -3.5975780487060547,
"logps/chosen": -805.9600219726562,
"logps/rejected": -754.1599731445312,
"loss": 7.1137,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 16.315702438354492,
"rewards/margins": 1.9189550876617432,
"rewards/rejected": 14.396132469177246,
"step": 1250
},
{
"epoch": 0.25870646766169153,
"grad_norm": 2655.087646484375,
"learning_rate": 4.303756578026196e-07,
"logits/chosen": -4.052499771118164,
"logits/rejected": -3.9873437881469727,
"logps/chosen": -822.5599975585938,
"logps/rejected": -769.0599975585938,
"loss": 6.3728,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 13.059394836425781,
"rewards/margins": 1.4543017148971558,
"rewards/rejected": 11.605507850646973,
"step": 1300
},
{
"epoch": 0.26865671641791045,
"grad_norm": 3106.407470703125,
"learning_rate": 4.247638543099302e-07,
"logits/chosen": -4.597812652587891,
"logits/rejected": -4.58078145980835,
"logps/chosen": -821.6799926757812,
"logps/rejected": -774.0399780273438,
"loss": 5.3592,
"rewards/accuracies": 0.5612499713897705,
"rewards/chosen": 15.785625457763672,
"rewards/margins": 2.451181650161743,
"rewards/rejected": 13.333086013793945,
"step": 1350
},
{
"epoch": 0.27860696517412936,
"grad_norm": 3253.3349609375,
"learning_rate": 4.189742875258636e-07,
"logits/chosen": -4.145625114440918,
"logits/rejected": -4.125,
"logps/chosen": -818.6799926757812,
"logps/rejected": -775.5999755859375,
"loss": 5.955,
"rewards/accuracies": 0.5543749928474426,
"rewards/chosen": 14.581796646118164,
"rewards/margins": 0.9402441382408142,
"rewards/rejected": 13.644218444824219,
"step": 1400
},
{
"epoch": 0.2885572139303483,
"grad_norm": 2794.8505859375,
"learning_rate": 4.1301284638238023e-07,
"logits/chosen": -4.417500019073486,
"logits/rejected": -4.430781364440918,
"logps/chosen": -873.97998046875,
"logps/rejected": -809.7999877929688,
"loss": 5.9541,
"rewards/accuracies": 0.5625,
"rewards/chosen": 16.624374389648438,
"rewards/margins": 2.9456982612609863,
"rewards/rejected": 13.676972389221191,
"step": 1450
},
{
"epoch": 0.29850746268656714,
"grad_norm": 3549.135009765625,
"learning_rate": 4.068855946356451e-07,
"logits/chosen": -4.357968807220459,
"logits/rejected": -4.272500038146973,
"logps/chosen": -815.0999755859375,
"logps/rejected": -799.0999755859375,
"loss": 7.1658,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 12.853320121765137,
"rewards/margins": -0.4855078160762787,
"rewards/rejected": 13.337441444396973,
"step": 1500
},
{
"epoch": 0.30845771144278605,
"grad_norm": 2740.423095703125,
"learning_rate": 4.005987646982011e-07,
"logits/chosen": -4.377812385559082,
"logits/rejected": -4.360000133514404,
"logps/chosen": -861.0,
"logps/rejected": -819.9000244140625,
"loss": 6.0543,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 13.93810749053955,
"rewards/margins": 1.2174170017242432,
"rewards/rejected": 12.72454833984375,
"step": 1550
},
{
"epoch": 0.31840796019900497,
"grad_norm": 2517.54248046875,
"learning_rate": 3.9415875129958994e-07,
"logits/chosen": -4.250625133514404,
"logits/rejected": -4.236562728881836,
"logps/chosen": -870.3599853515625,
"logps/rejected": -832.8400268554688,
"loss": 6.4818,
"rewards/accuracies": 0.5443750023841858,
"rewards/chosen": 12.261445045471191,
"rewards/margins": 0.11241699010133743,
"rewards/rejected": 12.149633407592773,
"step": 1600
},
{
"epoch": 0.3283582089552239,
"grad_norm": 4087.48828125,
"learning_rate": 3.875721049818718e-07,
"logits/chosen": -4.099062442779541,
"logits/rejected": -4.049062728881836,
"logps/chosen": -868.47998046875,
"logps/rejected": -826.6400146484375,
"loss": 5.7788,
"rewards/accuracies": 0.5299999713897705,
"rewards/chosen": 12.918557167053223,
"rewards/margins": 0.6001172065734863,
"rewards/rejected": 12.3140230178833,
"step": 1650
},
{
"epoch": 0.3383084577114428,
"grad_norm": 3558.606689453125,
"learning_rate": 3.808455254366574e-07,
"logits/chosen": -3.7817187309265137,
"logits/rejected": -3.768437385559082,
"logps/chosen": -857.0599975585938,
"logps/rejected": -832.239990234375,
"loss": 6.1453,
"rewards/accuracies": 0.5162500143051147,
"rewards/chosen": 13.769579887390137,
"rewards/margins": 1.5330761671066284,
"rewards/rejected": 12.234726905822754,
"step": 1700
},
{
"epoch": 0.3482587064676617,
"grad_norm": 3828.70068359375,
"learning_rate": 3.739858546904308e-07,
"logits/chosen": -4.390937328338623,
"logits/rejected": -4.3046875,
"logps/chosen": -837.1799926757812,
"logps/rejected": -835.3599853515625,
"loss": 6.6707,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 11.527656555175781,
"rewards/margins": 0.20983397960662842,
"rewards/rejected": 11.319659233093262,
"step": 1750
},
{
"epoch": 0.3582089552238806,
"grad_norm": 4480.52392578125,
"learning_rate": 3.6700007014509514e-07,
"logits/chosen": -4.233593940734863,
"logits/rejected": -4.196249961853027,
"logps/chosen": -868.5,
"logps/rejected": -844.9000244140625,
"loss": 5.0152,
"rewards/accuracies": 0.5418750047683716,
"rewards/chosen": 11.055917739868164,
"rewards/margins": 1.0635205507278442,
"rewards/rejected": 9.988080978393555,
"step": 1800
},
{
"epoch": 0.3681592039800995,
"grad_norm": 2393.2080078125,
"learning_rate": 3.5989527748081805e-07,
"logits/chosen": -4.220937728881836,
"logits/rejected": -4.229062557220459,
"logps/chosen": -885.8800048828125,
"logps/rejected": -871.0800170898438,
"loss": 5.564,
"rewards/accuracies": 0.5206249952316284,
"rewards/chosen": 10.777030944824219,
"rewards/margins": 0.5477758646011353,
"rewards/rejected": 10.22183609008789,
"step": 1850
},
{
"epoch": 0.3781094527363184,
"grad_norm": 2142.15185546875,
"learning_rate": 3.52678703428399e-07,
"logits/chosen": -3.959531307220459,
"logits/rejected": -3.898750066757202,
"logps/chosen": -828.0800170898438,
"logps/rejected": -837.0,
"loss": 4.9398,
"rewards/accuracies": 0.5493749976158142,
"rewards/chosen": 10.465898513793945,
"rewards/margins": 1.3795897960662842,
"rewards/rejected": 9.086328506469727,
"step": 1900
},
{
"epoch": 0.3880597014925373,
"grad_norm": 5701.14794921875,
"learning_rate": 3.45357688418507e-07,
"logits/chosen": -3.465625047683716,
"logits/rejected": -3.4301562309265137,
"logps/chosen": -863.3200073242188,
"logps/rejected": -833.1799926757812,
"loss": 4.9191,
"rewards/accuracies": 0.5206249952316284,
"rewards/chosen": 11.544062614440918,
"rewards/margins": 1.2966210842132568,
"rewards/rejected": 10.246211051940918,
"step": 1950
},
{
"epoch": 0.39800995024875624,
"grad_norm": 2166.156005859375,
"learning_rate": 3.3793967911526797e-07,
"logits/chosen": -4.175624847412109,
"logits/rejected": -4.157968521118164,
"logps/chosen": -864.739990234375,
"logps/rejected": -818.3200073242188,
"loss": 4.9636,
"rewards/accuracies": 0.5587499737739563,
"rewards/chosen": 10.2691011428833,
"rewards/margins": 1.404970645904541,
"rewards/rejected": 8.860605239868164,
"step": 2000
},
{
"epoch": 0.4079601990049751,
"grad_norm": 1748.567626953125,
"learning_rate": 3.3043222084179477e-07,
"logits/chosen": -4.447812557220459,
"logits/rejected": -4.435625076293945,
"logps/chosen": -864.1799926757812,
"logps/rejected": -806.9400024414062,
"loss": 4.3649,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": 11.546367645263672,
"rewards/margins": 1.7490723133087158,
"rewards/rejected": 9.790781021118164,
"step": 2050
},
{
"epoch": 0.417910447761194,
"grad_norm": 4250.59228515625,
"learning_rate": 3.228429499053651e-07,
"logits/chosen": -3.7835936546325684,
"logits/rejected": -3.764218807220459,
"logps/chosen": -896.280029296875,
"logps/rejected": -840.52001953125,
"loss": 5.556,
"rewards/accuracies": 0.53125,
"rewards/chosen": 12.199726104736328,
"rewards/margins": 0.49269530177116394,
"rewards/rejected": 11.707152366638184,
"step": 2100
},
{
"epoch": 0.42786069651741293,
"grad_norm": 2269.677734375,
"learning_rate": 3.151795858300542e-07,
"logits/chosen": -4.282343864440918,
"logits/rejected": -4.28640604019165,
"logps/chosen": -864.52001953125,
"logps/rejected": -833.02001953125,
"loss": 4.4658,
"rewards/accuracies": 0.5256249904632568,
"rewards/chosen": 10.688271522521973,
"rewards/margins": 0.9122143387794495,
"rewards/rejected": 9.776113510131836,
"step": 2150
},
{
"epoch": 0.43781094527363185,
"grad_norm": 2995.49267578125,
"learning_rate": 3.0744992350472184e-07,
"logits/chosen": -4.102499961853027,
"logits/rejected": -4.065000057220459,
"logps/chosen": -891.3599853515625,
"logps/rejected": -809.739990234375,
"loss": 3.9283,
"rewards/accuracies": 0.5575000047683716,
"rewards/chosen": 10.750624656677246,
"rewards/margins": 1.9478063583374023,
"rewards/rejected": 8.801519393920898,
"step": 2200
},
{
"epoch": 0.44776119402985076,
"grad_norm": 2685.581298828125,
"learning_rate": 2.9966182525434136e-07,
"logits/chosen": -4.429843902587891,
"logits/rejected": -4.410468578338623,
"logps/chosen": -917.9000244140625,
"logps/rejected": -884.8599853515625,
"loss": 4.9653,
"rewards/accuracies": 0.5181249976158142,
"rewards/chosen": 8.635839462280273,
"rewards/margins": 0.5861572027206421,
"rewards/rejected": 8.053730010986328,
"step": 2250
},
{
"epoch": 0.4577114427860697,
"grad_norm": 1810.1524658203125,
"learning_rate": 2.9182321284273524e-07,
"logits/chosen": -4.380312442779541,
"logits/rejected": -4.308281421661377,
"logps/chosen": -892.0999755859375,
"logps/rejected": -817.1799926757812,
"loss": 4.4186,
"rewards/accuracies": 0.5537499785423279,
"rewards/chosen": 8.945687294006348,
"rewards/margins": 1.2103466987609863,
"rewards/rejected": 7.736120223999023,
"step": 2300
},
{
"epoch": 0.46766169154228854,
"grad_norm": 1638.470458984375,
"learning_rate": 2.839420594148518e-07,
"logits/chosen": -4.286562442779541,
"logits/rejected": -4.321406364440918,
"logps/chosen": -856.7000122070312,
"logps/rejected": -857.2999877929688,
"loss": 4.0532,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 9.412128448486328,
"rewards/margins": 1.397641658782959,
"rewards/rejected": 8.009862899780273,
"step": 2350
},
{
"epoch": 0.47761194029850745,
"grad_norm": 4396.177734375,
"learning_rate": 2.7602638138677834e-07,
"logits/chosen": -4.463749885559082,
"logits/rejected": -4.425156116485596,
"logps/chosen": -903.4199829101562,
"logps/rejected": -882.0800170898438,
"loss": 4.5126,
"rewards/accuracies": 0.5487499833106995,
"rewards/chosen": 7.66628885269165,
"rewards/margins": 1.1259644031524658,
"rewards/rejected": 6.541113376617432,
"step": 2400
},
{
"epoch": 0.48756218905472637,
"grad_norm": 4004.349609375,
"learning_rate": 2.6808423029174143e-07,
"logits/chosen": -4.335468769073486,
"logits/rejected": -4.302812576293945,
"logps/chosen": -876.6799926757812,
"logps/rejected": -830.239990234375,
"loss": 4.9104,
"rewards/accuracies": 0.5393750071525574,
"rewards/chosen": 10.196874618530273,
"rewards/margins": 0.7247558832168579,
"rewards/rejected": 9.472156524658203,
"step": 2450
},
{
"epoch": 0.4975124378109453,
"grad_norm": 2645.156494140625,
"learning_rate": 2.6012368459038625e-07,
"logits/chosen": -4.241718769073486,
"logits/rejected": -4.247031211853027,
"logps/chosen": -940.6400146484375,
"logps/rejected": -873.52001953125,
"loss": 4.4678,
"rewards/accuracies": 0.5193750262260437,
"rewards/chosen": 7.638674259185791,
"rewards/margins": 0.7611132860183716,
"rewards/rejected": 6.874751091003418,
"step": 2500
},
{
"epoch": 0.5074626865671642,
"grad_norm": 4464.42724609375,
"learning_rate": 2.5215284145366754e-07,
"logits/chosen": -4.28781270980835,
"logits/rejected": -4.303593635559082,
"logps/chosen": -898.02001953125,
"logps/rejected": -856.219970703125,
"loss": 4.8918,
"rewards/accuracies": 0.5393750071525574,
"rewards/chosen": 9.058222770690918,
"rewards/margins": 0.16966308653354645,
"rewards/rejected": 8.891836166381836,
"step": 2550
},
{
"epoch": 0.5174129353233831,
"grad_norm": 1550.342041015625,
"learning_rate": 2.4417980852670795e-07,
"logits/chosen": -4.276875019073486,
"logits/rejected": -4.283124923706055,
"logps/chosen": -908.8800048828125,
"logps/rejected": -837.219970703125,
"loss": 3.7542,
"rewards/accuracies": 0.5493749976158142,
"rewards/chosen": 8.9493408203125,
"rewards/margins": 1.7262645959854126,
"rewards/rejected": 7.21969747543335,
"step": 2600
},
{
"epoch": 0.527363184079602,
"grad_norm": 1139.1016845703125,
"learning_rate": 2.3621269568200348e-07,
"logits/chosen": -4.569843769073486,
"logits/rejected": -4.572031021118164,
"logps/chosen": -863.0399780273438,
"logps/rejected": -832.1199951171875,
"loss": 4.4595,
"rewards/accuracies": 0.5274999737739563,
"rewards/chosen": 9.005471229553223,
"rewards/margins": 0.9330615401268005,
"rewards/rejected": 8.067304611206055,
"step": 2650
},
{
"epoch": 0.5373134328358209,
"grad_norm": 3259.9931640625,
"learning_rate": 2.2825960677036263e-07,
"logits/chosen": -5.025000095367432,
"logits/rejected": -5.025312423706055,
"logps/chosen": -900.02001953125,
"logps/rejected": -855.4600219726562,
"loss": 3.8295,
"rewards/accuracies": 0.5162500143051147,
"rewards/chosen": 7.501829624176025,
"rewards/margins": 0.9494946002960205,
"rewards/rejected": 6.554053783416748,
"step": 2700
},
{
"epoch": 0.5472636815920398,
"grad_norm": 1576.5633544921875,
"learning_rate": 2.2032863137797098e-07,
"logits/chosen": -4.931250095367432,
"logits/rejected": -4.935312271118164,
"logps/chosen": -888.0,
"logps/rejected": -890.8200073242188,
"loss": 3.6169,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 6.200995922088623,
"rewards/margins": 2.0712647438049316,
"rewards/rejected": 4.132159233093262,
"step": 2750
},
{
"epoch": 0.5572139303482587,
"grad_norm": 2286.4580078125,
"learning_rate": 2.1242783659796472e-07,
"logits/chosen": -5.111249923706055,
"logits/rejected": -5.120625019073486,
"logps/chosen": -899.4000244140625,
"logps/rejected": -861.1199951171875,
"loss": 4.0467,
"rewards/accuracies": 0.5299999713897705,
"rewards/chosen": 6.854379653930664,
"rewards/margins": 0.9951757788658142,
"rewards/rejected": 5.860227108001709,
"step": 2800
},
{
"epoch": 0.5671641791044776,
"grad_norm": 1399.0577392578125,
"learning_rate": 2.0456525882488414e-07,
"logits/chosen": -5.425624847412109,
"logits/rejected": -5.343437671661377,
"logps/chosen": -907.280029296875,
"logps/rejected": -847.0599975585938,
"loss": 4.3571,
"rewards/accuracies": 0.5137500166893005,
"rewards/chosen": 7.121167182922363,
"rewards/margins": 0.05256347730755806,
"rewards/rejected": 7.0656046867370605,
"step": 2850
},
{
"epoch": 0.5771144278606966,
"grad_norm": 1395.7008056640625,
"learning_rate": 1.967488955803515e-07,
"logits/chosen": -5.565000057220459,
"logits/rejected": -5.533124923706055,
"logps/chosen": -921.8800048828125,
"logps/rejected": -867.5599975585938,
"loss": 3.6962,
"rewards/accuracies": 0.5237500071525574,
"rewards/chosen": 8.013593673706055,
"rewards/margins": 1.2249804735183716,
"rewards/rejected": 6.789748668670654,
"step": 2900
},
{
"epoch": 0.5870646766169154,
"grad_norm": 1817.9039306640625,
"learning_rate": 1.8898669737829009e-07,
"logits/chosen": -5.284999847412109,
"logits/rejected": -5.328750133514404,
"logps/chosen": -901.7999877929688,
"logps/rejected": -848.260009765625,
"loss": 3.4712,
"rewards/accuracies": 0.5325000286102295,
"rewards/chosen": 8.152949333190918,
"rewards/margins": 1.7032690048217773,
"rewards/rejected": 6.451176643371582,
"step": 2950
},
{
"epoch": 0.5970149253731343,
"grad_norm": 2175.98291015625,
"learning_rate": 1.8128655963795654e-07,
"logits/chosen": -5.226718902587891,
"logits/rejected": -5.177187442779541,
"logps/chosen": -896.52001953125,
"logps/rejected": -842.5,
"loss": 4.3828,
"rewards/accuracies": 0.5337499976158142,
"rewards/chosen": 6.346333026885986,
"rewards/margins": -0.23146240413188934,
"rewards/rejected": 6.57891845703125,
"step": 3000
},
{
"epoch": 0.6069651741293532,
"grad_norm": 2423.90869140625,
"learning_rate": 1.736563146530148e-07,
"logits/chosen": -5.147812366485596,
"logits/rejected": -5.121250152587891,
"logps/chosen": -906.780029296875,
"logps/rejected": -842.0599975585938,
"loss": 3.618,
"rewards/accuracies": 0.5256249904632568,
"rewards/chosen": 7.217099666595459,
"rewards/margins": 1.0019750595092773,
"rewards/rejected": 6.212661266326904,
"step": 3050
},
{
"epoch": 0.6169154228855721,
"grad_norm": 1796.0404052734375,
"learning_rate": 1.6610372362481795e-07,
"logits/chosen": -5.517499923706055,
"logits/rejected": -5.500937461853027,
"logps/chosen": -888.3800048828125,
"logps/rejected": -870.8200073242188,
"loss": 3.9941,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 6.678945541381836,
"rewards/margins": 0.21024902164936066,
"rewards/rejected": 6.4722514152526855,
"step": 3100
},
{
"epoch": 0.6268656716417911,
"grad_norm": 3228.182373046875,
"learning_rate": 1.5863646876800294e-07,
"logits/chosen": -5.522500038146973,
"logits/rejected": -5.519062519073486,
"logps/chosen": -917.4600219726562,
"logps/rejected": -894.3400268554688,
"loss": 3.9995,
"rewards/accuracies": 0.5049999952316284,
"rewards/chosen": 6.917697906494141,
"rewards/margins": 0.9786840677261353,
"rewards/rejected": 5.941035270690918,
"step": 3150
},
{
"epoch": 0.6368159203980099,
"grad_norm": 1673.2705078125,
"learning_rate": 1.512621454964278e-07,
"logits/chosen": -5.52468729019165,
"logits/rejected": -5.51687479019165,
"logps/chosen": -920.6400146484375,
"logps/rejected": -874.0399780273438,
"loss": 3.5676,
"rewards/accuracies": 0.5256249904632568,
"rewards/chosen": 6.860390663146973,
"rewards/margins": 0.9590514898300171,
"rewards/rejected": 5.902841567993164,
"step": 3200
},
{
"epoch": 0.6467661691542289,
"grad_norm": 3450.835693359375,
"learning_rate": 1.439882546973991e-07,
"logits/chosen": -5.425468921661377,
"logits/rejected": -5.380000114440918,
"logps/chosen": -896.6799926757812,
"logps/rejected": -860.0,
"loss": 3.9909,
"rewards/accuracies": 0.5531250238418579,
"rewards/chosen": 6.930351734161377,
"rewards/margins": 0.6515478491783142,
"rewards/rejected": 6.279133319854736,
"step": 3250
},
{
"epoch": 0.6567164179104478,
"grad_norm": 2366.854736328125,
"learning_rate": 1.3682219510204828e-07,
"logits/chosen": -5.55343770980835,
"logits/rejected": -5.555312633514404,
"logps/chosen": -918.0800170898438,
"logps/rejected": -867.8400268554688,
"loss": 3.9592,
"rewards/accuracies": 0.5318750143051147,
"rewards/chosen": 6.138139724731445,
"rewards/margins": 0.7850878834724426,
"rewards/rejected": 5.353430271148682,
"step": 3300
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2788.40771484375,
"learning_rate": 1.2977125575961799e-07,
"logits/chosen": -5.831562519073486,
"logits/rejected": -5.809062480926514,
"logps/chosen": -918.0999755859375,
"logps/rejected": -877.780029296875,
"loss": 3.6818,
"rewards/accuracies": 0.5325000286102295,
"rewards/chosen": 5.774457931518555,
"rewards/margins": 0.3942529261112213,
"rewards/rejected": 5.381279468536377,
"step": 3350
},
{
"epoch": 0.6766169154228856,
"grad_norm": 2038.4530029296875,
"learning_rate": 1.2284260862331184e-07,
"logits/chosen": -5.459531307220459,
"logits/rejected": -5.452187538146973,
"logps/chosen": -868.5599975585938,
"logps/rejected": -831.5,
"loss": 4.1592,
"rewards/accuracies": 0.5149999856948853,
"rewards/chosen": 8.009712219238281,
"rewards/margins": 0.3110009729862213,
"rewards/rejected": 7.698652267456055,
"step": 3400
},
{
"epoch": 0.6865671641791045,
"grad_norm": 2557.6064453125,
"learning_rate": 1.1604330125525078e-07,
"logits/chosen": -5.480000019073486,
"logits/rejected": -5.461249828338623,
"logps/chosen": -930.9000244140625,
"logps/rejected": -879.5800170898438,
"loss": 3.8611,
"rewards/accuracies": 0.5181249976158142,
"rewards/chosen": 6.370607852935791,
"rewards/margins": 0.5133349895477295,
"rewards/rejected": 5.857964038848877,
"step": 3450
},
{
"epoch": 0.6965174129353234,
"grad_norm": 2376.09716796875,
"learning_rate": 1.0938024965795506e-07,
"logits/chosen": -5.44406270980835,
"logits/rejected": -5.42312479019165,
"logps/chosen": -893.1400146484375,
"logps/rejected": -862.0,
"loss": 3.7593,
"rewards/accuracies": 0.5168750286102295,
"rewards/chosen": 7.158564567565918,
"rewards/margins": 0.6185815334320068,
"rewards/rejected": 6.541041851043701,
"step": 3500
},
{
"epoch": 0.7064676616915423,
"grad_norm": 3130.968017578125,
"learning_rate": 1.0286023123964326e-07,
"logits/chosen": -5.474531173706055,
"logits/rejected": -5.479062557220459,
"logps/chosen": -889.4000244140625,
"logps/rejected": -889.760009765625,
"loss": 3.6067,
"rewards/accuracies": 0.5049999952316284,
"rewards/chosen": 4.780278205871582,
"rewards/margins": 0.41691163182258606,
"rewards/rejected": 4.363155364990234,
"step": 3550
},
{
"epoch": 0.7164179104477612,
"grad_norm": 2820.47412109375,
"learning_rate": 9.64898779205055e-08,
"logits/chosen": -5.624687671661377,
"logits/rejected": -5.600468635559082,
"logps/chosen": -903.4400024414062,
"logps/rejected": -850.5,
"loss": 3.9096,
"rewards/accuracies": 0.5049999952316284,
"rewards/chosen": 6.33207893371582,
"rewards/margins": 0.01932373084127903,
"rewards/rejected": 6.315381050109863,
"step": 3600
},
{
"epoch": 0.7263681592039801,
"grad_norm": 3085.365478515625,
"learning_rate": 9.027566938696051e-08,
"logits/chosen": -5.869375228881836,
"logits/rejected": -5.823437690734863,
"logps/chosen": -913.4400024414062,
"logps/rejected": -874.3599853515625,
"loss": 4.0442,
"rewards/accuracies": 0.5274999737739563,
"rewards/chosen": 5.950512886047363,
"rewards/margins": 0.31416991353034973,
"rewards/rejected": 5.635661602020264,
"step": 3650
},
{
"epoch": 0.736318407960199,
"grad_norm": 2496.829833984375,
"learning_rate": 8.42239265007595e-08,
"logits/chosen": -5.743750095367432,
"logits/rejected": -5.717812538146973,
"logps/chosen": -893.8800048828125,
"logps/rejected": -847.2999877929688,
"loss": 3.7924,
"rewards/accuracies": 0.5131250023841858,
"rewards/chosen": 7.561201095581055,
"rewards/margins": 0.5582299828529358,
"rewards/rejected": 6.997402191162109,
"step": 3700
},
{
"epoch": 0.746268656716418,
"grad_norm": 3231.818115234375,
"learning_rate": 7.834080486964115e-08,
"logits/chosen": -5.849374771118164,
"logits/rejected": -5.823437690734863,
"logps/chosen": -919.1199951171875,
"logps/rejected": -881.9600219726562,
"loss": 4.0168,
"rewards/accuracies": 0.4975000023841858,
"rewards/chosen": 6.423149585723877,
"rewards/margins": -0.03854003921151161,
"rewards/rejected": 6.461066722869873,
"step": 3750
},
{
"epoch": 0.7562189054726368,
"grad_norm": 2785.632080078125,
"learning_rate": 7.263228858607615e-08,
"logits/chosen": -6.016250133514404,
"logits/rejected": -5.974062442779541,
"logps/chosen": -896.739990234375,
"logps/rejected": -847.219970703125,
"loss": 3.5495,
"rewards/accuracies": 0.5462499856948853,
"rewards/chosen": 6.750986099243164,
"rewards/margins": 0.5915331840515137,
"rewards/rejected": 6.161344051361084,
"step": 3800
},
{
"epoch": 0.7661691542288557,
"grad_norm": 1095.79541015625,
"learning_rate": 6.7104184140471e-08,
"logits/chosen": -5.880312442779541,
"logits/rejected": -5.940000057220459,
"logps/chosen": -925.3200073242188,
"logps/rejected": -870.6199951171875,
"loss": 3.5409,
"rewards/accuracies": 0.5456249713897705,
"rewards/chosen": 8.115625381469727,
"rewards/margins": 1.7515722513198853,
"rewards/rejected": 6.366718769073486,
"step": 3850
},
{
"epoch": 0.7761194029850746,
"grad_norm": 1527.818115234375,
"learning_rate": 6.176211451502181e-08,
"logits/chosen": -5.776249885559082,
"logits/rejected": -5.784687519073486,
"logps/chosen": -915.5800170898438,
"logps/rejected": -894.719970703125,
"loss": 3.6188,
"rewards/accuracies": 0.5287500023841858,
"rewards/chosen": 5.876552581787109,
"rewards/margins": 1.2620117664337158,
"rewards/rejected": 4.617353439331055,
"step": 3900
},
{
"epoch": 0.7860696517412935,
"grad_norm": 2342.368896484375,
"learning_rate": 5.66115134642263e-08,
"logits/chosen": -5.654062271118164,
"logits/rejected": -5.647812366485596,
"logps/chosen": -949.5,
"logps/rejected": -899.1599731445312,
"loss": 3.6019,
"rewards/accuracies": 0.5174999833106995,
"rewards/chosen": 5.2860107421875,
"rewards/margins": 0.6428442597389221,
"rewards/rejected": 4.6402587890625,
"step": 3950
},
{
"epoch": 0.7960199004975125,
"grad_norm": 1863.121337890625,
"learning_rate": 5.1657619987870657e-08,
"logits/chosen": -5.65500020980835,
"logits/rejected": -5.638437271118164,
"logps/chosen": -906.2000122070312,
"logps/rejected": -842.8800048828125,
"loss": 4.1956,
"rewards/accuracies": 0.5256249904632568,
"rewards/chosen": 5.409960746765137,
"rewards/margins": 0.055903319269418716,
"rewards/rejected": 5.3602614402771,
"step": 4000
},
{
"epoch": 0.8059701492537313,
"grad_norm": 3470.02197265625,
"learning_rate": 4.690547300211392e-08,
"logits/chosen": -5.610937595367432,
"logits/rejected": -5.582656383514404,
"logps/chosen": -865.780029296875,
"logps/rejected": -821.719970703125,
"loss": 3.7337,
"rewards/accuracies": 0.5174999833106995,
"rewards/chosen": 6.724204063415527,
"rewards/margins": 0.2500012218952179,
"rewards/rejected": 6.473584175109863,
"step": 4050
},
{
"epoch": 0.8159203980099502,
"grad_norm": 1288.328369140625,
"learning_rate": 4.235990621408972e-08,
"logits/chosen": -5.644999980926514,
"logits/rejected": -5.604687690734863,
"logps/chosen": -901.4000244140625,
"logps/rejected": -848.6799926757812,
"loss": 3.7236,
"rewards/accuracies": 0.5337499976158142,
"rewards/chosen": 5.766840934753418,
"rewards/margins": 0.43273621797561646,
"rewards/rejected": 5.334909439086914,
"step": 4100
},
{
"epoch": 0.8258706467661692,
"grad_norm": 2981.522216796875,
"learning_rate": 3.802554320523949e-08,
"logits/chosen": -5.59375,
"logits/rejected": -5.619062423706055,
"logps/chosen": -928.6199951171875,
"logps/rejected": -869.6599731445312,
"loss": 3.1704,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 8.217485427856445,
"rewards/margins": 1.8676855564117432,
"rewards/rejected": 6.345156192779541,
"step": 4150
},
{
"epoch": 0.835820895522388,
"grad_norm": 1575.174072265625,
"learning_rate": 3.390679272837724e-08,
"logits/chosen": -5.644999980926514,
"logits/rejected": -5.65625,
"logps/chosen": -920.8800048828125,
"logps/rejected": -862.9199829101562,
"loss": 3.3543,
"rewards/accuracies": 0.5425000190734863,
"rewards/chosen": 6.809421539306641,
"rewards/margins": 1.4754736423492432,
"rewards/rejected": 5.333471775054932,
"step": 4200
},
{
"epoch": 0.845771144278607,
"grad_norm": 1714.1629638671875,
"learning_rate": 3.00078442232703e-08,
"logits/chosen": -5.693437576293945,
"logits/rejected": -5.644374847412109,
"logps/chosen": -917.4000244140625,
"logps/rejected": -891.1199951171875,
"loss": 4.002,
"rewards/accuracies": 0.5237500071525574,
"rewards/chosen": 5.626728534698486,
"rewards/margins": 0.3019775450229645,
"rewards/rejected": 5.322280406951904,
"step": 4250
},
{
"epoch": 0.8557213930348259,
"grad_norm": 4939.02490234375,
"learning_rate": 2.633266355529684e-08,
"logits/chosen": -5.634375095367432,
"logits/rejected": -5.644999980926514,
"logps/chosen": -928.219970703125,
"logps/rejected": -868.4199829101562,
"loss": 3.297,
"rewards/accuracies": 0.5481250286102295,
"rewards/chosen": 8.091529846191406,
"rewards/margins": 2.134963274002075,
"rewards/rejected": 5.956567287445068,
"step": 4300
},
{
"epoch": 0.8656716417910447,
"grad_norm": 3157.2939453125,
"learning_rate": 2.2884988981515447e-08,
"logits/chosen": -5.770625114440918,
"logits/rejected": -5.713437557220459,
"logps/chosen": -948.8400268554688,
"logps/rejected": -911.5800170898438,
"loss": 4.5557,
"rewards/accuracies": 0.5231249928474426,
"rewards/chosen": 5.515078067779541,
"rewards/margins": -0.4518188536167145,
"rewards/rejected": 5.966367244720459,
"step": 4350
},
{
"epoch": 0.8756218905472637,
"grad_norm": 1111.7291259765625,
"learning_rate": 1.9668327348248857e-08,
"logits/chosen": -5.699999809265137,
"logits/rejected": -5.701562404632568,
"logps/chosen": -895.1400146484375,
"logps/rejected": -893.3200073242188,
"loss": 3.5643,
"rewards/accuracies": 0.5493749976158142,
"rewards/chosen": 5.846921443939209,
"rewards/margins": 0.8055566549301147,
"rewards/rejected": 5.04319953918457,
"step": 4400
},
{
"epoch": 0.8855721393034826,
"grad_norm": 4429.13330078125,
"learning_rate": 1.6685950524050307e-08,
"logits/chosen": -5.749062538146973,
"logits/rejected": -5.731562614440918,
"logps/chosen": -940.0399780273438,
"logps/rejected": -897.780029296875,
"loss": 3.4933,
"rewards/accuracies": 0.53125,
"rewards/chosen": 5.280101299285889,
"rewards/margins": 0.7753466963768005,
"rewards/rejected": 4.501115798950195,
"step": 4450
},
{
"epoch": 0.8955223880597015,
"grad_norm": 1798.3892822265625,
"learning_rate": 1.3940892071680837e-08,
"logits/chosen": -5.7578125,
"logits/rejected": -5.719531059265137,
"logps/chosen": -913.3400268554688,
"logps/rejected": -870.8599853515625,
"loss": 3.8212,
"rewards/accuracies": 0.5056250095367432,
"rewards/chosen": 6.39865255355835,
"rewards/margins": 0.44740965962409973,
"rewards/rejected": 5.9497971534729,
"step": 4500
},
{
"epoch": 0.9054726368159204,
"grad_norm": 1578.768798828125,
"learning_rate": 1.1435944162481808e-08,
"logits/chosen": -5.743750095367432,
"logits/rejected": -5.692031383514404,
"logps/chosen": -936.0999755859375,
"logps/rejected": -878.8800048828125,
"loss": 3.6106,
"rewards/accuracies": 0.5268750190734863,
"rewards/chosen": 6.700493335723877,
"rewards/margins": 0.9369800090789795,
"rewards/rejected": 5.758784294128418,
"step": 4550
},
{
"epoch": 0.9154228855721394,
"grad_norm": 2657.033935546875,
"learning_rate": 9.17365473628226e-09,
"logits/chosen": -5.730000019073486,
"logits/rejected": -5.676718711853027,
"logps/chosen": -936.4600219726562,
"logps/rejected": -913.5800170898438,
"loss": 4.1793,
"rewards/accuracies": 0.5262500047683716,
"rewards/chosen": 6.953037261962891,
"rewards/margins": 0.0015722656389698386,
"rewards/rejected": 6.950415134429932,
"step": 4600
},
{
"epoch": 0.9253731343283582,
"grad_norm": 5578.38916015625,
"learning_rate": 7.1563249097292e-09,
"logits/chosen": -5.808750152587891,
"logits/rejected": -5.783437728881836,
"logps/chosen": -908.780029296875,
"logps/rejected": -889.7000122070312,
"loss": 3.3363,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 5.7548828125,
"rewards/margins": 1.054022192955017,
"rewards/rejected": 4.701448917388916,
"step": 4650
},
{
"epoch": 0.9353233830845771,
"grad_norm": 2391.306640625,
"learning_rate": 5.38600663567737e-09,
"logits/chosen": -5.85281229019165,
"logits/rejected": -5.862187385559082,
"logps/chosen": -873.1599731445312,
"logps/rejected": -841.780029296875,
"loss": 3.168,
"rewards/accuracies": 0.5099999904632568,
"rewards/chosen": 6.133432388305664,
"rewards/margins": 0.7415136694908142,
"rewards/rejected": 5.392402172088623,
"step": 4700
},
{
"epoch": 0.945273631840796,
"grad_norm": 1352.6234130859375,
"learning_rate": 3.864500616019228e-09,
"logits/chosen": -5.815937519073486,
"logits/rejected": -5.816874980926514,
"logps/chosen": -930.3800048828125,
"logps/rejected": -871.1799926757812,
"loss": 3.6411,
"rewards/accuracies": 0.5568749904632568,
"rewards/chosen": 6.584997653961182,
"rewards/margins": 1.1969140768051147,
"rewards/rejected": 5.385488510131836,
"step": 4750
},
{
"epoch": 0.9552238805970149,
"grad_norm": 1797.4112548828125,
"learning_rate": 2.593354470077802e-09,
"logits/chosen": -5.836249828338623,
"logits/rejected": -5.78249979019165,
"logps/chosen": -942.0,
"logps/rejected": -871.239990234375,
"loss": 3.9736,
"rewards/accuracies": 0.5318750143051147,
"rewards/chosen": 5.091171741485596,
"rewards/margins": 0.1407189965248108,
"rewards/rejected": 4.95010232925415,
"step": 4800
},
{
"epoch": 0.9651741293532339,
"grad_norm": 1682.3397216796875,
"learning_rate": 1.5738611604260433e-09,
"logits/chosen": -5.798749923706055,
"logits/rejected": -5.760000228881836,
"logps/chosen": -901.0,
"logps/rejected": -837.4600219726562,
"loss": 3.8623,
"rewards/accuracies": 0.5318750143051147,
"rewards/chosen": 6.400158882141113,
"rewards/margins": 0.044941406697034836,
"rewards/rejected": 6.351467132568359,
"step": 4850
},
{
"epoch": 0.9751243781094527,
"grad_norm": 1607.166259765625,
"learning_rate": 8.070576777333138e-10,
"logits/chosen": -5.75390625,
"logits/rejected": -5.801562309265137,
"logps/chosen": -863.0599975585938,
"logps/rejected": -844.0399780273438,
"loss": 3.1538,
"rewards/accuracies": 0.5356249809265137,
"rewards/chosen": 7.545395374298096,
"rewards/margins": 1.907900333404541,
"rewards/rejected": 5.632500171661377,
"step": 4900
},
{
"epoch": 0.9850746268656716,
"grad_norm": 1843.007568359375,
"learning_rate": 2.937239859770735e-10,
"logits/chosen": -5.721562385559082,
"logits/rejected": -5.678124904632568,
"logps/chosen": -901.3200073242188,
"logps/rejected": -831.760009765625,
"loss": 4.2105,
"rewards/accuracies": 0.5493749976158142,
"rewards/chosen": 8.320673942565918,
"rewards/margins": 0.23246826231479645,
"rewards/rejected": 8.0889253616333,
"step": 4950
},
{
"epoch": 0.9950248756218906,
"grad_norm": 2304.98828125,
"learning_rate": 3.4382229092522196e-11,
"logits/chosen": -5.78781270980835,
"logits/rejected": -5.800312519073486,
"logps/chosen": -910.6599731445312,
"logps/rejected": -895.8599853515625,
"loss": 3.3647,
"rewards/accuracies": 0.5425000190734863,
"rewards/chosen": 7.996386528015137,
"rewards/margins": 1.4546045064926147,
"rewards/rejected": 6.5366530418396,
"step": 5000
}
],
"logging_steps": 50,
"max_steps": 5025,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
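
The log_history array above is plain JSON, so the run can be inspected without re-running training. A minimal sketch follows, assuming the file is saved locally as trainer_state.json and that matplotlib is installed; it loads the state and plots the logged loss and rewards/margins (the DPO chosen-minus-rejected reward gap) against the training step.

import json

import matplotlib.pyplot as plt

# Load the Trainer state dumped alongside the checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

# One dict per logging event (every 50 steps in this run, per "logging_steps").
history = state["log_history"]
steps = [row["step"] for row in history]
loss = [row["loss"] for row in history]
margins = [row["rewards/margins"] for row in history]

# Two stacked panels sharing the step axis: training loss and reward margin.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax1.plot(steps, loss)
ax1.set_ylabel("loss")
ax2.plot(steps, margins)
ax2.set_ylabel("rewards/margins")
ax2.set_xlabel("step")
fig.tight_layout()
plt.show()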