trustalign_qwen2.5_0.5b / trainer_state.json
shanghong's picture
Upload folder using huggingface_hub
da5020e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.6155917425310937,
"eval_steps": 20,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004615976407231696,
"grad_norm": 92.4314081607968,
"learning_rate": 1.1494252873563218e-08,
"logits/chosen": -1.3403388261795044,
"logits/rejected": -1.3443610668182373,
"logps/chosen": -48.98606872558594,
"logps/rejected": -52.890384674072266,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.009231952814463392,
"grad_norm": 114.27394093227154,
"learning_rate": 2.2988505747126436e-08,
"logits/chosen": -1.3453574180603027,
"logits/rejected": -1.3622318506240845,
"logps/chosen": -39.51582336425781,
"logps/rejected": -55.267478942871094,
"loss": 0.6984,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": -0.0065580871887505054,
"rewards/margins": -0.004221578594297171,
"rewards/rejected": -0.002336508594453335,
"step": 4
},
{
"epoch": 0.01384792922169509,
"grad_norm": 150.47548740563838,
"learning_rate": 3.448275862068965e-08,
"logits/chosen": -1.3104500770568848,
"logits/rejected": -1.3256760835647583,
"logps/chosen": -46.711997985839844,
"logps/rejected": -61.08738327026367,
"loss": 0.7077,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": -0.015531142242252827,
"rewards/margins": -0.022088024765253067,
"rewards/rejected": 0.006556881591677666,
"step": 6
},
{
"epoch": 0.018463905628926785,
"grad_norm": 127.64708744370179,
"learning_rate": 4.597701149425287e-08,
"logits/chosen": -1.3497395515441895,
"logits/rejected": -1.3723570108413696,
"logps/chosen": -50.4114875793457,
"logps/rejected": -67.92998504638672,
"loss": 0.7139,
"rewards/accuracies": 0.4027777910232544,
"rewards/chosen": 0.04585569351911545,
"rewards/margins": -0.033931903541088104,
"rewards/rejected": 0.07978759706020355,
"step": 8
},
{
"epoch": 0.023079882036158482,
"grad_norm": 104.28603257129862,
"learning_rate": 5.747126436781609e-08,
"logits/chosen": -1.3193544149398804,
"logits/rejected": -1.3253015279769897,
"logps/chosen": -48.21293258666992,
"logps/rejected": -55.63939666748047,
"loss": 0.6759,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.09531965851783752,
"rewards/margins": 0.03943055123090744,
"rewards/rejected": 0.055889103561639786,
"step": 10
},
{
"epoch": 0.02769585844339018,
"grad_norm": 122.92866276312793,
"learning_rate": 6.89655172413793e-08,
"logits/chosen": -1.356438159942627,
"logits/rejected": -1.3778866529464722,
"logps/chosen": -47.38197326660156,
"logps/rejected": -62.85205841064453,
"loss": 0.7012,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.021549424156546593,
"rewards/margins": -0.00793980248272419,
"rewards/rejected": 0.029489226639270782,
"step": 12
},
{
"epoch": 0.032311834850621876,
"grad_norm": 141.4940238366865,
"learning_rate": 8.045977011494252e-08,
"logits/chosen": -1.2811020612716675,
"logits/rejected": -1.3018286228179932,
"logps/chosen": -53.21059799194336,
"logps/rejected": -68.97090148925781,
"loss": 0.6877,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": -0.0111711286008358,
"rewards/margins": 0.018631484359502792,
"rewards/rejected": -0.029802612960338593,
"step": 14
},
{
"epoch": 0.03692781125785357,
"grad_norm": 108.13024807755075,
"learning_rate": 9.195402298850574e-08,
"logits/chosen": -1.3367334604263306,
"logits/rejected": -1.3522446155548096,
"logps/chosen": -40.02373504638672,
"logps/rejected": -54.60912322998047,
"loss": 0.702,
"rewards/accuracies": 0.4166666567325592,
"rewards/chosen": 0.010513358749449253,
"rewards/margins": -0.013667477294802666,
"rewards/rejected": 0.024180836975574493,
"step": 16
},
{
"epoch": 0.04154378766508527,
"grad_norm": 90.94991906906556,
"learning_rate": 1.0344827586206897e-07,
"logits/chosen": -1.2363929748535156,
"logits/rejected": -1.2396348714828491,
"logps/chosen": -56.863731384277344,
"logps/rejected": -53.349342346191406,
"loss": 0.6938,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": 0.07457832247018814,
"rewards/margins": 0.004799458663910627,
"rewards/rejected": 0.06977886706590652,
"step": 18
},
{
"epoch": 0.046159764072316964,
"grad_norm": 166.09887872402663,
"learning_rate": 1.1494252873563217e-07,
"logits/chosen": -1.2872830629348755,
"logits/rejected": -1.317036747932434,
"logps/chosen": -49.23244857788086,
"logps/rejected": -71.62715911865234,
"loss": 0.7087,
"rewards/accuracies": 0.5694444179534912,
"rewards/chosen": 0.02608451619744301,
"rewards/margins": -0.02054634317755699,
"rewards/rejected": 0.046630859375,
"step": 20
},
{
"epoch": 0.046159764072316964,
"eval_logits/chosen": -1.262330412864685,
"eval_logits/rejected": -1.273974895477295,
"eval_logps/chosen": -48.664798736572266,
"eval_logps/rejected": -56.1088752746582,
"eval_loss": 0.6998714804649353,
"eval_rewards/accuracies": 0.4228110611438751,
"eval_rewards/chosen": -0.01568525843322277,
"eval_rewards/margins": -0.007799813989549875,
"eval_rewards/rejected": -0.007885444909334183,
"eval_runtime": 231.8501,
"eval_samples_per_second": 7.479,
"eval_steps_per_second": 1.872,
"step": 20
},
{
"epoch": 0.05077574047954866,
"grad_norm": 141.4312370714434,
"learning_rate": 1.2643678160919542e-07,
"logits/chosen": -1.328560709953308,
"logits/rejected": -1.3631365299224854,
"logps/chosen": -50.79507827758789,
"logps/rejected": -79.58642578125,
"loss": 0.6838,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.04927082732319832,
"rewards/margins": 0.02934443950653076,
"rewards/rejected": 0.019926389679312706,
"step": 22
},
{
"epoch": 0.05539171688678036,
"grad_norm": 106.70546600458962,
"learning_rate": 1.379310344827586e-07,
"logits/chosen": -1.335903286933899,
"logits/rejected": -1.3434231281280518,
"logps/chosen": -59.29114532470703,
"logps/rejected": -62.07218933105469,
"loss": 0.6961,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.00791182741522789,
"rewards/margins": -0.0008803076343610883,
"rewards/rejected": 0.008792135864496231,
"step": 24
},
{
"epoch": 0.06000769329401205,
"grad_norm": 84.5108710571048,
"learning_rate": 1.4942528735632184e-07,
"logits/chosen": -1.315495491027832,
"logits/rejected": -1.314201831817627,
"logps/chosen": -52.26453399658203,
"logps/rejected": -46.654151916503906,
"loss": 0.6879,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.10577751696109772,
"rewards/margins": 0.01635124906897545,
"rewards/rejected": 0.08942626416683197,
"step": 26
},
{
"epoch": 0.06462366970124375,
"grad_norm": 142.3451166395631,
"learning_rate": 1.6091954022988505e-07,
"logits/chosen": -1.3120254278182983,
"logits/rejected": -1.3418428897857666,
"logps/chosen": -54.30976486206055,
"logps/rejected": -76.96250915527344,
"loss": 0.6796,
"rewards/accuracies": 0.4722222089767456,
"rewards/chosen": 0.14193940162658691,
"rewards/margins": 0.03795723244547844,
"rewards/rejected": 0.10398217290639877,
"step": 28
},
{
"epoch": 0.06923964610847544,
"grad_norm": 85.45105537711353,
"learning_rate": 1.7241379310344828e-07,
"logits/chosen": -1.339949131011963,
"logits/rejected": -1.35366952419281,
"logps/chosen": -47.45890808105469,
"logps/rejected": -56.32393264770508,
"loss": 0.685,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.12981660664081573,
"rewards/margins": 0.022233910858631134,
"rewards/rejected": 0.1075827032327652,
"step": 30
},
{
"epoch": 0.07385562251570714,
"grad_norm": 87.30098240848938,
"learning_rate": 1.839080459770115e-07,
"logits/chosen": -1.3168249130249023,
"logits/rejected": -1.3266100883483887,
"logps/chosen": -48.27603530883789,
"logps/rejected": -54.10696792602539,
"loss": 0.6693,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 0.2039007544517517,
"rewards/margins": 0.05477040261030197,
"rewards/rejected": 0.14913035929203033,
"step": 32
},
{
"epoch": 0.07847159892293884,
"grad_norm": 81.90822350206311,
"learning_rate": 1.9540229885057472e-07,
"logits/chosen": -1.3052334785461426,
"logits/rejected": -1.3256360292434692,
"logps/chosen": -44.5953483581543,
"logps/rejected": -61.29960250854492,
"loss": 0.674,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": 0.29883071780204773,
"rewards/margins": 0.04845905303955078,
"rewards/rejected": 0.25037166476249695,
"step": 34
},
{
"epoch": 0.08308757533017054,
"grad_norm": 97.62737031971352,
"learning_rate": 2.0689655172413793e-07,
"logits/chosen": -1.3107632398605347,
"logits/rejected": -1.3140422105789185,
"logps/chosen": -51.13896179199219,
"logps/rejected": -48.718727111816406,
"loss": 0.6724,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.2992264926433563,
"rewards/margins": 0.05020047724246979,
"rewards/rejected": 0.24902603030204773,
"step": 36
},
{
"epoch": 0.08770355173740223,
"grad_norm": 91.70503719282425,
"learning_rate": 2.1839080459770114e-07,
"logits/chosen": -1.258926510810852,
"logits/rejected": -1.2699991464614868,
"logps/chosen": -50.59396743774414,
"logps/rejected": -56.2684326171875,
"loss": 0.6819,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.33299052715301514,
"rewards/margins": 0.038206882774829865,
"rewards/rejected": 0.29478365182876587,
"step": 38
},
{
"epoch": 0.09231952814463393,
"grad_norm": 92.44581774007628,
"learning_rate": 2.2988505747126435e-07,
"logits/chosen": -1.3053722381591797,
"logits/rejected": -1.316298007965088,
"logps/chosen": -52.49648666381836,
"logps/rejected": -56.09816360473633,
"loss": 0.6665,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": 0.38449087738990784,
"rewards/margins": 0.07676863670349121,
"rewards/rejected": 0.30772221088409424,
"step": 40
},
{
"epoch": 0.09231952814463393,
"eval_logits/chosen": -1.2573766708374023,
"eval_logits/rejected": -1.269149899482727,
"eval_logps/chosen": -47.605804443359375,
"eval_logps/rejected": -55.2476806640625,
"eval_loss": 0.6606337428092957,
"eval_rewards/accuracies": 0.5725806355476379,
"eval_rewards/chosen": 0.5138096213340759,
"eval_rewards/margins": 0.09109989553689957,
"eval_rewards/rejected": 0.42270979285240173,
"eval_runtime": 227.103,
"eval_samples_per_second": 7.635,
"eval_steps_per_second": 1.911,
"step": 40
},
{
"epoch": 0.09693550455186563,
"grad_norm": 83.51334071319128,
"learning_rate": 2.413793103448276e-07,
"logits/chosen": -1.3424174785614014,
"logits/rejected": -1.3526452779769897,
"logps/chosen": -44.44143295288086,
"logps/rejected": -49.79873275756836,
"loss": 0.6451,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.5364899039268494,
"rewards/margins": 0.11971965432167053,
"rewards/rejected": 0.4167703092098236,
"step": 42
},
{
"epoch": 0.10155148095909731,
"grad_norm": 97.54181471785779,
"learning_rate": 2.5287356321839084e-07,
"logits/chosen": -1.307891845703125,
"logits/rejected": -1.3404256105422974,
"logps/chosen": -50.57395935058594,
"logps/rejected": -79.52447509765625,
"loss": 0.6618,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.5593165159225464,
"rewards/margins": 0.12356305122375488,
"rewards/rejected": 0.4357534646987915,
"step": 44
},
{
"epoch": 0.10616745736632902,
"grad_norm": 87.61515534490086,
"learning_rate": 2.64367816091954e-07,
"logits/chosen": -1.2781007289886475,
"logits/rejected": -1.2948905229568481,
"logps/chosen": -55.20244216918945,
"logps/rejected": -57.26641845703125,
"loss": 0.656,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 0.5990750193595886,
"rewards/margins": 0.11623137444257736,
"rewards/rejected": 0.4828437268733978,
"step": 46
},
{
"epoch": 0.11078343377356072,
"grad_norm": 89.38653936134894,
"learning_rate": 2.758620689655172e-07,
"logits/chosen": -1.2956469058990479,
"logits/rejected": -1.3047106266021729,
"logps/chosen": -48.0570068359375,
"logps/rejected": -54.5909423828125,
"loss": 0.6482,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.6316156983375549,
"rewards/margins": 0.15523308515548706,
"rewards/rejected": 0.47638261318206787,
"step": 48
},
{
"epoch": 0.1153994101807924,
"grad_norm": 80.50515017031447,
"learning_rate": 2.873563218390804e-07,
"logits/chosen": -1.3206638097763062,
"logits/rejected": -1.328073501586914,
"logps/chosen": -51.81482696533203,
"logps/rejected": -51.81681442260742,
"loss": 0.6387,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.7185304164886475,
"rewards/margins": 0.1627696007490158,
"rewards/rejected": 0.5557608008384705,
"step": 50
},
{
"epoch": 0.1200153865880241,
"grad_norm": 81.34880289712139,
"learning_rate": 2.988505747126437e-07,
"logits/chosen": -1.3584266901016235,
"logits/rejected": -1.3828377723693848,
"logps/chosen": -44.66862869262695,
"logps/rejected": -66.46587371826172,
"loss": 0.6441,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 0.8013312816619873,
"rewards/margins": 0.21720875799655914,
"rewards/rejected": 0.5841224789619446,
"step": 52
},
{
"epoch": 0.1246313629952558,
"grad_norm": 77.01376791928342,
"learning_rate": 3.103448275862069e-07,
"logits/chosen": -1.4568628072738647,
"logits/rejected": -1.4819716215133667,
"logps/chosen": -47.80024719238281,
"logps/rejected": -64.31399536132812,
"loss": 0.6113,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": 0.7414547204971313,
"rewards/margins": 0.2719371020793915,
"rewards/rejected": 0.4695175588130951,
"step": 54
},
{
"epoch": 0.1292473394024875,
"grad_norm": 72.36047181825025,
"learning_rate": 3.218390804597701e-07,
"logits/chosen": -1.2487589120864868,
"logits/rejected": -1.250648021697998,
"logps/chosen": -44.55437469482422,
"logps/rejected": -49.30759811401367,
"loss": 0.6265,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.8203690052032471,
"rewards/margins": 0.1877826303243637,
"rewards/rejected": 0.6325862407684326,
"step": 56
},
{
"epoch": 0.1338633158097192,
"grad_norm": 82.28322419724,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -1.3345168828964233,
"logits/rejected": -1.3384166955947876,
"logps/chosen": -46.937095642089844,
"logps/rejected": -48.4022216796875,
"loss": 0.5824,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": 0.8726701736450195,
"rewards/margins": 0.2923206090927124,
"rewards/rejected": 0.5803494453430176,
"step": 58
},
{
"epoch": 0.13847929221695088,
"grad_norm": 71.91030915826812,
"learning_rate": 3.4482758620689656e-07,
"logits/chosen": -1.326303243637085,
"logits/rejected": -1.3416494131088257,
"logps/chosen": -45.597774505615234,
"logps/rejected": -55.49925994873047,
"loss": 0.5933,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.8803545236587524,
"rewards/margins": 0.32467857003211975,
"rewards/rejected": 0.5556759834289551,
"step": 60
},
{
"epoch": 0.13847929221695088,
"eval_logits/chosen": -1.2449160814285278,
"eval_logits/rejected": -1.2571001052856445,
"eval_logps/chosen": -46.695552825927734,
"eval_logps/rejected": -54.76215362548828,
"eval_loss": 0.6056556105613708,
"eval_rewards/accuracies": 0.6440092325210571,
"eval_rewards/chosen": 0.9689397215843201,
"eval_rewards/margins": 0.30346596240997314,
"eval_rewards/rejected": 0.6654736995697021,
"eval_runtime": 226.9584,
"eval_samples_per_second": 7.64,
"eval_steps_per_second": 1.912,
"step": 60
},
{
"epoch": 0.1430952686241826,
"grad_norm": 73.53474176254309,
"learning_rate": 3.5632183908045977e-07,
"logits/chosen": -1.302392840385437,
"logits/rejected": -1.3110175132751465,
"logps/chosen": -47.1639289855957,
"logps/rejected": -52.83234786987305,
"loss": 0.6038,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.9415748119354248,
"rewards/margins": 0.2688363194465637,
"rewards/rejected": 0.6727384924888611,
"step": 62
},
{
"epoch": 0.14771124503141428,
"grad_norm": 72.82764174123524,
"learning_rate": 3.67816091954023e-07,
"logits/chosen": -1.3600918054580688,
"logits/rejected": -1.3759901523590088,
"logps/chosen": -48.68782043457031,
"logps/rejected": -59.76481246948242,
"loss": 0.5893,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 1.036276936531067,
"rewards/margins": 0.393592894077301,
"rewards/rejected": 0.6426840424537659,
"step": 64
},
{
"epoch": 0.152327221438646,
"grad_norm": 72.4502448635078,
"learning_rate": 3.793103448275862e-07,
"logits/chosen": -1.254841923713684,
"logits/rejected": -1.2786000967025757,
"logps/chosen": -45.10692596435547,
"logps/rejected": -67.49703216552734,
"loss": 0.5441,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 1.195844292640686,
"rewards/margins": 0.5360373854637146,
"rewards/rejected": 0.6598069667816162,
"step": 66
},
{
"epoch": 0.15694319784587768,
"grad_norm": 137.30984216926402,
"learning_rate": 3.9080459770114945e-07,
"logits/chosen": -1.3896088600158691,
"logits/rejected": -1.4264814853668213,
"logps/chosen": -41.05742645263672,
"logps/rejected": -65.78131103515625,
"loss": 0.6531,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.2528166770935059,
"rewards/margins": 0.4981537461280823,
"rewards/rejected": 0.7546629905700684,
"step": 68
},
{
"epoch": 0.16155917425310937,
"grad_norm": 72.56751313040455,
"learning_rate": 4.0229885057471266e-07,
"logits/chosen": -1.3160932064056396,
"logits/rejected": -1.3216156959533691,
"logps/chosen": -42.512996673583984,
"logps/rejected": -46.83217239379883,
"loss": 0.5056,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 1.5907378196716309,
"rewards/margins": 0.5790587067604065,
"rewards/rejected": 1.0116791725158691,
"step": 70
},
{
"epoch": 0.16617515066034108,
"grad_norm": 78.13005318225228,
"learning_rate": 4.1379310344827586e-07,
"logits/chosen": -1.2581322193145752,
"logits/rejected": -1.2822985649108887,
"logps/chosen": -44.267303466796875,
"logps/rejected": -71.07489013671875,
"loss": 0.5435,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 1.4014304876327515,
"rewards/margins": 0.723700761795044,
"rewards/rejected": 0.6777297258377075,
"step": 72
},
{
"epoch": 0.17079112706757277,
"grad_norm": 56.428824716251846,
"learning_rate": 4.25287356321839e-07,
"logits/chosen": -1.2850017547607422,
"logits/rejected": -1.2932665348052979,
"logps/chosen": -52.70983123779297,
"logps/rejected": -53.61183166503906,
"loss": 0.6011,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 1.2770977020263672,
"rewards/margins": 0.5247661471366882,
"rewards/rejected": 0.7523314952850342,
"step": 74
},
{
"epoch": 0.17540710347480445,
"grad_norm": 80.88401808498448,
"learning_rate": 4.367816091954023e-07,
"logits/chosen": -1.2536025047302246,
"logits/rejected": -1.2705798149108887,
"logps/chosen": -47.25906753540039,
"logps/rejected": -60.58863067626953,
"loss": 0.5302,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 1.4430739879608154,
"rewards/margins": 0.5440190434455872,
"rewards/rejected": 0.8990550637245178,
"step": 76
},
{
"epoch": 0.18002307988203617,
"grad_norm": 68.76597330947016,
"learning_rate": 4.482758620689655e-07,
"logits/chosen": -1.2450529336929321,
"logits/rejected": -1.2557368278503418,
"logps/chosen": -42.16405487060547,
"logps/rejected": -52.293785095214844,
"loss": 0.5623,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 1.489140510559082,
"rewards/margins": 0.561518669128418,
"rewards/rejected": 0.9276217222213745,
"step": 78
},
{
"epoch": 0.18463905628926786,
"grad_norm": 74.75348151064274,
"learning_rate": 4.597701149425287e-07,
"logits/chosen": -1.3210065364837646,
"logits/rejected": -1.3294503688812256,
"logps/chosen": -42.845787048339844,
"logps/rejected": -46.24819564819336,
"loss": 0.5108,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 1.1906384229660034,
"rewards/margins": 0.6210441589355469,
"rewards/rejected": 0.5695942640304565,
"step": 80
},
{
"epoch": 0.18463905628926786,
"eval_logits/chosen": -1.226657509803772,
"eval_logits/rejected": -1.2391773462295532,
"eval_logps/chosen": -46.65249252319336,
"eval_logps/rejected": -55.351402282714844,
"eval_loss": 0.5300613045692444,
"eval_rewards/accuracies": 0.7142857313156128,
"eval_rewards/chosen": 0.9904682636260986,
"eval_rewards/margins": 0.6196123957633972,
"eval_rewards/rejected": 0.3708558976650238,
"eval_runtime": 227.2212,
"eval_samples_per_second": 7.631,
"eval_steps_per_second": 1.91,
"step": 80
},
{
"epoch": 0.18925503269649954,
"grad_norm": 60.96010208828231,
"learning_rate": 4.712643678160919e-07,
"logits/chosen": -1.2890104055404663,
"logits/rejected": -1.3069978952407837,
"logps/chosen": -50.1616096496582,
"logps/rejected": -61.80992126464844,
"loss": 0.485,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.9283716678619385,
"rewards/margins": 0.7194375991821289,
"rewards/rejected": 0.20893406867980957,
"step": 82
},
{
"epoch": 0.19387100910373126,
"grad_norm": 58.75128856791501,
"learning_rate": 4.827586206896552e-07,
"logits/chosen": -1.282674789428711,
"logits/rejected": -1.3157954216003418,
"logps/chosen": -42.50400161743164,
"logps/rejected": -75.54652404785156,
"loss": 0.4729,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.7927578091621399,
"rewards/margins": 1.0220049619674683,
"rewards/rejected": -0.22924719750881195,
"step": 84
},
{
"epoch": 0.19848698551096294,
"grad_norm": 69.45296124181107,
"learning_rate": 4.942528735632184e-07,
"logits/chosen": -1.2970733642578125,
"logits/rejected": -1.3046212196350098,
"logps/chosen": -48.21321487426758,
"logps/rejected": -59.63574981689453,
"loss": 0.4753,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.7275898456573486,
"rewards/margins": 0.7579395771026611,
"rewards/rejected": -0.030349718406796455,
"step": 86
},
{
"epoch": 0.20310296191819463,
"grad_norm": 57.934663164568036,
"learning_rate": 4.999979670146248e-07,
"logits/chosen": -1.322100043296814,
"logits/rejected": -1.333228588104248,
"logps/chosen": -51.94272232055664,
"logps/rejected": -62.809814453125,
"loss": 0.4698,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.5794834494590759,
"rewards/margins": 0.9340064525604248,
"rewards/rejected": -0.3545229136943817,
"step": 88
},
{
"epoch": 0.20771893832542634,
"grad_norm": 67.05839544195749,
"learning_rate": 4.99981703330008e-07,
"logits/chosen": -1.2547653913497925,
"logits/rejected": -1.2667738199234009,
"logps/chosen": -44.88441467285156,
"logps/rejected": -55.610042572021484,
"loss": 0.4928,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.672675371170044,
"rewards/margins": 0.72029709815979,
"rewards/rejected": -0.04762159287929535,
"step": 90
},
{
"epoch": 0.21233491473265803,
"grad_norm": 51.75653250091916,
"learning_rate": 4.99949177018813e-07,
"logits/chosen": -1.3547184467315674,
"logits/rejected": -1.3632615804672241,
"logps/chosen": -41.517826080322266,
"logps/rejected": -52.83651351928711,
"loss": 0.3985,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.8576219081878662,
"rewards/margins": 1.0123066902160645,
"rewards/rejected": -0.15468484163284302,
"step": 92
},
{
"epoch": 0.21695089113988972,
"grad_norm": 81.64528533284856,
"learning_rate": 4.999003901970474e-07,
"logits/chosen": -1.3031115531921387,
"logits/rejected": -1.3127225637435913,
"logps/chosen": -54.79065704345703,
"logps/rejected": -54.33130645751953,
"loss": 0.5699,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.6079578995704651,
"rewards/margins": 0.6609423160552979,
"rewards/rejected": -0.05298437178134918,
"step": 94
},
{
"epoch": 0.22156686754712143,
"grad_norm": 50.92778720915042,
"learning_rate": 4.998353460385512e-07,
"logits/chosen": -1.2541792392730713,
"logits/rejected": -1.2738375663757324,
"logps/chosen": -45.45362091064453,
"logps/rejected": -64.12794494628906,
"loss": 0.4024,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.9434598088264465,
"rewards/margins": 1.2422370910644531,
"rewards/rejected": -0.29877743124961853,
"step": 96
},
{
"epoch": 0.22618284395435312,
"grad_norm": 56.275721433856205,
"learning_rate": 4.997540487747892e-07,
"logits/chosen": -1.2653698921203613,
"logits/rejected": -1.2802023887634277,
"logps/chosen": -44.68564987182617,
"logps/rejected": -66.80543518066406,
"loss": 0.4761,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.8570264577865601,
"rewards/margins": 1.2518484592437744,
"rewards/rejected": -0.39482197165489197,
"step": 98
},
{
"epoch": 0.2307988203615848,
"grad_norm": 52.70179467657195,
"learning_rate": 4.996565036945769e-07,
"logits/chosen": -1.2993725538253784,
"logits/rejected": -1.3049986362457275,
"logps/chosen": -50.757686614990234,
"logps/rejected": -53.39494323730469,
"loss": 0.474,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.6508947014808655,
"rewards/margins": 0.9926181435585022,
"rewards/rejected": -0.3417234420776367,
"step": 100
},
{
"epoch": 0.2307988203615848,
"eval_logits/chosen": -1.2166502475738525,
"eval_logits/rejected": -1.2277939319610596,
"eval_logps/chosen": -46.99800109863281,
"eval_logps/rejected": -56.576629638671875,
"eval_loss": 0.45405343174934387,
"eval_rewards/accuracies": 0.7511520981788635,
"eval_rewards/chosen": 0.8177129626274109,
"eval_rewards/margins": 1.0594747066497803,
"eval_rewards/rejected": -0.24176181852817535,
"eval_runtime": 227.4246,
"eval_samples_per_second": 7.625,
"eval_steps_per_second": 1.908,
"step": 100
},
{
"epoch": 0.23541479676881652,
"grad_norm": 44.630871285034665,
"learning_rate": 4.995427171437356e-07,
"logits/chosen": -1.2710050344467163,
"logits/rejected": -1.2925546169281006,
"logps/chosen": -44.29911422729492,
"logps/rejected": -63.83744430541992,
"loss": 0.3944,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.7817994356155396,
"rewards/margins": 1.3069223165512085,
"rewards/rejected": -0.5251227617263794,
"step": 102
},
{
"epoch": 0.2400307731760482,
"grad_norm": 63.36947457687719,
"learning_rate": 4.994126965246796e-07,
"logits/chosen": -1.281785488128662,
"logits/rejected": -1.2921262979507446,
"logps/chosen": -45.61968994140625,
"logps/rejected": -57.57981872558594,
"loss": 0.4314,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.837566077709198,
"rewards/margins": 1.084651231765747,
"rewards/rejected": -0.247085303068161,
"step": 104
},
{
"epoch": 0.24464674958327992,
"grad_norm": 64.3165584011508,
"learning_rate": 4.992664502959351e-07,
"logits/chosen": -1.2655751705169678,
"logits/rejected": -1.3007822036743164,
"logps/chosen": -42.23821258544922,
"logps/rejected": -85.14391326904297,
"loss": 0.3478,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.9540318250656128,
"rewards/margins": 1.9801336526870728,
"rewards/rejected": -1.0261015892028809,
"step": 106
},
{
"epoch": 0.2492627259905116,
"grad_norm": 68.33245668142573,
"learning_rate": 4.991039879715898e-07,
"logits/chosen": -1.2239530086517334,
"logits/rejected": -1.2506436109542847,
"logps/chosen": -48.302852630615234,
"logps/rejected": -70.37635803222656,
"loss": 0.4095,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 1.1005271673202515,
"rewards/margins": 1.5407756567001343,
"rewards/rejected": -0.4402484893798828,
"step": 108
},
{
"epoch": 0.2538787023977433,
"grad_norm": 41.58256253343475,
"learning_rate": 4.989253201206736e-07,
"logits/chosen": -1.3317803144454956,
"logits/rejected": -1.3334723711013794,
"logps/chosen": -47.94260787963867,
"logps/rejected": -48.03237533569336,
"loss": 0.4524,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.9174912571907043,
"rewards/margins": 1.0279741287231445,
"rewards/rejected": -0.11048289388418198,
"step": 110
},
{
"epoch": 0.258494678804975,
"grad_norm": 66.23818262108296,
"learning_rate": 4.987304583664712e-07,
"logits/chosen": -1.2193766832351685,
"logits/rejected": -1.2288120985031128,
"logps/chosen": -55.089717864990234,
"logps/rejected": -61.21225357055664,
"loss": 0.4449,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.7610146999359131,
"rewards/margins": 1.0799916982650757,
"rewards/rejected": -0.3189769983291626,
"step": 112
},
{
"epoch": 0.26311065521220667,
"grad_norm": 58.36439834992655,
"learning_rate": 4.985194153857662e-07,
"logits/chosen": -1.3395094871520996,
"logits/rejected": -1.3416942358016968,
"logps/chosen": -43.976890563964844,
"logps/rejected": -45.82760238647461,
"loss": 0.4929,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.4750349223613739,
"rewards/margins": 0.913371741771698,
"rewards/rejected": -0.4383367896080017,
"step": 114
},
{
"epoch": 0.2677266316194384,
"grad_norm": 41.441657797600875,
"learning_rate": 4.982922049080163e-07,
"logits/chosen": -1.3572431802749634,
"logits/rejected": -1.3625115156173706,
"logps/chosen": -42.45515441894531,
"logps/rejected": -49.73173522949219,
"loss": 0.3691,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.41610708832740784,
"rewards/margins": 1.2800379991531372,
"rewards/rejected": -0.8639309406280518,
"step": 116
},
{
"epoch": 0.2723426080266701,
"grad_norm": 60.30435878433939,
"learning_rate": 4.980488417144599e-07,
"logits/chosen": -1.2863659858703613,
"logits/rejected": -1.3199315071105957,
"logps/chosen": -48.62416076660156,
"logps/rejected": -85.73699188232422,
"loss": 0.4597,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.11831729859113693,
"rewards/margins": 1.6947745084762573,
"rewards/rejected": -1.5764573812484741,
"step": 118
},
{
"epoch": 0.27695858443390176,
"grad_norm": 41.355652091388535,
"learning_rate": 4.977893416371544e-07,
"logits/chosen": -1.2884269952774048,
"logits/rejected": -1.2976269721984863,
"logps/chosen": -41.79518127441406,
"logps/rejected": -54.925662994384766,
"loss": 0.3826,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.0362311452627182,
"rewards/margins": 1.4945807456970215,
"rewards/rejected": -1.4583497047424316,
"step": 120
},
{
"epoch": 0.27695858443390176,
"eval_logits/chosen": -1.207366704940796,
"eval_logits/rejected": -1.2173371315002441,
"eval_logps/chosen": -48.50833511352539,
"eval_logps/rejected": -58.64011764526367,
"eval_loss": 0.4093886911869049,
"eval_rewards/accuracies": 0.7724654674530029,
"eval_rewards/chosen": 0.06255079805850983,
"eval_rewards/margins": 1.336057424545288,
"eval_rewards/rejected": -1.27350652217865,
"eval_runtime": 227.1307,
"eval_samples_per_second": 7.634,
"eval_steps_per_second": 1.911,
"step": 120
},
{
"epoch": 0.28157456084113347,
"grad_norm": 57.25843545889309,
"learning_rate": 4.975137215579469e-07,
"logits/chosen": -1.1866450309753418,
"logits/rejected": -1.186094045639038,
"logps/chosen": -55.403018951416016,
"logps/rejected": -51.23255920410156,
"loss": 0.4266,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.05289599671959877,
"rewards/margins": 1.1414172649383545,
"rewards/rejected": -1.0885213613510132,
"step": 122
},
{
"epoch": 0.2861905372483652,
"grad_norm": 39.62322228531182,
"learning_rate": 4.972219994073755e-07,
"logits/chosen": -1.18235182762146,
"logits/rejected": -1.2118382453918457,
"logps/chosen": -48.611637115478516,
"logps/rejected": -79.41681671142578,
"loss": 0.3892,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": -0.14360421895980835,
"rewards/margins": 1.8774958848953247,
"rewards/rejected": -2.0211000442504883,
"step": 124
},
{
"epoch": 0.2908065136555969,
"grad_norm": 52.17921097346343,
"learning_rate": 4.969141941635025e-07,
"logits/chosen": -1.2253869771957397,
"logits/rejected": -1.2435510158538818,
"logps/chosen": -50.45375061035156,
"logps/rejected": -69.55599212646484,
"loss": 0.4746,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.31797370314598083,
"rewards/margins": 1.7364860773086548,
"rewards/rejected": -2.054459571838379,
"step": 126
},
{
"epoch": 0.29542249006282856,
"grad_norm": 70.38261738253976,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -1.1796499490737915,
"logits/rejected": -1.1957367658615112,
"logps/chosen": -49.15879440307617,
"logps/rejected": -73.3575210571289,
"loss": 0.3346,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.04160241410136223,
"rewards/margins": 1.7600233554840088,
"rewards/rejected": -1.7184207439422607,
"step": 128
},
{
"epoch": 0.30003846647006027,
"grad_norm": 57.934452383223125,
"learning_rate": 4.962504155382493e-07,
"logits/chosen": -1.3256597518920898,
"logits/rejected": -1.3279542922973633,
"logps/chosen": -44.50282287597656,
"logps/rejected": -48.98247146606445,
"loss": 0.382,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.2423497587442398,
"rewards/margins": 1.2484761476516724,
"rewards/rejected": -1.0061264038085938,
"step": 130
},
{
"epoch": 0.304654442877292,
"grad_norm": 58.71983276565311,
"learning_rate": 4.958944853391652e-07,
"logits/chosen": -1.1831316947937012,
"logits/rejected": -1.1885439157485962,
"logps/chosen": -45.726505279541016,
"logps/rejected": -54.00067138671875,
"loss": 0.4078,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.6056569814682007,
"rewards/margins": 1.2579783201217651,
"rewards/rejected": -0.6523212790489197,
"step": 132
},
{
"epoch": 0.30927041928452365,
"grad_norm": 50.9411946225504,
"learning_rate": 4.955225584085624e-07,
"logits/chosen": -1.3628097772598267,
"logits/rejected": -1.3736504316329956,
"logps/chosen": -44.589229583740234,
"logps/rejected": -60.06134796142578,
"loss": 0.4245,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.8993210196495056,
"rewards/margins": 1.6077146530151367,
"rewards/rejected": -0.7083935737609863,
"step": 134
},
{
"epoch": 0.31388639569175536,
"grad_norm": 53.494824083866554,
"learning_rate": 4.951346589422467e-07,
"logits/chosen": -1.2143707275390625,
"logits/rejected": -1.2256660461425781,
"logps/chosen": -44.73698425292969,
"logps/rejected": -63.74873733520508,
"loss": 0.4379,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.0820810794830322,
"rewards/margins": 1.6512118577957153,
"rewards/rejected": -0.5691307187080383,
"step": 136
},
{
"epoch": 0.3185023720989871,
"grad_norm": 87.56798544187119,
"learning_rate": 4.94730812175122e-07,
"logits/chosen": -1.338615894317627,
"logits/rejected": -1.344118595123291,
"logps/chosen": -45.98448181152344,
"logps/rejected": -49.50959396362305,
"loss": 0.4706,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": 1.05098295211792,
"rewards/margins": 1.33613121509552,
"rewards/rejected": -0.2851482033729553,
"step": 138
},
{
"epoch": 0.32311834850621873,
"grad_norm": 68.21005063884613,
"learning_rate": 4.943110443795476e-07,
"logits/chosen": -1.258334755897522,
"logits/rejected": -1.2637797594070435,
"logps/chosen": -50.91215515136719,
"logps/rejected": -52.46604537963867,
"loss": 0.4759,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.7942720055580139,
"rewards/margins": 1.164951205253601,
"rewards/rejected": -0.37067916989326477,
"step": 140
},
{
"epoch": 0.32311834850621873,
"eval_logits/chosen": -1.1890074014663696,
"eval_logits/rejected": -1.1990076303482056,
"eval_logps/chosen": -47.355777740478516,
"eval_logps/rejected": -58.06831359863281,
"eval_loss": 0.38173291087150574,
"eval_rewards/accuracies": 0.7718893885612488,
"eval_rewards/chosen": 0.6388264298439026,
"eval_rewards/margins": 1.6264294385910034,
"eval_rewards/rejected": -0.987602949142456,
"eval_runtime": 227.2808,
"eval_samples_per_second": 7.629,
"eval_steps_per_second": 1.91,
"step": 140
},
{
"epoch": 0.32773432491345045,
"grad_norm": 50.18122842606284,
"learning_rate": 4.938753828636297e-07,
"logits/chosen": -1.2129461765289307,
"logits/rejected": -1.21940279006958,
"logps/chosen": -53.97553253173828,
"logps/rejected": -53.66658020019531,
"loss": 0.4623,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": 0.7083945274353027,
"rewards/margins": 1.2968156337738037,
"rewards/rejected": -0.5884211659431458,
"step": 142
},
{
"epoch": 0.33235030132068216,
"grad_norm": 50.59660648914643,
"learning_rate": 4.934238559694447e-07,
"logits/chosen": -1.1950477361679077,
"logits/rejected": -1.2141423225402832,
"logps/chosen": -47.11637878417969,
"logps/rejected": -65.34754943847656,
"loss": 0.3506,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.5280824899673462,
"rewards/margins": 1.7753053903579712,
"rewards/rejected": -1.2472230195999146,
"step": 144
},
{
"epoch": 0.3369662777279138,
"grad_norm": 48.072274874250304,
"learning_rate": 4.929564930711957e-07,
"logits/chosen": -1.281104564666748,
"logits/rejected": -1.2873430252075195,
"logps/chosen": -46.95094299316406,
"logps/rejected": -52.25088882446289,
"loss": 0.3785,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.3459606468677521,
"rewards/margins": 1.2943564653396606,
"rewards/rejected": -0.9483956098556519,
"step": 146
},
{
"epoch": 0.34158225413514554,
"grad_norm": 45.84138607827678,
"learning_rate": 4.924733245733008e-07,
"logits/chosen": -1.168983817100525,
"logits/rejected": -1.1675101518630981,
"logps/chosen": -53.398841857910156,
"logps/rejected": -48.5836181640625,
"loss": 0.372,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.4610249102115631,
"rewards/margins": 1.1595698595046997,
"rewards/rejected": -0.6985449194908142,
"step": 148
},
{
"epoch": 0.34619823054237725,
"grad_norm": 41.47855724733255,
"learning_rate": 4.91974381908416e-07,
"logits/chosen": -1.3004454374313354,
"logits/rejected": -1.3239775896072388,
"logps/chosen": -44.39426803588867,
"logps/rejected": -69.07200622558594,
"loss": 0.3055,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.01821739226579666,
"rewards/margins": 2.1426663398742676,
"rewards/rejected": -2.124448776245117,
"step": 150
},
{
"epoch": 0.3508142069496089,
"grad_norm": 38.94525175640219,
"learning_rate": 4.914596975353898e-07,
"logits/chosen": -1.263897180557251,
"logits/rejected": -1.2764997482299805,
"logps/chosen": -47.19563674926758,
"logps/rejected": -57.78141403198242,
"loss": 0.386,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.055071063339710236,
"rewards/margins": 1.5383036136627197,
"rewards/rejected": -1.4832323789596558,
"step": 152
},
{
"epoch": 0.3554301833568406,
"grad_norm": 66.86847446450086,
"learning_rate": 4.909293049371519e-07,
"logits/chosen": -1.2072829008102417,
"logits/rejected": -1.2103402614593506,
"logps/chosen": -55.665191650390625,
"logps/rejected": -53.18820571899414,
"loss": 0.4106,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.15528617799282074,
"rewards/margins": 1.4208853244781494,
"rewards/rejected": -1.2655991315841675,
"step": 154
},
{
"epoch": 0.36004615976407234,
"grad_norm": 58.39718753271016,
"learning_rate": 4.903832386185343e-07,
"logits/chosen": -1.246012568473816,
"logits/rejected": -1.2516732215881348,
"logps/chosen": -52.729427337646484,
"logps/rejected": -53.41749572753906,
"loss": 0.4389,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.07690320909023285,
"rewards/margins": 1.1573313474655151,
"rewards/rejected": -1.080428123474121,
"step": 156
},
{
"epoch": 0.364662136171304,
"grad_norm": 48.60983015034901,
"learning_rate": 4.89821534104028e-07,
"logits/chosen": -1.2771211862564087,
"logits/rejected": -1.2919840812683105,
"logps/chosen": -50.660606384277344,
"logps/rejected": -68.34771728515625,
"loss": 0.3141,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.26177337765693665,
"rewards/margins": 2.4097442626953125,
"rewards/rejected": -2.1479711532592773,
"step": 158
},
{
"epoch": 0.3692781125785357,
"grad_norm": 53.41098297744381,
"learning_rate": 4.892442279354698e-07,
"logits/chosen": -1.2370442152023315,
"logits/rejected": -1.2537869215011597,
"logps/chosen": -52.96739959716797,
"logps/rejected": -71.32413482666016,
"loss": 0.3591,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.09003262966871262,
"rewards/margins": 1.8999587297439575,
"rewards/rejected": -1.989991545677185,
"step": 160
},
{
"epoch": 0.3692781125785357,
"eval_logits/chosen": -1.1776655912399292,
"eval_logits/rejected": -1.1867269277572632,
"eval_logps/chosen": -48.62409973144531,
"eval_logps/rejected": -59.5378303527832,
"eval_loss": 0.3585492968559265,
"eval_rewards/accuracies": 0.7908986210823059,
"eval_rewards/chosen": 0.0046647959388792515,
"eval_rewards/margins": 1.7270255088806152,
"eval_rewards/rejected": -1.7223609685897827,
"eval_runtime": 226.9778,
"eval_samples_per_second": 7.64,
"eval_steps_per_second": 1.912,
"step": 160
},
{
"epoch": 0.3738940889857674,
"grad_norm": 48.30891742529218,
"learning_rate": 4.886513576696673e-07,
"logits/chosen": -1.2570397853851318,
"logits/rejected": -1.2753015756607056,
"logps/chosen": -50.679901123046875,
"logps/rejected": -69.58779907226562,
"loss": 0.3731,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.10189250111579895,
"rewards/margins": 1.991517424583435,
"rewards/rejected": -2.093410015106201,
"step": 162
},
{
"epoch": 0.3785100653929991,
"grad_norm": 38.94107066608042,
"learning_rate": 4.880429618759543e-07,
"logits/chosen": -1.300181269645691,
"logits/rejected": -1.309295892715454,
"logps/chosen": -54.4401969909668,
"logps/rejected": -57.44432830810547,
"loss": 0.4199,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": -0.0659228190779686,
"rewards/margins": 1.5560578107833862,
"rewards/rejected": -1.6219807863235474,
"step": 164
},
{
"epoch": 0.3831260418002308,
"grad_norm": 42.007218912580726,
"learning_rate": 4.874190801336817e-07,
"logits/chosen": -1.2063199281692505,
"logits/rejected": -1.2187817096710205,
"logps/chosen": -52.59967041015625,
"logps/rejected": -62.30234909057617,
"loss": 0.3373,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.0022201803512871265,
"rewards/margins": 1.9247175455093384,
"rewards/rejected": -1.9269376993179321,
"step": 166
},
{
"epoch": 0.3877420182074625,
"grad_norm": 41.8692578527763,
"learning_rate": 4.867797530296431e-07,
"logits/chosen": -1.2532522678375244,
"logits/rejected": -1.2671799659729004,
"logps/chosen": -54.77018737792969,
"logps/rejected": -65.7540512084961,
"loss": 0.2993,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.03232141211628914,
"rewards/margins": 2.12778377532959,
"rewards/rejected": -2.1601054668426514,
"step": 168
},
{
"epoch": 0.39235799461469417,
"grad_norm": 34.82958012225073,
"learning_rate": 4.861250221554343e-07,
"logits/chosen": -1.208885669708252,
"logits/rejected": -1.2256441116333008,
"logps/chosen": -43.97615051269531,
"logps/rejected": -71.41213989257812,
"loss": 0.3062,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.07487620413303375,
"rewards/margins": 2.4894187450408936,
"rewards/rejected": -2.4145421981811523,
"step": 170
},
{
"epoch": 0.3969739710219259,
"grad_norm": 31.38457100645333,
"learning_rate": 4.854549301047476e-07,
"logits/chosen": -1.232684850692749,
"logits/rejected": -1.2298487424850464,
"logps/chosen": -51.118751525878906,
"logps/rejected": -52.634708404541016,
"loss": 0.3429,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.21625421941280365,
"rewards/margins": 1.5239174365997314,
"rewards/rejected": -1.3076633214950562,
"step": 172
},
{
"epoch": 0.4015899474291576,
"grad_norm": 51.50376706566093,
"learning_rate": 4.847695204706005e-07,
"logits/chosen": -1.26514732837677,
"logits/rejected": -1.266494631767273,
"logps/chosen": -46.0894889831543,
"logps/rejected": -49.9418830871582,
"loss": 0.3849,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.47246673703193665,
"rewards/margins": 1.3556917905807495,
"rewards/rejected": -0.88322514295578,
"step": 174
},
{
"epoch": 0.40620592383638926,
"grad_norm": 39.155946696884016,
"learning_rate": 4.840688378425e-07,
"logits/chosen": -1.1615080833435059,
"logits/rejected": -1.1706922054290771,
"logps/chosen": -54.5849609375,
"logps/rejected": -67.2327651977539,
"loss": 0.2596,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.5416942238807678,
"rewards/margins": 2.2614989280700684,
"rewards/rejected": -1.7198045253753662,
"step": 176
},
{
"epoch": 0.410821900243621,
"grad_norm": 41.671444907690415,
"learning_rate": 4.833529278035422e-07,
"logits/chosen": -1.30013108253479,
"logits/rejected": -1.3356281518936157,
"logps/chosen": -45.55073547363281,
"logps/rejected": -83.93589782714844,
"loss": 0.3114,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.6670578718185425,
"rewards/margins": 3.266463279724121,
"rewards/rejected": -2.599405527114868,
"step": 178
},
{
"epoch": 0.4154378766508527,
"grad_norm": 39.182516678430176,
"learning_rate": 4.826218369274459e-07,
"logits/chosen": -1.1979715824127197,
"logits/rejected": -1.2180922031402588,
"logps/chosen": -46.9954948425293,
"logps/rejected": -76.03369140625,
"loss": 0.3212,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.5715588927268982,
"rewards/margins": 2.937516212463379,
"rewards/rejected": -2.3659567832946777,
"step": 180
},
{
"epoch": 0.4154378766508527,
"eval_logits/chosen": -1.169126272201538,
"eval_logits/rejected": -1.1776955127716064,
"eval_logps/chosen": -46.824092864990234,
"eval_logps/rejected": -58.37122344970703,
"eval_loss": 0.33698517084121704,
"eval_rewards/accuracies": 0.7857142686843872,
"eval_rewards/chosen": 0.9046696424484253,
"eval_rewards/margins": 2.043729782104492,
"eval_rewards/rejected": -1.139060139656067,
"eval_runtime": 227.2193,
"eval_samples_per_second": 7.631,
"eval_steps_per_second": 1.91,
"step": 180
},
{
"epoch": 0.42005385305808435,
"grad_norm": 34.66268965474155,
"learning_rate": 4.818756127755237e-07,
"logits/chosen": -1.245609164237976,
"logits/rejected": -1.2478203773498535,
"logps/chosen": -44.18710708618164,
"logps/rejected": -48.15581130981445,
"loss": 0.2919,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 1.065934658050537,
"rewards/margins": 1.712632656097412,
"rewards/rejected": -0.6466982364654541,
"step": 182
},
{
"epoch": 0.42466982946531606,
"grad_norm": 48.14030342071319,
"learning_rate": 4.811143038935873e-07,
"logits/chosen": -1.1818798780441284,
"logits/rejected": -1.189144253730774,
"logps/chosen": -51.0610237121582,
"logps/rejected": -54.243797302246094,
"loss": 0.3902,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.165444254875183,
"rewards/margins": 1.9199776649475098,
"rewards/rejected": -0.7545332908630371,
"step": 184
},
{
"epoch": 0.4292858058725478,
"grad_norm": 45.03104875996758,
"learning_rate": 4.803379598087899e-07,
"logits/chosen": -1.2626315355300903,
"logits/rejected": -1.2647953033447266,
"logps/chosen": -47.61143493652344,
"logps/rejected": -46.91539764404297,
"loss": 0.3381,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.8671760559082031,
"rewards/margins": 1.6762652397155762,
"rewards/rejected": -0.8090891242027283,
"step": 186
},
{
"epoch": 0.43390178227977944,
"grad_norm": 63.031567731654626,
"learning_rate": 4.795466310264034e-07,
"logits/chosen": -1.18837308883667,
"logits/rejected": -1.2106761932373047,
"logps/chosen": -45.96682357788086,
"logps/rejected": -75.37088012695312,
"loss": 0.4522,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.41675278544425964,
"rewards/margins": 2.4167346954345703,
"rewards/rejected": -1.9999819993972778,
"step": 188
},
{
"epoch": 0.43851775868701115,
"grad_norm": 30.590120071737818,
"learning_rate": 4.787403690265335e-07,
"logits/chosen": -1.2696727514266968,
"logits/rejected": -1.2787107229232788,
"logps/chosen": -46.919532775878906,
"logps/rejected": -57.34629440307617,
"loss": 0.3259,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.6651458144187927,
"rewards/margins": 2.023615837097168,
"rewards/rejected": -1.358469843864441,
"step": 190
},
{
"epoch": 0.44313373509424286,
"grad_norm": 65.00881113396572,
"learning_rate": 4.779192262607702e-07,
"logits/chosen": -1.1799297332763672,
"logits/rejected": -1.1909632682800293,
"logps/chosen": -51.90432357788086,
"logps/rejected": -67.85346221923828,
"loss": 0.3613,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.8239190578460693,
"rewards/margins": 2.4376637935638428,
"rewards/rejected": -1.6137449741363525,
"step": 192
},
{
"epoch": 0.4477497115014745,
"grad_norm": 45.51586779707086,
"learning_rate": 4.770832561487758e-07,
"logits/chosen": -1.273619294166565,
"logits/rejected": -1.2734426259994507,
"logps/chosen": -51.00930404663086,
"logps/rejected": -50.82846450805664,
"loss": 0.2704,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.6313294172286987,
"rewards/margins": 1.9725594520568848,
"rewards/rejected": -1.3412299156188965,
"step": 194
},
{
"epoch": 0.45236568790870624,
"grad_norm": 59.66597715979004,
"learning_rate": 4.762325130748097e-07,
"logits/chosen": -1.1289526224136353,
"logits/rejected": -1.1318069696426392,
"logps/chosen": -56.16923522949219,
"logps/rejected": -53.92496109008789,
"loss": 0.3301,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.5848253965377808,
"rewards/margins": 1.7668564319610596,
"rewards/rejected": -1.1820309162139893,
"step": 196
},
{
"epoch": 0.45698166431593795,
"grad_norm": 27.39596009585691,
"learning_rate": 4.7536705238418995e-07,
"logits/chosen": -1.2408547401428223,
"logits/rejected": -1.2465531826019287,
"logps/chosen": -51.9420280456543,
"logps/rejected": -59.4866943359375,
"loss": 0.2815,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.4031212031841278,
"rewards/margins": 2.25166654586792,
"rewards/rejected": -1.8485453128814697,
"step": 198
},
{
"epoch": 0.4615976407231696,
"grad_norm": 55.89572206023644,
"learning_rate": 4.7448693037969336e-07,
"logits/chosen": -1.221846103668213,
"logits/rejected": -1.229733943939209,
"logps/chosen": -51.18299102783203,
"logps/rejected": -57.01997375488281,
"loss": 0.3319,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.33577215671539307,
"rewards/margins": 1.9459080696105957,
"rewards/rejected": -1.6101359128952026,
"step": 200
},
{
"epoch": 0.4615976407231696,
"eval_logits/chosen": -1.1536659002304077,
"eval_logits/rejected": -1.1624053716659546,
"eval_logps/chosen": -47.9106559753418,
"eval_logps/rejected": -59.707550048828125,
"eval_loss": 0.3186802566051483,
"eval_rewards/accuracies": 0.8012672662734985,
"eval_rewards/chosen": 0.3613872528076172,
"eval_rewards/margins": 2.168609857559204,
"eval_rewards/rejected": -1.807222604751587,
"eval_runtime": 227.3404,
"eval_samples_per_second": 7.627,
"eval_steps_per_second": 1.909,
"step": 200
},
{
"epoch": 0.4662136171304013,
"grad_norm": 47.53414859111433,
"learning_rate": 4.735922043178923e-07,
"logits/chosen": -1.130042552947998,
"logits/rejected": -1.1414666175842285,
"logps/chosen": -50.747615814208984,
"logps/rejected": -67.29204559326172,
"loss": 0.2861,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.3648765981197357,
"rewards/margins": 2.223259449005127,
"rewards/rejected": -1.8583827018737793,
"step": 202
},
{
"epoch": 0.47082959353763304,
"grad_norm": 45.15212642774012,
"learning_rate": 4.7268293240543017e-07,
"logits/chosen": -1.2075278759002686,
"logits/rejected": -1.2145761251449585,
"logps/chosen": -50.80036163330078,
"logps/rejected": -64.42232513427734,
"loss": 0.3686,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.3706679344177246,
"rewards/margins": 2.0169572830200195,
"rewards/rejected": -1.646289348602295,
"step": 204
},
{
"epoch": 0.4754455699448647,
"grad_norm": 51.8551798480754,
"learning_rate": 4.717591737952344e-07,
"logits/chosen": -1.225889801979065,
"logits/rejected": -1.2406290769577026,
"logps/chosen": -42.99085235595703,
"logps/rejected": -63.13933563232422,
"loss": 0.31,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.2732833921909332,
"rewards/margins": 2.319049835205078,
"rewards/rejected": -2.045766592025757,
"step": 206
},
{
"epoch": 0.4800615463520964,
"grad_norm": 45.48745944142938,
"learning_rate": 4.7082098858266837e-07,
"logits/chosen": -1.2216678857803345,
"logits/rejected": -1.2374674081802368,
"logps/chosen": -39.35933303833008,
"logps/rejected": -69.94754791259766,
"loss": 0.3897,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.15072119235992432,
"rewards/margins": 2.5561957359313965,
"rewards/rejected": -2.4054746627807617,
"step": 208
},
{
"epoch": 0.4846775227593281,
"grad_norm": 23.330292154413335,
"learning_rate": 4.698684378016222e-07,
"logits/chosen": -1.235012412071228,
"logits/rejected": -1.2410335540771484,
"logps/chosen": -51.16822052001953,
"logps/rejected": -67.03107452392578,
"loss": 0.2774,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5210573673248291,
"rewards/margins": 2.396049976348877,
"rewards/rejected": -1.8749926090240479,
"step": 210
},
{
"epoch": 0.48929349916655984,
"grad_norm": 44.38100018119066,
"learning_rate": 4.6890158342054174e-07,
"logits/chosen": -1.2579890489578247,
"logits/rejected": -1.2686585187911987,
"logps/chosen": -43.69232177734375,
"logps/rejected": -59.38545608520508,
"loss": 0.3209,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.6999004483222961,
"rewards/margins": 2.572566032409668,
"rewards/rejected": -1.8726658821105957,
"step": 212
},
{
"epoch": 0.4939094755737915,
"grad_norm": 48.05758145861009,
"learning_rate": 4.679204883383973e-07,
"logits/chosen": -1.291759729385376,
"logits/rejected": -1.311813235282898,
"logps/chosen": -42.26872253417969,
"logps/rejected": -75.81320190429688,
"loss": 0.2963,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.7384893894195557,
"rewards/margins": 3.459284782409668,
"rewards/rejected": -2.7207956314086914,
"step": 214
},
{
"epoch": 0.4985254519810232,
"grad_norm": 49.231820908366814,
"learning_rate": 4.669252163805919e-07,
"logits/chosen": -1.2944141626358032,
"logits/rejected": -1.3142738342285156,
"logps/chosen": -45.93452453613281,
"logps/rejected": -61.74488830566406,
"loss": 0.3873,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.7554634809494019,
"rewards/margins": 2.4260568618774414,
"rewards/rejected": -1.6705933809280396,
"step": 216
},
{
"epoch": 0.5031414283882549,
"grad_norm": 39.371230612811594,
"learning_rate": 4.65915832294809e-07,
"logits/chosen": -1.2367289066314697,
"logits/rejected": -1.2460105419158936,
"logps/chosen": -43.46434783935547,
"logps/rejected": -65.94113159179688,
"loss": 0.3123,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.8203165531158447,
"rewards/margins": 2.6564347743988037,
"rewards/rejected": -1.836118221282959,
"step": 218
},
{
"epoch": 0.5077574047954866,
"grad_norm": 36.504230004493856,
"learning_rate": 4.6489240174680026e-07,
"logits/chosen": -1.274338722229004,
"logits/rejected": -1.274038314819336,
"logps/chosen": -46.72291564941406,
"logps/rejected": -46.98860549926758,
"loss": 0.3653,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.7758186459541321,
"rewards/margins": 1.619003415107727,
"rewards/rejected": -0.8431846499443054,
"step": 220
},
{
"epoch": 0.5077574047954866,
"eval_logits/chosen": -1.1508095264434814,
"eval_logits/rejected": -1.158846139907837,
"eval_logps/chosen": -47.08699035644531,
"eval_logps/rejected": -59.3014030456543,
"eval_loss": 0.30850934982299805,
"eval_rewards/accuracies": 0.8029953837394714,
"eval_rewards/chosen": 0.7732176184654236,
"eval_rewards/margins": 2.377366065979004,
"eval_rewards/rejected": -1.6041483879089355,
"eval_runtime": 227.3591,
"eval_samples_per_second": 7.627,
"eval_steps_per_second": 1.909,
"step": 220
},
{
"epoch": 0.5123733812027182,
"grad_norm": 58.78605593626179,
"learning_rate": 4.638549913161138e-07,
"logits/chosen": -1.163451910018921,
"logits/rejected": -1.1646764278411865,
"logps/chosen": -52.2976188659668,
"logps/rejected": -53.40987777709961,
"loss": 0.2728,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.7431490421295166,
"rewards/margins": 2.2729909420013428,
"rewards/rejected": -1.5298418998718262,
"step": 222
},
{
"epoch": 0.51698935760995,
"grad_norm": 47.481692611666446,
"learning_rate": 4.6280366849176267e-07,
"logits/chosen": -1.1689667701721191,
"logits/rejected": -1.176836371421814,
"logps/chosen": -48.97719192504883,
"logps/rejected": -53.70207977294922,
"loss": 0.3088,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.4173290729522705,
"rewards/margins": 1.9205392599105835,
"rewards/rejected": -1.503210186958313,
"step": 224
},
{
"epoch": 0.5216053340171817,
"grad_norm": 26.23879949976686,
"learning_rate": 4.6173850166783446e-07,
"logits/chosen": -1.1005306243896484,
"logits/rejected": -1.1028097867965698,
"logps/chosen": -48.85009002685547,
"logps/rejected": -60.73183822631836,
"loss": 0.2688,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.3439117670059204,
"rewards/margins": 2.138643741607666,
"rewards/rejected": -1.7947319746017456,
"step": 226
},
{
"epoch": 0.5262213104244133,
"grad_norm": 42.68538113603284,
"learning_rate": 4.606595601390417e-07,
"logits/chosen": -1.1647553443908691,
"logits/rejected": -1.1835236549377441,
"logps/chosen": -47.58196258544922,
"logps/rejected": -73.24917602539062,
"loss": 0.2677,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.032037004828453064,
"rewards/margins": 2.9576616287231445,
"rewards/rejected": -2.9256248474121094,
"step": 228
},
{
"epoch": 0.5308372868316451,
"grad_norm": 47.39085343776366,
"learning_rate": 4.595669140962143e-07,
"logits/chosen": -1.2832393646240234,
"logits/rejected": -1.3168139457702637,
"logps/chosen": -42.34735870361328,
"logps/rejected": -91.76000213623047,
"loss": 0.3269,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.22074511647224426,
"rewards/margins": 3.9149208068847656,
"rewards/rejected": -4.1356658935546875,
"step": 230
},
{
"epoch": 0.5354532632388768,
"grad_norm": 34.48196631675489,
"learning_rate": 4.5846063462173284e-07,
"logits/chosen": -1.2154712677001953,
"logits/rejected": -1.2207145690917969,
"logps/chosen": -46.80333709716797,
"logps/rejected": -61.56758499145508,
"loss": 0.299,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.07934803515672684,
"rewards/margins": 2.2728195190429688,
"rewards/rejected": -2.1934714317321777,
"step": 232
},
{
"epoch": 0.5400692396461084,
"grad_norm": 53.84175279608908,
"learning_rate": 4.573407936849044e-07,
"logits/chosen": -1.235826015472412,
"logits/rejected": -1.2379093170166016,
"logps/chosen": -53.666229248046875,
"logps/rejected": -56.39564895629883,
"loss": 0.34,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.15499740839004517,
"rewards/margins": 2.093020439147949,
"rewards/rejected": -1.9380230903625488,
"step": 234
},
{
"epoch": 0.5446852160533402,
"grad_norm": 55.22566434624621,
"learning_rate": 4.5620746413728063e-07,
"logits/chosen": -1.168860912322998,
"logits/rejected": -1.167116641998291,
"logps/chosen": -59.95442199707031,
"logps/rejected": -55.52897644042969,
"loss": 0.2556,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.048602912575006485,
"rewards/margins": 2.1055731773376465,
"rewards/rejected": -2.0569701194763184,
"step": 236
},
{
"epoch": 0.5493011924605719,
"grad_norm": 40.26544195874405,
"learning_rate": 4.550607197079185e-07,
"logits/chosen": -1.1609958410263062,
"logits/rejected": -1.158744215965271,
"logps/chosen": -46.247127532958984,
"logps/rejected": -53.768096923828125,
"loss": 0.2776,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5583436489105225,
"rewards/margins": 1.9103565216064453,
"rewards/rejected": -1.3520128726959229,
"step": 238
},
{
"epoch": 0.5539171688678035,
"grad_norm": 22.296733757945297,
"learning_rate": 4.5390063499858353e-07,
"logits/chosen": -1.143466591835022,
"logits/rejected": -1.1510720252990723,
"logps/chosen": -56.236507415771484,
"logps/rejected": -72.3839111328125,
"loss": 0.2216,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.30002227425575256,
"rewards/margins": 2.8752281665802,
"rewards/rejected": -2.5752060413360596,
"step": 240
},
{
"epoch": 0.5539171688678035,
"eval_logits/chosen": -1.1655231714248657,
"eval_logits/rejected": -1.1714258193969727,
"eval_logps/chosen": -47.62031173706055,
"eval_logps/rejected": -60.05898666381836,
"eval_loss": 0.2956756353378296,
"eval_rewards/accuracies": 0.8093317747116089,
"eval_rewards/chosen": 0.5065575242042542,
"eval_rewards/margins": 2.4894962310791016,
"eval_rewards/rejected": -1.9829388856887817,
"eval_runtime": 227.4458,
"eval_samples_per_second": 7.624,
"eval_steps_per_second": 1.908,
"step": 240
},
{
"epoch": 0.5585331452750353,
"grad_norm": 25.00341489978113,
"learning_rate": 4.5272728547889687e-07,
"logits/chosen": -1.2413771152496338,
"logits/rejected": -1.2534655332565308,
"logps/chosen": -51.89939880371094,
"logps/rejected": -61.60024642944336,
"loss": 0.2157,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.444425106048584,
"rewards/margins": 3.039713144302368,
"rewards/rejected": -2.595287799835205,
"step": 242
},
{
"epoch": 0.5631491216822669,
"grad_norm": 59.288877946216886,
"learning_rate": 4.5154074748142535e-07,
"logits/chosen": -1.1979435682296753,
"logits/rejected": -1.200371265411377,
"logps/chosen": -51.88737106323242,
"logps/rejected": -62.96666717529297,
"loss": 0.3125,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.536859929561615,
"rewards/margins": 2.3762218952178955,
"rewards/rejected": -1.8393617868423462,
"step": 244
},
{
"epoch": 0.5677650980894986,
"grad_norm": 49.81197105903562,
"learning_rate": 4.503410981967158e-07,
"logits/chosen": -1.1927361488342285,
"logits/rejected": -1.2023940086364746,
"logps/chosen": -43.52791213989258,
"logps/rejected": -68.50804138183594,
"loss": 0.3784,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.9149570465087891,
"rewards/margins": 2.78161883354187,
"rewards/rejected": -1.8666616678237915,
"step": 246
},
{
"epoch": 0.5723810744967304,
"grad_norm": 36.729769896842384,
"learning_rate": 4.4912841566827333e-07,
"logits/chosen": -1.2108688354492188,
"logits/rejected": -1.2207088470458984,
"logps/chosen": -47.57191848754883,
"logps/rejected": -65.24411010742188,
"loss": 0.2484,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 1.3121000528335571,
"rewards/margins": 2.9495744705200195,
"rewards/rejected": -1.6374740600585938,
"step": 248
},
{
"epoch": 0.576997050903962,
"grad_norm": 52.94413715920982,
"learning_rate": 4.4790277878748415e-07,
"logits/chosen": -1.2431470155715942,
"logits/rejected": -1.2514811754226685,
"logps/chosen": -41.430206298828125,
"logps/rejected": -57.99823760986328,
"loss": 0.2988,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.8287627696990967,
"rewards/margins": 2.483140707015991,
"rewards/rejected": -1.6543784141540527,
"step": 250
},
{
"epoch": 0.5816130273111938,
"grad_norm": 35.54104658985613,
"learning_rate": 4.466642672884835e-07,
"logits/chosen": -1.1631712913513184,
"logits/rejected": -1.168829321861267,
"logps/chosen": -44.09839630126953,
"logps/rejected": -58.30562973022461,
"loss": 0.2578,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.6901536583900452,
"rewards/margins": 2.6523261070251465,
"rewards/rejected": -1.9621726274490356,
"step": 252
},
{
"epoch": 0.5862290037184255,
"grad_norm": 37.73217497274028,
"learning_rate": 4.454129617429682e-07,
"logits/chosen": -1.2659639120101929,
"logits/rejected": -1.26548433303833,
"logps/chosen": -48.81840515136719,
"logps/rejected": -50.18553161621094,
"loss": 0.303,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.6821924448013306,
"rewards/margins": 2.1100261211395264,
"rewards/rejected": -1.4278337955474854,
"step": 254
},
{
"epoch": 0.5908449801256571,
"grad_norm": 19.374735161669783,
"learning_rate": 4.441489435549551e-07,
"logits/chosen": -1.1399126052856445,
"logits/rejected": -1.1459710597991943,
"logps/chosen": -51.82578659057617,
"logps/rejected": -68.86441040039062,
"loss": 0.2382,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.7167325615882874,
"rewards/margins": 3.1428277492523193,
"rewards/rejected": -2.426095485687256,
"step": 256
},
{
"epoch": 0.5954609565328889,
"grad_norm": 41.763567555359415,
"learning_rate": 4.4287229495548573e-07,
"logits/chosen": -1.183684229850769,
"logits/rejected": -1.190173625946045,
"logps/chosen": -53.23274230957031,
"logps/rejected": -66.342041015625,
"loss": 0.2552,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.7042552828788757,
"rewards/margins": 3.324800729751587,
"rewards/rejected": -2.6205453872680664,
"step": 258
},
{
"epoch": 0.6000769329401205,
"grad_norm": 53.08605142258815,
"learning_rate": 4.415830989972761e-07,
"logits/chosen": -1.1716980934143066,
"logits/rejected": -1.1756532192230225,
"logps/chosen": -48.24959182739258,
"logps/rejected": -57.04353713989258,
"loss": 0.2986,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.6846582889556885,
"rewards/margins": 2.4726505279541016,
"rewards/rejected": -1.7879924774169922,
"step": 260
},
{
"epoch": 0.6000769329401205,
"eval_logits/chosen": -1.1839433908462524,
"eval_logits/rejected": -1.1875815391540527,
"eval_logps/chosen": -47.51332092285156,
"eval_logps/rejected": -60.25726318359375,
"eval_loss": 0.2889851927757263,
"eval_rewards/accuracies": 0.8139401078224182,
"eval_rewards/chosen": 0.5600550174713135,
"eval_rewards/margins": 2.6421334743499756,
"eval_rewards/rejected": -2.082078218460083,
"eval_runtime": 227.3437,
"eval_samples_per_second": 7.627,
"eval_steps_per_second": 1.909,
"step": 260
},
{
"epoch": 0.6046929093473522,
"grad_norm": 40.94616402060229,
"learning_rate": 4.402814395493142e-07,
"logits/chosen": -1.2763242721557617,
"logits/rejected": -1.2830497026443481,
"logps/chosen": -46.679168701171875,
"logps/rejected": -46.02785873413086,
"loss": 0.3389,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.5309950113296509,
"rewards/margins": 1.931261658668518,
"rewards/rejected": -1.4002668857574463,
"step": 262
},
{
"epoch": 0.609308885754584,
"grad_norm": 37.45563925423882,
"learning_rate": 4.3896740129140354e-07,
"logits/chosen": -1.2274925708770752,
"logits/rejected": -1.2251402139663696,
"logps/chosen": -49.84899139404297,
"logps/rejected": -48.70707702636719,
"loss": 0.2565,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.42680761218070984,
"rewards/margins": 2.4617788791656494,
"rewards/rejected": -2.0349714756011963,
"step": 264
},
{
"epoch": 0.6139248621618156,
"grad_norm": 39.43164361196429,
"learning_rate": 4.3764106970865456e-07,
"logits/chosen": -1.2649712562561035,
"logits/rejected": -1.2683297395706177,
"logps/chosen": -42.68367004394531,
"logps/rejected": -59.22146224975586,
"loss": 0.3223,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.032459404319524765,
"rewards/margins": 2.20072603225708,
"rewards/rejected": -2.168266534805298,
"step": 266
},
{
"epoch": 0.6185408385690473,
"grad_norm": 38.60665005525895,
"learning_rate": 4.3630253108592305e-07,
"logits/chosen": -1.1689542531967163,
"logits/rejected": -1.1707943677902222,
"logps/chosen": -55.54972839355469,
"logps/rejected": -63.90589141845703,
"loss": 0.2454,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.198521688580513,
"rewards/margins": 2.9068400859832764,
"rewards/rejected": -3.1053614616394043,
"step": 268
},
{
"epoch": 0.6231568149762791,
"grad_norm": 52.06719541190528,
"learning_rate": 4.3495187250219723e-07,
"logits/chosen": -1.2435555458068848,
"logits/rejected": -1.247399926185608,
"logps/chosen": -44.81391906738281,
"logps/rejected": -66.15491485595703,
"loss": 0.3261,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.1677350103855133,
"rewards/margins": 2.8860220909118652,
"rewards/rejected": -3.053757429122925,
"step": 270
},
{
"epoch": 0.6277727913835107,
"grad_norm": 34.05944822234142,
"learning_rate": 4.3358918182493253e-07,
"logits/chosen": -1.1470799446105957,
"logits/rejected": -1.146039366722107,
"logps/chosen": -48.27168273925781,
"logps/rejected": -55.25007629394531,
"loss": 0.2124,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.0416293665766716,
"rewards/margins": 2.1943235397338867,
"rewards/rejected": -2.1526942253112793,
"step": 272
},
{
"epoch": 0.6323887677907424,
"grad_norm": 32.55880144659725,
"learning_rate": 4.3221454770433554e-07,
"logits/chosen": -1.2215373516082764,
"logits/rejected": -1.223750352859497,
"logps/chosen": -53.73216247558594,
"logps/rejected": -58.15590286254883,
"loss": 0.217,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.4631071090698242,
"rewards/margins": 2.812567949295044,
"rewards/rejected": -2.3494608402252197,
"step": 274
},
{
"epoch": 0.6370047441979741,
"grad_norm": 37.29286642220413,
"learning_rate": 4.308280595675966e-07,
"logits/chosen": -1.2593313455581665,
"logits/rejected": -1.2601191997528076,
"logps/chosen": -51.24105453491211,
"logps/rejected": -58.06007766723633,
"loss": 0.305,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.41045814752578735,
"rewards/margins": 2.3772544860839844,
"rewards/rejected": -1.9667962789535522,
"step": 276
},
{
"epoch": 0.6416207206052058,
"grad_norm": 27.73832644868097,
"learning_rate": 4.2942980761307227e-07,
"logits/chosen": -1.2309229373931885,
"logits/rejected": -1.2335686683654785,
"logps/chosen": -48.84478759765625,
"logps/rejected": -58.93284225463867,
"loss": 0.2202,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.5729148387908936,
"rewards/margins": 2.5622036457061768,
"rewards/rejected": -1.9892889261245728,
"step": 278
},
{
"epoch": 0.6462366970124375,
"grad_norm": 49.846247584121116,
"learning_rate": 4.2801988280441765e-07,
"logits/chosen": -1.2041369676589966,
"logits/rejected": -1.2069140672683716,
"logps/chosen": -50.05432891845703,
"logps/rejected": -59.74306106567383,
"loss": 0.2356,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.4567064046859741,
"rewards/margins": 2.9142255783081055,
"rewards/rejected": -1.4575190544128418,
"step": 280
},
{
"epoch": 0.6462366970124375,
"eval_logits/chosen": -1.1684162616729736,
"eval_logits/rejected": -1.1727546453475952,
"eval_logps/chosen": -45.66713333129883,
"eval_logps/rejected": -58.96503829956055,
"eval_loss": 0.28808632493019104,
"eval_rewards/accuracies": 0.807603657245636,
"eval_rewards/chosen": 1.483147144317627,
"eval_rewards/margins": 2.9191133975982666,
"eval_rewards/rejected": -1.4359666109085083,
"eval_runtime": 227.2327,
"eval_samples_per_second": 7.631,
"eval_steps_per_second": 1.91,
"step": 280
},
{
"epoch": 0.6508526734196692,
"grad_norm": 37.200050161533866,
"learning_rate": 4.2659837686466813e-07,
"logits/chosen": -1.2258718013763428,
"logits/rejected": -1.226365327835083,
"logps/chosen": -45.65945816040039,
"logps/rejected": -56.360023498535156,
"loss": 0.2663,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 1.7170668840408325,
"rewards/margins": 2.946417808532715,
"rewards/rejected": -1.2293510437011719,
"step": 282
},
{
"epoch": 0.6554686498269009,
"grad_norm": 46.91080670819706,
"learning_rate": 4.25165382270273e-07,
"logits/chosen": -1.194913387298584,
"logits/rejected": -1.1949591636657715,
"logps/chosen": -41.45118713378906,
"logps/rejected": -49.03273391723633,
"loss": 0.2576,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 1.5383753776550293,
"rewards/margins": 2.4985907077789307,
"rewards/rejected": -0.9602153301239014,
"step": 284
},
{
"epoch": 0.6600846262341326,
"grad_norm": 38.28106252628991,
"learning_rate": 4.2372099224507875e-07,
"logits/chosen": -1.262522578239441,
"logits/rejected": -1.2750235795974731,
"logps/chosen": -38.96574401855469,
"logps/rejected": -67.78875732421875,
"loss": 0.2815,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 1.0857527256011963,
"rewards/margins": 3.374871253967285,
"rewards/rejected": -2.289118766784668,
"step": 286
},
{
"epoch": 0.6647006026413643,
"grad_norm": 48.61343601009988,
"learning_rate": 4.2226530075426503e-07,
"logits/chosen": -1.1528538465499878,
"logits/rejected": -1.1555874347686768,
"logps/chosen": -56.5745849609375,
"logps/rejected": -58.78268051147461,
"loss": 0.2806,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.9941625595092773,
"rewards/margins": 2.6789402961730957,
"rewards/rejected": -1.6847774982452393,
"step": 288
},
{
"epoch": 0.669316579048596,
"grad_norm": 41.895687140764245,
"learning_rate": 4.2079840249823106e-07,
"logits/chosen": -1.1860905885696411,
"logits/rejected": -1.1894373893737793,
"logps/chosen": -50.26545715332031,
"logps/rejected": -72.03532409667969,
"loss": 0.289,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.2916366159915924,
"rewards/margins": 3.1359870433807373,
"rewards/rejected": -2.8443503379821777,
"step": 290
},
{
"epoch": 0.6739325554558276,
"grad_norm": 41.288114692753076,
"learning_rate": 4.193203929064353e-07,
"logits/chosen": -1.2005698680877686,
"logits/rejected": -1.2083783149719238,
"logps/chosen": -51.12635803222656,
"logps/rejected": -71.51712036132812,
"loss": 0.345,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.0697176456451416,
"rewards/margins": 2.9638874530792236,
"rewards/rejected": -2.894169807434082,
"step": 292
},
{
"epoch": 0.6785485318630594,
"grad_norm": 42.3623221323418,
"learning_rate": 4.1783136813118705e-07,
"logits/chosen": -1.222592830657959,
"logits/rejected": -1.225614309310913,
"logps/chosen": -50.67860794067383,
"logps/rejected": -61.38881301879883,
"loss": 0.2915,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.12122215330600739,
"rewards/margins": 2.681208610534668,
"rewards/rejected": -2.8024303913116455,
"step": 294
},
{
"epoch": 0.6831645082702911,
"grad_norm": 22.982390981962695,
"learning_rate": 4.163314250413913e-07,
"logits/chosen": -1.1687074899673462,
"logits/rejected": -1.165205955505371,
"logps/chosen": -46.08445739746094,
"logps/rejected": -56.52994918823242,
"loss": 0.1833,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.2055283784866333,
"rewards/margins": 2.5608911514282227,
"rewards/rejected": -2.355362892150879,
"step": 296
},
{
"epoch": 0.6877804846775227,
"grad_norm": 37.84043310920863,
"learning_rate": 4.1482066121624716e-07,
"logits/chosen": -1.207397222518921,
"logits/rejected": -1.208192229270935,
"logps/chosen": -49.74457550048828,
"logps/rejected": -50.62785339355469,
"loss": 0.3247,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.17621606588363647,
"rewards/margins": 2.258789539337158,
"rewards/rejected": -2.082573413848877,
"step": 298
},
{
"epoch": 0.6923964610847545,
"grad_norm": 41.026889158159065,
"learning_rate": 4.1329917493889933e-07,
"logits/chosen": -1.3157416582107544,
"logits/rejected": -1.3190845251083374,
"logps/chosen": -46.27326965332031,
"logps/rejected": -61.60360336303711,
"loss": 0.2407,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.021905170753598213,
"rewards/margins": 2.830016613006592,
"rewards/rejected": -2.8081114292144775,
"step": 300
},
{
"epoch": 0.6923964610847545,
"eval_logits/chosen": -1.1676603555679321,
"eval_logits/rejected": -1.1713868379592896,
"eval_logps/chosen": -47.389183044433594,
"eval_logps/rejected": -60.791683197021484,
"eval_loss": 0.2743883430957794,
"eval_rewards/accuracies": 0.8133640289306641,
"eval_rewards/chosen": 0.6221204400062561,
"eval_rewards/margins": 2.9714088439941406,
"eval_rewards/rejected": -2.349287986755371,
"eval_runtime": 227.521,
"eval_samples_per_second": 7.621,
"eval_steps_per_second": 1.908,
"step": 300
},
{
"epoch": 0.6970124374919862,
"grad_norm": 42.14326526501706,
"learning_rate": 4.117670651900446e-07,
"logits/chosen": -1.2038425207138062,
"logits/rejected": -1.2065904140472412,
"logps/chosen": -50.9475212097168,
"logps/rejected": -56.376590728759766,
"loss": 0.3052,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.660882830619812,
"rewards/margins": 2.4505326747894287,
"rewards/rejected": -1.789649486541748,
"step": 302
},
{
"epoch": 0.7016284138992178,
"grad_norm": 59.036735448430065,
"learning_rate": 4.1022443164149237e-07,
"logits/chosen": -1.1808403730392456,
"logits/rejected": -1.1900469064712524,
"logps/chosen": -51.46991729736328,
"logps/rejected": -69.61980438232422,
"loss": 0.2946,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.5994511842727661,
"rewards/margins": 3.409363269805908,
"rewards/rejected": -2.8099122047424316,
"step": 304
},
{
"epoch": 0.7062443903064496,
"grad_norm": 29.330872231920097,
"learning_rate": 4.086713746496808e-07,
"logits/chosen": -1.2124900817871094,
"logits/rejected": -1.2128535509109497,
"logps/chosen": -45.35523223876953,
"logps/rejected": -56.160545349121094,
"loss": 0.3006,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.7718818187713623,
"rewards/margins": 2.726104259490967,
"rewards/rejected": -1.954222559928894,
"step": 306
},
{
"epoch": 0.7108603667136812,
"grad_norm": 34.74292770737631,
"learning_rate": 4.0710799524914805e-07,
"logits/chosen": -1.1383283138275146,
"logits/rejected": -1.1413955688476562,
"logps/chosen": -55.99458312988281,
"logps/rejected": -62.90273666381836,
"loss": 0.2295,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.6053013205528259,
"rewards/margins": 2.9940264225006104,
"rewards/rejected": -2.3887250423431396,
"step": 308
},
{
"epoch": 0.7154763431209129,
"grad_norm": 30.476243441323902,
"learning_rate": 4.055343951459592e-07,
"logits/chosen": -1.191731572151184,
"logits/rejected": -1.1988056898117065,
"logps/chosen": -44.046875,
"logps/rejected": -64.41764068603516,
"loss": 0.2429,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2551959455013275,
"rewards/margins": 3.2936155796051025,
"rewards/rejected": -3.038419485092163,
"step": 310
},
{
"epoch": 0.7200923195281447,
"grad_norm": 30.700879501780552,
"learning_rate": 4.0395067671108985e-07,
"logits/chosen": -1.2448992729187012,
"logits/rejected": -1.2440135478973389,
"logps/chosen": -41.98812484741211,
"logps/rejected": -49.34878921508789,
"loss": 0.2697,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.3203623592853546,
"rewards/margins": 2.6056876182556152,
"rewards/rejected": -2.285325050354004,
"step": 312
},
{
"epoch": 0.7247082959353763,
"grad_norm": 42.617870752566596,
"learning_rate": 4.0235694297376637e-07,
"logits/chosen": -1.139160394668579,
"logits/rejected": -1.1415181159973145,
"logps/chosen": -58.74495315551758,
"logps/rejected": -63.99246597290039,
"loss": 0.2614,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.32423093914985657,
"rewards/margins": 2.9529130458831787,
"rewards/rejected": -2.6286821365356445,
"step": 314
},
{
"epoch": 0.729324272342608,
"grad_norm": 40.78600953952273,
"learning_rate": 4.0075329761476347e-07,
"logits/chosen": -1.216194748878479,
"logits/rejected": -1.2174675464630127,
"logps/chosen": -50.62156677246094,
"logps/rejected": -53.62016296386719,
"loss": 0.2407,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.07387811690568924,
"rewards/margins": 2.2588860988616943,
"rewards/rejected": -2.332764148712158,
"step": 316
},
{
"epoch": 0.7339402487498398,
"grad_norm": 30.002748027873594,
"learning_rate": 3.991398449596588e-07,
"logits/chosen": -1.2065101861953735,
"logits/rejected": -1.211814045906067,
"logps/chosen": -53.182777404785156,
"logps/rejected": -66.31592559814453,
"loss": 0.209,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.0831962302327156,
"rewards/margins": 3.359541177749634,
"rewards/rejected": -3.2763442993164062,
"step": 318
},
{
"epoch": 0.7385562251570714,
"grad_norm": 23.665196371870486,
"learning_rate": 3.9751668997204647e-07,
"logits/chosen": -1.1500531435012817,
"logits/rejected": -1.154016375541687,
"logps/chosen": -52.478214263916016,
"logps/rejected": -61.09043884277344,
"loss": 0.199,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.16100816428661346,
"rewards/margins": 2.913641929626465,
"rewards/rejected": -2.752634048461914,
"step": 320
},
{
"epoch": 0.7385562251570714,
"eval_logits/chosen": -1.152096152305603,
"eval_logits/rejected": -1.1563034057617188,
"eval_logps/chosen": -48.12975311279297,
"eval_logps/rejected": -61.73065948486328,
"eval_loss": 0.2689039707183838,
"eval_rewards/accuracies": 0.8185483813285828,
"eval_rewards/chosen": 0.2518383860588074,
"eval_rewards/margins": 3.0706160068511963,
"eval_rewards/rejected": -2.818777322769165,
"eval_runtime": 227.7908,
"eval_samples_per_second": 7.612,
"eval_steps_per_second": 1.905,
"step": 320
},
{
"epoch": 0.7431722015643031,
"grad_norm": 48.30684834494479,
"learning_rate": 3.958839382467084e-07,
"logits/chosen": -1.222053050994873,
"logits/rejected": -1.2289328575134277,
"logps/chosen": -43.89524459838867,
"logps/rejected": -56.49114990234375,
"loss": 0.3092,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.43327876925468445,
"rewards/margins": 2.904792308807373,
"rewards/rejected": -2.471513032913208,
"step": 322
},
{
"epoch": 0.7477881779715349,
"grad_norm": 42.66239436350315,
"learning_rate": 3.9424169600274494e-07,
"logits/chosen": -1.2450088262557983,
"logits/rejected": -1.24375581741333,
"logps/chosen": -50.182308197021484,
"logps/rejected": -56.25563049316406,
"loss": 0.2956,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.13703547418117523,
"rewards/margins": 2.3541619777679443,
"rewards/rejected": -2.4911975860595703,
"step": 324
},
{
"epoch": 0.7524041543787665,
"grad_norm": 27.173798633882665,
"learning_rate": 3.9259007007666436e-07,
"logits/chosen": -1.212989330291748,
"logits/rejected": -1.2185275554656982,
"logps/chosen": -51.97760009765625,
"logps/rejected": -63.76836395263672,
"loss": 0.282,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.34300458431243896,
"rewards/margins": 2.9810492992401123,
"rewards/rejected": -2.638044595718384,
"step": 326
},
{
"epoch": 0.7570201307859982,
"grad_norm": 41.510696079488966,
"learning_rate": 3.909291679154332e-07,
"logits/chosen": -1.2237902879714966,
"logits/rejected": -1.232872486114502,
"logps/chosen": -48.094669342041016,
"logps/rejected": -73.24827575683594,
"loss": 0.3194,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.10229718685150146,
"rewards/margins": 3.730062246322632,
"rewards/rejected": -3.832359790802002,
"step": 328
},
{
"epoch": 0.7616361071932299,
"grad_norm": 36.28616141840599,
"learning_rate": 3.892590975694858e-07,
"logits/chosen": -1.2199351787567139,
"logits/rejected": -1.2302178144454956,
"logps/chosen": -44.97592544555664,
"logps/rejected": -71.27304077148438,
"loss": 0.23,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.4918098449707031,
"rewards/margins": 4.322786331176758,
"rewards/rejected": -3.830976724624634,
"step": 330
},
{
"epoch": 0.7662520836004616,
"grad_norm": 23.858921961912298,
"learning_rate": 3.875799676856952e-07,
"logits/chosen": -1.1334155797958374,
"logits/rejected": -1.1350713968276978,
"logps/chosen": -50.02524948120117,
"logps/rejected": -62.23411560058594,
"loss": 0.2071,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.42125022411346436,
"rewards/margins": 3.270005226135254,
"rewards/rejected": -2.8487555980682373,
"step": 332
},
{
"epoch": 0.7708680600076933,
"grad_norm": 46.04858312353762,
"learning_rate": 3.858918875003053e-07,
"logits/chosen": -1.2362879514694214,
"logits/rejected": -1.2460957765579224,
"logps/chosen": -48.63706970214844,
"logps/rejected": -72.94087219238281,
"loss": 0.2514,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.5030243396759033,
"rewards/margins": 4.145318031311035,
"rewards/rejected": -3.64229416847229,
"step": 334
},
{
"epoch": 0.775484036414925,
"grad_norm": 21.37867116532646,
"learning_rate": 3.8419496683182396e-07,
"logits/chosen": -1.1005958318710327,
"logits/rejected": -1.1029105186462402,
"logps/chosen": -46.87886047363281,
"logps/rejected": -63.63945007324219,
"loss": 0.1861,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.5341323018074036,
"rewards/margins": 3.1560137271881104,
"rewards/rejected": -2.6218814849853516,
"step": 336
},
{
"epoch": 0.7801000128221567,
"grad_norm": 34.41230269099275,
"learning_rate": 3.824893160738792e-07,
"logits/chosen": -1.1643880605697632,
"logits/rejected": -1.1722698211669922,
"logps/chosen": -47.30472946166992,
"logps/rejected": -66.55563354492188,
"loss": 0.2727,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.9107217788696289,
"rewards/margins": 3.5939722061157227,
"rewards/rejected": -2.683250904083252,
"step": 338
},
{
"epoch": 0.7847159892293883,
"grad_norm": 20.704541622004893,
"learning_rate": 3.8077504618803737e-07,
"logits/chosen": -1.1662912368774414,
"logits/rejected": -1.1619421243667603,
"logps/chosen": -56.10293197631836,
"logps/rejected": -53.84746170043945,
"loss": 0.2129,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.9300886988639832,
"rewards/margins": 2.765589475631714,
"rewards/rejected": -1.8355004787445068,
"step": 340
},
{
"epoch": 0.7847159892293883,
"eval_logits/chosen": -1.1408087015151978,
"eval_logits/rejected": -1.1458781957626343,
"eval_logps/chosen": -47.36520004272461,
"eval_logps/rejected": -61.242679595947266,
"eval_loss": 0.26537051796913147,
"eval_rewards/accuracies": 0.8185483813285828,
"eval_rewards/chosen": 0.6341149806976318,
"eval_rewards/margins": 3.208899736404419,
"eval_rewards/rejected": -2.574784755706787,
"eval_runtime": 227.4228,
"eval_samples_per_second": 7.625,
"eval_steps_per_second": 1.908,
"step": 340
},
{
"epoch": 0.7893319656366201,
"grad_norm": 29.91569671776041,
"learning_rate": 3.7905226869658446e-07,
"logits/chosen": -1.1567282676696777,
"logits/rejected": -1.1574435234069824,
"logps/chosen": -50.19194793701172,
"logps/rejected": -61.67422866821289,
"loss": 0.2453,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.956231415271759,
"rewards/margins": 3.3507864475250244,
"rewards/rejected": -2.394554853439331,
"step": 342
},
{
"epoch": 0.7939479420438518,
"grad_norm": 35.82510237615759,
"learning_rate": 3.773210956752709e-07,
"logits/chosen": -1.1932220458984375,
"logits/rejected": -1.1889605522155762,
"logps/chosen": -46.48088073730469,
"logps/rejected": -49.8135986328125,
"loss": 0.2891,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.5073845982551575,
"rewards/margins": 2.379626989364624,
"rewards/rejected": -1.8722424507141113,
"step": 344
},
{
"epoch": 0.7985639184510834,
"grad_norm": 28.47228811073422,
"learning_rate": 3.7558163974602093e-07,
"logits/chosen": -1.1920644044876099,
"logits/rejected": -1.201475977897644,
"logps/chosen": -43.6226806640625,
"logps/rejected": -64.39244842529297,
"loss": 0.2762,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.10022352635860443,
"rewards/margins": 3.255174398422241,
"rewards/rejected": -3.1549510955810547,
"step": 346
},
{
"epoch": 0.8031798948583152,
"grad_norm": 41.618837869458865,
"learning_rate": 3.73834014069605e-07,
"logits/chosen": -1.1160246133804321,
"logits/rejected": -1.1237598657608032,
"logps/chosen": -55.9088249206543,
"logps/rejected": -71.82222747802734,
"loss": 0.236,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.08166421949863434,
"rewards/margins": 3.3770830631256104,
"rewards/rejected": -3.2954187393188477,
"step": 348
},
{
"epoch": 0.8077958712655469,
"grad_norm": 34.292113974031,
"learning_rate": 3.7207833233827914e-07,
"logits/chosen": -1.221280813217163,
"logits/rejected": -1.228824257850647,
"logps/chosen": -51.23641586303711,
"logps/rejected": -67.17003631591797,
"loss": 0.2952,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.29954856634140015,
"rewards/margins": 3.7896947860717773,
"rewards/rejected": -4.089242935180664,
"step": 350
},
{
"epoch": 0.8124118476727785,
"grad_norm": 27.12194701535223,
"learning_rate": 3.7031470876838786e-07,
"logits/chosen": -1.1533750295639038,
"logits/rejected": -1.1603755950927734,
"logps/chosen": -49.57859802246094,
"logps/rejected": -74.35897064208984,
"loss": 0.2616,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.34589090943336487,
"rewards/margins": 3.77805495262146,
"rewards/rejected": -4.123946189880371,
"step": 352
},
{
"epoch": 0.8170278240800103,
"grad_norm": 33.865318639168144,
"learning_rate": 3.6854325809293455e-07,
"logits/chosen": -1.2225959300994873,
"logits/rejected": -1.2331852912902832,
"logps/chosen": -42.565975189208984,
"logps/rejected": -74.49568939208984,
"loss": 0.2349,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.32742711901664734,
"rewards/margins": 4.166980743408203,
"rewards/rejected": -4.494408130645752,
"step": 354
},
{
"epoch": 0.821643800487242,
"grad_norm": 44.72468218463322,
"learning_rate": 3.6676409555411653e-07,
"logits/chosen": -1.1373627185821533,
"logits/rejected": -1.1474027633666992,
"logps/chosen": -51.0811882019043,
"logps/rejected": -69.32566833496094,
"loss": 0.2441,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.21452119946479797,
"rewards/margins": 3.681729555130005,
"rewards/rejected": -3.8962512016296387,
"step": 356
},
{
"epoch": 0.8262597768944736,
"grad_norm": 18.3343181780076,
"learning_rate": 3.6497733689582866e-07,
"logits/chosen": -1.1717758178710938,
"logits/rejected": -1.1715316772460938,
"logps/chosen": -45.530574798583984,
"logps/rejected": -56.84959030151367,
"loss": 0.1942,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.07019754499197006,
"rewards/margins": 3.02067232131958,
"rewards/rejected": -2.950474739074707,
"step": 358
},
{
"epoch": 0.8308757533017054,
"grad_norm": 35.844182387455,
"learning_rate": 3.631830983561335e-07,
"logits/chosen": -1.1136425733566284,
"logits/rejected": -1.1134952306747437,
"logps/chosen": -54.07844924926758,
"logps/rejected": -60.01144027709961,
"loss": 0.2174,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.009207261726260185,
"rewards/margins": 3.035071611404419,
"rewards/rejected": -3.025864601135254,
"step": 360
},
{
"epoch": 0.8308757533017054,
"eval_logits/chosen": -1.137829303741455,
"eval_logits/rejected": -1.142533779144287,
"eval_logps/chosen": -49.13446807861328,
"eval_logps/rejected": -63.1205940246582,
"eval_loss": 0.2611147463321686,
"eval_rewards/accuracies": 0.8231566548347473,
"eval_rewards/chosen": -0.25051993131637573,
"eval_rewards/margins": 3.2632253170013428,
"eval_rewards/rejected": -3.5137455463409424,
"eval_runtime": 227.4188,
"eval_samples_per_second": 7.625,
"eval_steps_per_second": 1.908,
"step": 360
},
{
"epoch": 0.835491729708937,
"grad_norm": 36.48149555986552,
"learning_rate": 3.613814966596991e-07,
"logits/chosen": -1.1987216472625732,
"logits/rejected": -1.2045807838439941,
"logps/chosen": -49.24374008178711,
"logps/rejected": -66.37528228759766,
"loss": 0.2604,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.2955743074417114,
"rewards/margins": 3.364227533340454,
"rewards/rejected": -3.659801959991455,
"step": 362
},
{
"epoch": 0.8401077061161687,
"grad_norm": 17.123248215692428,
"learning_rate": 3.595726490102059e-07,
"logits/chosen": -1.1486543416976929,
"logits/rejected": -1.155872106552124,
"logps/chosen": -47.52320098876953,
"logps/rejected": -72.59225463867188,
"loss": 0.1309,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.2556496858596802,
"rewards/margins": 3.9981601238250732,
"rewards/rejected": -4.253809928894043,
"step": 364
},
{
"epoch": 0.8447236825234005,
"grad_norm": 22.608323262045403,
"learning_rate": 3.577566730827214e-07,
"logits/chosen": -1.1728885173797607,
"logits/rejected": -1.1819621324539185,
"logps/chosen": -47.8609504699707,
"logps/rejected": -66.17147064208984,
"loss": 0.2715,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.29853469133377075,
"rewards/margins": 3.493640661239624,
"rewards/rejected": -3.792175769805908,
"step": 366
},
{
"epoch": 0.8493396589306321,
"grad_norm": 37.233167771755866,
"learning_rate": 3.559336870160453e-07,
"logits/chosen": -1.1891751289367676,
"logits/rejected": -1.193422794342041,
"logps/chosen": -43.88676834106445,
"logps/rejected": -61.30946350097656,
"loss": 0.2259,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.18390725553035736,
"rewards/margins": 3.307628631591797,
"rewards/rejected": -3.4915361404418945,
"step": 368
},
{
"epoch": 0.8539556353378638,
"grad_norm": 26.22453096528665,
"learning_rate": 3.541038094050241e-07,
"logits/chosen": -1.155517339706421,
"logits/rejected": -1.1603643894195557,
"logps/chosen": -52.52191162109375,
"logps/rejected": -73.11701202392578,
"loss": 0.1787,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.36556607484817505,
"rewards/margins": 4.345615863800049,
"rewards/rejected": -4.711181163787842,
"step": 370
},
{
"epoch": 0.8585716117450956,
"grad_norm": 39.171509700373235,
"learning_rate": 3.52267159292835e-07,
"logits/chosen": -1.1714898347854614,
"logits/rejected": -1.1753745079040527,
"logps/chosen": -52.40201950073242,
"logps/rejected": -71.5418701171875,
"loss": 0.2399,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.4184909164905548,
"rewards/margins": 3.973634958267212,
"rewards/rejected": -4.392125606536865,
"step": 372
},
{
"epoch": 0.8631875881523272,
"grad_norm": 23.054835061356226,
"learning_rate": 3.5042385616324236e-07,
"logits/chosen": -1.3357490301132202,
"logits/rejected": -1.345274806022644,
"logps/chosen": -41.36846923828125,
"logps/rejected": -68.94818115234375,
"loss": 0.2237,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.5928651094436646,
"rewards/margins": 4.040990352630615,
"rewards/rejected": -4.63385534286499,
"step": 374
},
{
"epoch": 0.8678035645595589,
"grad_norm": 23.34114570013692,
"learning_rate": 3.485740199328244e-07,
"logits/chosen": -1.1034616231918335,
"logits/rejected": -1.1045833826065063,
"logps/chosen": -55.426727294921875,
"logps/rejected": -59.57293701171875,
"loss": 0.1942,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.04073745757341385,
"rewards/margins": 3.197631597518921,
"rewards/rejected": -3.2383692264556885,
"step": 376
},
{
"epoch": 0.8724195409667906,
"grad_norm": 19.26131705604103,
"learning_rate": 3.4671777094317196e-07,
"logits/chosen": -1.1123476028442383,
"logits/rejected": -1.1115697622299194,
"logps/chosen": -54.29782485961914,
"logps/rejected": -60.28561782836914,
"loss": 0.1845,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.4158029556274414,
"rewards/margins": 3.2452781200408936,
"rewards/rejected": -3.661081075668335,
"step": 378
},
{
"epoch": 0.8770355173740223,
"grad_norm": 40.47170182440331,
"learning_rate": 3.448552299530595e-07,
"logits/chosen": -1.1856770515441895,
"logits/rejected": -1.188659429550171,
"logps/chosen": -47.90871047973633,
"logps/rejected": -58.21343231201172,
"loss": 0.2866,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.3236946761608124,
"rewards/margins": 3.1452713012695312,
"rewards/rejected": -3.468966245651245,
"step": 380
},
{
"epoch": 0.8770355173740223,
"eval_logits/chosen": -1.1315786838531494,
"eval_logits/rejected": -1.1364408731460571,
"eval_logps/chosen": -48.8283576965332,
"eval_logps/rejected": -63.289493560791016,
"eval_loss": 0.2587234079837799,
"eval_rewards/accuracies": 0.820852518081665,
"eval_rewards/chosen": -0.09746361523866653,
"eval_rewards/margins": 3.500731945037842,
"eval_rewards/rejected": -3.5981955528259277,
"eval_runtime": 227.2598,
"eval_samples_per_second": 7.63,
"eval_steps_per_second": 1.91,
"step": 380
},
{
"epoch": 0.881651493781254,
"grad_norm": 50.06622297059798,
"learning_rate": 3.429865181305894e-07,
"logits/chosen": -1.1800260543823242,
"logits/rejected": -1.1820400953292847,
"logps/chosen": -53.985992431640625,
"logps/rejected": -63.98917770385742,
"loss": 0.3147,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.11410641670227051,
"rewards/margins": 3.317570447921753,
"rewards/rejected": -3.4316766262054443,
"step": 382
},
{
"epoch": 0.8862674701884857,
"grad_norm": 50.74505917044877,
"learning_rate": 3.411117570453091e-07,
"logits/chosen": -1.1595518589019775,
"logits/rejected": -1.1614227294921875,
"logps/chosen": -48.32596969604492,
"logps/rejected": -61.07521057128906,
"loss": 0.2287,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.054149970412254333,
"rewards/margins": 3.22495174407959,
"rewards/rejected": -3.279102087020874,
"step": 384
},
{
"epoch": 0.8908834465957174,
"grad_norm": 33.82726208082807,
"learning_rate": 3.392310686603025e-07,
"logits/chosen": -1.2322266101837158,
"logits/rejected": -1.235073447227478,
"logps/chosen": -48.9878044128418,
"logps/rejected": -57.65345001220703,
"loss": 0.3178,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": -0.5888361930847168,
"rewards/margins": 2.6325643062591553,
"rewards/rejected": -3.221400260925293,
"step": 386
},
{
"epoch": 0.895499423002949,
"grad_norm": 31.968246601846598,
"learning_rate": 3.3734457532425554e-07,
"logits/chosen": -1.14549720287323,
"logits/rejected": -1.1533808708190918,
"logps/chosen": -47.60829544067383,
"logps/rejected": -65.90504455566406,
"loss": 0.2893,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.4989345967769623,
"rewards/margins": 3.784000873565674,
"rewards/rejected": -4.28293514251709,
"step": 388
},
{
"epoch": 0.9001153994101808,
"grad_norm": 38.79859580208747,
"learning_rate": 3.354523997634969e-07,
"logits/chosen": -1.1520222425460815,
"logits/rejected": -1.159006953239441,
"logps/chosen": -51.533203125,
"logps/rejected": -68.82389068603516,
"loss": 0.2439,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.5372670292854309,
"rewards/margins": 3.8664119243621826,
"rewards/rejected": -4.403678894042969,
"step": 390
},
{
"epoch": 0.9047313758174125,
"grad_norm": 48.99149097647652,
"learning_rate": 3.3355466507401374e-07,
"logits/chosen": -1.1964970827102661,
"logits/rejected": -1.195936679840088,
"logps/chosen": -49.486724853515625,
"logps/rejected": -51.76087951660156,
"loss": 0.3225,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.6623955965042114,
"rewards/margins": 2.538459539413452,
"rewards/rejected": -3.200855255126953,
"step": 392
},
{
"epoch": 0.9093473522246441,
"grad_norm": 34.31601060375262,
"learning_rate": 3.3165149471344394e-07,
"logits/chosen": -1.1766917705535889,
"logits/rejected": -1.1769944429397583,
"logps/chosen": -50.33845901489258,
"logps/rejected": -60.9412841796875,
"loss": 0.2826,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.7761159539222717,
"rewards/margins": 2.734633445739746,
"rewards/rejected": -3.510749340057373,
"step": 394
},
{
"epoch": 0.9139633286318759,
"grad_norm": 44.03751755180035,
"learning_rate": 3.297430124930444e-07,
"logits/chosen": -1.0980035066604614,
"logits/rejected": -1.098144292831421,
"logps/chosen": -56.62092590332031,
"logps/rejected": -61.69611358642578,
"loss": 0.3528,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.423416405916214,
"rewards/margins": 2.6373236179351807,
"rewards/rejected": -3.0607402324676514,
"step": 396
},
{
"epoch": 0.9185793050391076,
"grad_norm": 20.653109576582093,
"learning_rate": 3.2782934256963647e-07,
"logits/chosen": -1.1482434272766113,
"logits/rejected": -1.1585140228271484,
"logps/chosen": -52.95622253417969,
"logps/rejected": -71.6562728881836,
"loss": 0.2816,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.51191645860672,
"rewards/margins": 3.753627300262451,
"rewards/rejected": -4.265543460845947,
"step": 398
},
{
"epoch": 0.9231952814463392,
"grad_norm": 33.58507227677532,
"learning_rate": 3.259106094375289e-07,
"logits/chosen": -1.1832419633865356,
"logits/rejected": -1.1917306184768677,
"logps/chosen": -46.096675872802734,
"logps/rejected": -71.52079010009766,
"loss": 0.2501,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.2719300389289856,
"rewards/margins": 4.045370101928711,
"rewards/rejected": -4.317299842834473,
"step": 400
},
{
"epoch": 0.9231952814463392,
"eval_logits/chosen": -1.1317284107208252,
"eval_logits/rejected": -1.1366225481033325,
"eval_logps/chosen": -49.24271011352539,
"eval_logps/rejected": -63.64417266845703,
"eval_loss": 0.2558155655860901,
"eval_rewards/accuracies": 0.820852518081665,
"eval_rewards/chosen": -0.30464252829551697,
"eval_rewards/margins": 3.470890522003174,
"eval_rewards/rejected": -3.775533437728882,
"eval_runtime": 227.3955,
"eval_samples_per_second": 7.625,
"eval_steps_per_second": 1.909,
"step": 400
},
{
"epoch": 0.927811257853571,
"grad_norm": 40.05600195486227,
"learning_rate": 3.239869379204189e-07,
"logits/chosen": -1.165150761604309,
"logits/rejected": -1.1658389568328857,
"logps/chosen": -51.360679626464844,
"logps/rejected": -65.45414733886719,
"loss": 0.2124,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.36249446868896484,
"rewards/margins": 3.780195951461792,
"rewards/rejected": -4.142690658569336,
"step": 402
},
{
"epoch": 0.9324272342608027,
"grad_norm": 39.216765683029095,
"learning_rate": 3.2205845316327144e-07,
"logits/chosen": -1.183584213256836,
"logits/rejected": -1.1847604513168335,
"logps/chosen": -40.19718933105469,
"logps/rejected": -51.98136901855469,
"loss": 0.3544,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": -0.23264381289482117,
"rewards/margins": 2.18786883354187,
"rewards/rejected": -2.4205124378204346,
"step": 404
},
{
"epoch": 0.9370432106680343,
"grad_norm": 27.136867213884347,
"learning_rate": 3.2012528062417845e-07,
"logits/chosen": -1.1893184185028076,
"logits/rejected": -1.186366081237793,
"logps/chosen": -48.742347717285156,
"logps/rejected": -53.711280822753906,
"loss": 0.2583,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.4608592987060547,
"rewards/margins": 2.4421236515045166,
"rewards/rejected": -2.9029834270477295,
"step": 406
},
{
"epoch": 0.9416591870752661,
"grad_norm": 28.572200442333273,
"learning_rate": 3.1818754606619643e-07,
"logits/chosen": -1.146033763885498,
"logits/rejected": -1.154913306236267,
"logps/chosen": -43.58420944213867,
"logps/rejected": -65.13819885253906,
"loss": 0.3209,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.25047793984413147,
"rewards/margins": 4.219507217407227,
"rewards/rejected": -3.969028949737549,
"step": 408
},
{
"epoch": 0.9462751634824977,
"grad_norm": 22.48708730265553,
"learning_rate": 3.162453755491655e-07,
"logits/chosen": -1.2108149528503418,
"logits/rejected": -1.2209829092025757,
"logps/chosen": -45.49837112426758,
"logps/rejected": -68.48489379882812,
"loss": 0.1921,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.18967992067337036,
"rewards/margins": 3.9147284030914307,
"rewards/rejected": -3.725048542022705,
"step": 410
},
{
"epoch": 0.9508911398897294,
"grad_norm": 37.73589723314555,
"learning_rate": 3.142988954215079e-07,
"logits/chosen": -1.1515933275222778,
"logits/rejected": -1.1659475564956665,
"logps/chosen": -48.16081619262695,
"logps/rejected": -75.52259063720703,
"loss": 0.2767,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.4500226378440857,
"rewards/margins": 3.918086051940918,
"rewards/rejected": -3.4680633544921875,
"step": 412
},
{
"epoch": 0.9555071162969612,
"grad_norm": 50.447595273319266,
"learning_rate": 3.1234823231200925e-07,
"logits/chosen": -1.1608054637908936,
"logits/rejected": -1.1741983890533447,
"logps/chosen": -46.12180709838867,
"logps/rejected": -76.99250030517578,
"loss": 0.2659,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.0659295991063118,
"rewards/margins": 4.484518527984619,
"rewards/rejected": -4.4185895919799805,
"step": 414
},
{
"epoch": 0.9601230927041928,
"grad_norm": 34.31172314274907,
"learning_rate": 3.1039351312157993e-07,
"logits/chosen": -1.1714129447937012,
"logits/rejected": -1.1802603006362915,
"logps/chosen": -47.3844108581543,
"logps/rejected": -66.79096221923828,
"loss": 0.2247,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.06639357656240463,
"rewards/margins": 3.8744897842407227,
"rewards/rejected": -3.8080966472625732,
"step": 416
},
{
"epoch": 0.9647390691114246,
"grad_norm": 59.97310545853326,
"learning_rate": 3.0843486501499967e-07,
"logits/chosen": -1.1815389394760132,
"logits/rejected": -1.1873387098312378,
"logps/chosen": -49.0379638671875,
"logps/rejected": -60.66644287109375,
"loss": 0.375,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.37162768840789795,
"rewards/margins": 2.9354913234710693,
"rewards/rejected": -2.563863515853882,
"step": 418
},
{
"epoch": 0.9693550455186563,
"grad_norm": 28.24836123391339,
"learning_rate": 3.064724154126449e-07,
"logits/chosen": -1.1865981817245483,
"logits/rejected": -1.186213731765747,
"logps/chosen": -49.98203659057617,
"logps/rejected": -52.87804412841797,
"loss": 0.231,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.10280448943376541,
"rewards/margins": 2.791215181350708,
"rewards/rejected": -2.688410758972168,
"step": 420
},
{
"epoch": 0.9693550455186563,
"eval_logits/chosen": -1.1221855878829956,
"eval_logits/rejected": -1.1280066967010498,
"eval_logps/chosen": -48.28245544433594,
"eval_logps/rejected": -62.80416488647461,
"eval_loss": 0.252390593290329,
"eval_rewards/accuracies": 0.820852518081665,
"eval_rewards/chosen": 0.17548592388629913,
"eval_rewards/margins": 3.531018018722534,
"eval_rewards/rejected": -3.35553240776062,
"eval_runtime": 227.4345,
"eval_samples_per_second": 7.624,
"eval_steps_per_second": 1.908,
"step": 420
},
{
"epoch": 0.9739710219258879,
"grad_norm": 34.078391098200555,
"learning_rate": 3.045062919821995e-07,
"logits/chosen": -1.1267274618148804,
"logits/rejected": -1.139290452003479,
"logps/chosen": -46.573524475097656,
"logps/rejected": -72.93567657470703,
"loss": 0.2995,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.05330763757228851,
"rewards/margins": 4.125481605529785,
"rewards/rejected": -4.072174072265625,
"step": 422
},
{
"epoch": 0.9785869983331197,
"grad_norm": 40.16987886487936,
"learning_rate": 3.0253662263034925e-07,
"logits/chosen": -1.1718653440475464,
"logits/rejected": -1.1762607097625732,
"logps/chosen": -51.13752746582031,
"logps/rejected": -70.25437927246094,
"loss": 0.2582,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.11435369402170181,
"rewards/margins": 3.699098825454712,
"rewards/rejected": -3.8134524822235107,
"step": 424
},
{
"epoch": 0.9832029747403513,
"grad_norm": 32.13301323909956,
"learning_rate": 3.005635354944606e-07,
"logits/chosen": -1.1121575832366943,
"logits/rejected": -1.113258957862854,
"logps/chosen": -53.563053131103516,
"logps/rejected": -52.64200210571289,
"loss": 0.2696,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.11980216950178146,
"rewards/margins": 2.639040231704712,
"rewards/rejected": -2.7588419914245605,
"step": 426
},
{
"epoch": 0.987818951147583,
"grad_norm": 37.16965622890279,
"learning_rate": 2.9858715893424504e-07,
"logits/chosen": -1.1091896295547485,
"logits/rejected": -1.1275534629821777,
"logps/chosen": -45.88606643676758,
"logps/rejected": -73.55078125,
"loss": 0.1871,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.062331780791282654,
"rewards/margins": 4.677850723266602,
"rewards/rejected": -4.740182876586914,
"step": 428
},
{
"epoch": 0.9924349275548148,
"grad_norm": 26.22123779547593,
"learning_rate": 2.966076215234082e-07,
"logits/chosen": -1.066051959991455,
"logits/rejected": -1.0764615535736084,
"logps/chosen": -54.595703125,
"logps/rejected": -72.26995849609375,
"loss": 0.1937,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.17351490259170532,
"rewards/margins": 4.106486797332764,
"rewards/rejected": -3.932971954345703,
"step": 430
},
{
"epoch": 0.9970509039620464,
"grad_norm": 33.565390661724486,
"learning_rate": 2.94625052041286e-07,
"logits/chosen": -1.1814826726913452,
"logits/rejected": -1.1842460632324219,
"logps/chosen": -50.375,
"logps/rejected": -58.42066955566406,
"loss": 0.255,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.07062428444623947,
"rewards/margins": 3.117582082748413,
"rewards/rejected": -3.188206672668457,
"step": 432
},
{
"epoch": 1.001666880369278,
"grad_norm": 21.605417390859568,
"learning_rate": 2.926395794644665e-07,
"logits/chosen": -1.1752268075942993,
"logits/rejected": -1.1771807670593262,
"logps/chosen": -51.052242279052734,
"logps/rejected": -61.03368377685547,
"loss": 0.1838,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.23978914320468903,
"rewards/margins": 3.637639045715332,
"rewards/rejected": -3.3978495597839355,
"step": 434
},
{
"epoch": 1.0062828567765099,
"grad_norm": 29.001645648697252,
"learning_rate": 2.906513329583991e-07,
"logits/chosen": -1.186964511871338,
"logits/rejected": -1.1928526163101196,
"logps/chosen": -46.22761535644531,
"logps/rejected": -62.58315658569336,
"loss": 0.2362,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.08661065995693207,
"rewards/margins": 3.6797680854797363,
"rewards/rejected": -3.76637864112854,
"step": 436
},
{
"epoch": 1.0108988331837414,
"grad_norm": 25.079284366199737,
"learning_rate": 2.886604418689921e-07,
"logits/chosen": -1.1703391075134277,
"logits/rejected": -1.1838041543960571,
"logps/chosen": -44.59703063964844,
"logps/rejected": -76.01687622070312,
"loss": 0.2554,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.19546601176261902,
"rewards/margins": 4.618009567260742,
"rewards/rejected": -4.813475131988525,
"step": 438
},
{
"epoch": 1.0155148095909732,
"grad_norm": 15.893743443702332,
"learning_rate": 2.866670357141979e-07,
"logits/chosen": -1.1566696166992188,
"logits/rejected": -1.1605850458145142,
"logps/chosen": -50.24718475341797,
"logps/rejected": -61.23912048339844,
"loss": 0.2096,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.3380340337753296,
"rewards/margins": 4.049044609069824,
"rewards/rejected": -3.711010456085205,
"step": 440
},
{
"epoch": 1.0155148095909732,
"eval_logits/chosen": -1.1188750267028809,
"eval_logits/rejected": -1.124423861503601,
"eval_logps/chosen": -47.42933654785156,
"eval_logps/rejected": -62.06835174560547,
"eval_loss": 0.2514457404613495,
"eval_rewards/accuracies": 0.8237327337265015,
"eval_rewards/chosen": 0.6020476222038269,
"eval_rewards/margins": 3.589672088623047,
"eval_rewards/rejected": -2.9876248836517334,
"eval_runtime": 227.3665,
"eval_samples_per_second": 7.626,
"eval_steps_per_second": 1.909,
"step": 440
},
{
"epoch": 1.020130785998205,
"grad_norm": 13.432673135223586,
"learning_rate": 2.8467124417558737e-07,
"logits/chosen": -1.1185688972473145,
"logits/rejected": -1.1209557056427002,
"logps/chosen": -48.8853759765625,
"logps/rejected": -63.88041305541992,
"loss": 0.1931,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.8135783076286316,
"rewards/margins": 4.117891788482666,
"rewards/rejected": -3.3043136596679688,
"step": 442
},
{
"epoch": 1.0247467624054365,
"grad_norm": 26.849141741016137,
"learning_rate": 2.8267319708991253e-07,
"logits/chosen": -1.096121907234192,
"logits/rejected": -1.0977866649627686,
"logps/chosen": -52.22743225097656,
"logps/rejected": -55.36363220214844,
"loss": 0.1924,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.6431276202201843,
"rewards/margins": 3.097036123275757,
"rewards/rejected": -2.453908681869507,
"step": 444
},
{
"epoch": 1.0293627388126683,
"grad_norm": 31.09854358728895,
"learning_rate": 2.806730244406612e-07,
"logits/chosen": -1.1671628952026367,
"logits/rejected": -1.1714211702346802,
"logps/chosen": -46.41295623779297,
"logps/rejected": -59.33990478515625,
"loss": 0.2477,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5287280082702637,
"rewards/margins": 3.4002442359924316,
"rewards/rejected": -2.871516227722168,
"step": 446
},
{
"epoch": 1.0339787152199,
"grad_norm": 31.751017976380037,
"learning_rate": 2.786708563496001e-07,
"logits/chosen": -1.2408480644226074,
"logits/rejected": -1.2548807859420776,
"logps/chosen": -49.92308044433594,
"logps/rejected": -67.60689544677734,
"loss": 0.179,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.6711897850036621,
"rewards/margins": 4.588629722595215,
"rewards/rejected": -3.9174396991729736,
"step": 448
},
{
"epoch": 1.0385946916271316,
"grad_norm": 32.2047076869101,
"learning_rate": 2.7666682306830994e-07,
"logits/chosen": -1.19577157497406,
"logits/rejected": -1.194454550743103,
"logps/chosen": -46.63425827026367,
"logps/rejected": -49.01081848144531,
"loss": 0.2547,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5076774954795837,
"rewards/margins": 2.832526206970215,
"rewards/rejected": -2.3248488903045654,
"step": 450
},
{
"epoch": 1.0432106680343634,
"grad_norm": 28.553486090102076,
"learning_rate": 2.746610549697119e-07,
"logits/chosen": -1.1639982461929321,
"logits/rejected": -1.1696867942810059,
"logps/chosen": -49.013301849365234,
"logps/rejected": -65.69493865966797,
"loss": 0.2036,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5706920623779297,
"rewards/margins": 3.6936237812042236,
"rewards/rejected": -3.122931957244873,
"step": 452
},
{
"epoch": 1.0478266444415951,
"grad_norm": 13.92439999757166,
"learning_rate": 2.7265368253958615e-07,
"logits/chosen": -1.2167223691940308,
"logits/rejected": -1.2195782661437988,
"logps/chosen": -45.21904754638672,
"logps/rejected": -53.50370407104492,
"loss": 0.1746,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.7654822468757629,
"rewards/margins": 3.2732510566711426,
"rewards/rejected": -2.5077688694000244,
"step": 454
},
{
"epoch": 1.0524426208488267,
"grad_norm": 36.82786613306219,
"learning_rate": 2.706448363680831e-07,
"logits/chosen": -1.1744914054870605,
"logits/rejected": -1.1839237213134766,
"logps/chosen": -47.35738754272461,
"logps/rejected": -73.86341094970703,
"loss": 0.1569,
"rewards/accuracies": 0.9583333134651184,
"rewards/chosen": 0.696943998336792,
"rewards/margins": 4.708230018615723,
"rewards/rejected": -4.011285781860352,
"step": 456
},
{
"epoch": 1.0570585972560584,
"grad_norm": 13.78515040845917,
"learning_rate": 2.686346471412277e-07,
"logits/chosen": -1.1262328624725342,
"logits/rejected": -1.1403940916061401,
"logps/chosen": -49.68544387817383,
"logps/rejected": -76.74578857421875,
"loss": 0.1419,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.3619132339954376,
"rewards/margins": 4.7866668701171875,
"rewards/rejected": -4.424753665924072,
"step": 458
},
{
"epoch": 1.0616745736632902,
"grad_norm": 26.57858405427076,
"learning_rate": 2.6662324563241805e-07,
"logits/chosen": -1.2429147958755493,
"logits/rejected": -1.2455251216888428,
"logps/chosen": -45.16366195678711,
"logps/rejected": -57.56427001953125,
"loss": 0.2357,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5941984057426453,
"rewards/margins": 3.2229466438293457,
"rewards/rejected": -2.6287484169006348,
"step": 460
},
{
"epoch": 1.0616745736632902,
"eval_logits/chosen": -1.1103266477584839,
"eval_logits/rejected": -1.1161012649536133,
"eval_logps/chosen": -47.78204345703125,
"eval_logps/rejected": -62.643646240234375,
"eval_loss": 0.2479603886604309,
"eval_rewards/accuracies": 0.8277649879455566,
"eval_rewards/chosen": 0.42569395899772644,
"eval_rewards/margins": 3.700965642929077,
"eval_rewards/rejected": -3.2752716541290283,
"eval_runtime": 227.8171,
"eval_samples_per_second": 7.611,
"eval_steps_per_second": 1.905,
"step": 460
},
{
"epoch": 1.0662905500705218,
"grad_norm": 16.869585682992792,
"learning_rate": 2.6461076269391713e-07,
"logits/chosen": -1.0661816596984863,
"logits/rejected": -1.0739065408706665,
"logps/chosen": -54.75306701660156,
"logps/rejected": -72.30966186523438,
"loss": 0.1519,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.5898687839508057,
"rewards/margins": 4.626287460327148,
"rewards/rejected": -4.03641939163208,
"step": 462
},
{
"epoch": 1.0709065264777535,
"grad_norm": 39.49599828169064,
"learning_rate": 2.625973292483409e-07,
"logits/chosen": -1.1013195514678955,
"logits/rejected": -1.1079678535461426,
"logps/chosen": -56.018310546875,
"logps/rejected": -69.99239349365234,
"loss": 0.2341,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.3379661738872528,
"rewards/margins": 3.941251516342163,
"rewards/rejected": -3.603285551071167,
"step": 464
},
{
"epoch": 1.0755225028849853,
"grad_norm": 19.11015860441972,
"learning_rate": 2.6058307628014065e-07,
"logits/chosen": -1.110822319984436,
"logits/rejected": -1.1167054176330566,
"logps/chosen": -53.93489074707031,
"logps/rejected": -66.05422973632812,
"loss": 0.1708,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.3638114929199219,
"rewards/margins": 4.259352207183838,
"rewards/rejected": -3.895540952682495,
"step": 466
},
{
"epoch": 1.0801384792922168,
"grad_norm": 29.43582021185457,
"learning_rate": 2.5856813482708217e-07,
"logits/chosen": -1.184598445892334,
"logits/rejected": -1.1916086673736572,
"logps/chosen": -49.6500244140625,
"logps/rejected": -56.30369567871094,
"loss": 0.2254,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.36434826254844666,
"rewards/margins": 3.6883039474487305,
"rewards/rejected": -3.32395601272583,
"step": 468
},
{
"epoch": 1.0847544556994486,
"grad_norm": 25.173770275980058,
"learning_rate": 2.565526359717206e-07,
"logits/chosen": -1.1290383338928223,
"logits/rejected": -1.1306836605072021,
"logps/chosen": -43.971435546875,
"logps/rejected": -53.389156341552734,
"loss": 0.289,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.18997710943222046,
"rewards/margins": 3.0762457847595215,
"rewards/rejected": -2.8862688541412354,
"step": 470
},
{
"epoch": 1.0893704321066804,
"grad_norm": 19.76426072194178,
"learning_rate": 2.545367108328731e-07,
"logits/chosen": -1.163740873336792,
"logits/rejected": -1.1682920455932617,
"logps/chosen": -49.15140914916992,
"logps/rejected": -59.53387451171875,
"loss": 0.187,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.26795661449432373,
"rewards/margins": 3.4266703128814697,
"rewards/rejected": -3.1587133407592773,
"step": 472
},
{
"epoch": 1.0939864085139122,
"grad_norm": 17.775087193890624,
"learning_rate": 2.525204905570889e-07,
"logits/chosen": -1.1204829216003418,
"logits/rejected": -1.1253838539123535,
"logps/chosen": -54.046390533447266,
"logps/rejected": -66.79469299316406,
"loss": 0.1607,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.39596712589263916,
"rewards/margins": 4.161929130554199,
"rewards/rejected": -3.7659616470336914,
"step": 474
},
{
"epoch": 1.0986023849211437,
"grad_norm": 24.757995548476888,
"learning_rate": 2.505041063101171e-07,
"logits/chosen": -1.1805049180984497,
"logits/rejected": -1.1901381015777588,
"logps/chosen": -53.46992492675781,
"logps/rejected": -59.58029556274414,
"loss": 0.2762,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.13624940812587738,
"rewards/margins": 3.440009832382202,
"rewards/rejected": -3.303760290145874,
"step": 476
},
{
"epoch": 1.1032183613283755,
"grad_norm": 19.970980113968885,
"learning_rate": 2.4848768926837466e-07,
"logits/chosen": -1.0963982343673706,
"logits/rejected": -1.112579345703125,
"logps/chosen": -47.13343811035156,
"logps/rejected": -87.57925415039062,
"loss": 0.1717,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.06597856432199478,
"rewards/margins": 5.439146518707275,
"rewards/rejected": -5.373167991638184,
"step": 478
},
{
"epoch": 1.107834337735607,
"grad_norm": 16.28084665624267,
"learning_rate": 2.464713706104113e-07,
"logits/chosen": -1.1157184839248657,
"logits/rejected": -1.1205692291259766,
"logps/chosen": -50.34828567504883,
"logps/rejected": -63.41987609863281,
"loss": 0.1608,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.043437667191028595,
"rewards/margins": 4.042869567871094,
"rewards/rejected": -3.99943208694458,
"step": 480
},
{
"epoch": 1.107834337735607,
"eval_logits/chosen": -1.1090331077575684,
"eval_logits/rejected": -1.1147438287734985,
"eval_logps/chosen": -48.225521087646484,
"eval_logps/rejected": -63.15263748168945,
"eval_loss": 0.24383509159088135,
"eval_rewards/accuracies": 0.8294931054115295,
"eval_rewards/chosen": 0.203952819108963,
"eval_rewards/margins": 3.733717441558838,
"eval_rewards/rejected": -3.529764175415039,
"eval_runtime": 227.3827,
"eval_samples_per_second": 7.626,
"eval_steps_per_second": 1.909,
"step": 480
},
{
"epoch": 1.1124503141428388,
"grad_norm": 22.059822838666445,
"learning_rate": 2.444552815083767e-07,
"logits/chosen": -1.1268048286437988,
"logits/rejected": -1.1286168098449707,
"logps/chosen": -49.519527435302734,
"logps/rejected": -52.751312255859375,
"loss": 0.2264,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.3959563970565796,
"rewards/margins": 3.2894110679626465,
"rewards/rejected": -2.8934545516967773,
"step": 482
},
{
"epoch": 1.1170662905500706,
"grad_norm": 19.4616654191151,
"learning_rate": 2.4243955311948693e-07,
"logits/chosen": -1.1610568761825562,
"logits/rejected": -1.1703104972839355,
"logps/chosen": -45.62093734741211,
"logps/rejected": -71.41989135742188,
"loss": 0.2218,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.056832365691661835,
"rewards/margins": 4.676138877868652,
"rewards/rejected": -4.619307041168213,
"step": 484
},
{
"epoch": 1.1216822669573023,
"grad_norm": 29.74478192767247,
"learning_rate": 2.4042431657749115e-07,
"logits/chosen": -1.082115650177002,
"logits/rejected": -1.097312092781067,
"logps/chosen": -47.262996673583984,
"logps/rejected": -84.21355438232422,
"loss": 0.1955,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.2266267091035843,
"rewards/margins": 4.9980058670043945,
"rewards/rejected": -4.771378993988037,
"step": 486
},
{
"epoch": 1.1262982433645339,
"grad_norm": 34.83946230592783,
"learning_rate": 2.384097029841419e-07,
"logits/chosen": -1.1644551753997803,
"logits/rejected": -1.1694614887237549,
"logps/chosen": -49.46389389038086,
"logps/rejected": -59.47831344604492,
"loss": 0.2086,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.2441619485616684,
"rewards/margins": 3.580085277557373,
"rewards/rejected": -3.335923433303833,
"step": 488
},
{
"epoch": 1.1309142197717656,
"grad_norm": 21.45284163289699,
"learning_rate": 2.3639584340066544e-07,
"logits/chosen": -1.1405658721923828,
"logits/rejected": -1.146559238433838,
"logps/chosen": -41.80732727050781,
"logps/rejected": -62.37071990966797,
"loss": 0.2166,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.7920265197753906,
"rewards/margins": 4.414507865905762,
"rewards/rejected": -3.622481346130371,
"step": 490
},
{
"epoch": 1.1355301961789972,
"grad_norm": 20.900830046410174,
"learning_rate": 2.3438286883923539e-07,
"logits/chosen": -1.164839267730713,
"logits/rejected": -1.1716415882110596,
"logps/chosen": -52.77006912231445,
"logps/rejected": -60.97998046875,
"loss": 0.2024,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.7002817988395691,
"rewards/margins": 3.6204490661621094,
"rewards/rejected": -2.9201676845550537,
"step": 492
},
{
"epoch": 1.140146172586229,
"grad_norm": 22.811055008597176,
"learning_rate": 2.323709102544506e-07,
"logits/chosen": -1.1385387182235718,
"logits/rejected": -1.1350369453430176,
"logps/chosen": -44.97706604003906,
"logps/rejected": -47.24876403808594,
"loss": 0.267,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.8821587562561035,
"rewards/margins": 2.6500444412231445,
"rewards/rejected": -1.7678859233856201,
"step": 494
},
{
"epoch": 1.1447621489934607,
"grad_norm": 25.7099907801656,
"learning_rate": 2.3036009853481474e-07,
"logits/chosen": -1.1164131164550781,
"logits/rejected": -1.124334454536438,
"logps/chosen": -44.46446228027344,
"logps/rejected": -66.84891510009766,
"loss": 0.252,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.42438387870788574,
"rewards/margins": 4.415454387664795,
"rewards/rejected": -3.9910707473754883,
"step": 496
},
{
"epoch": 1.1493781254006925,
"grad_norm": 24.703959475838438,
"learning_rate": 2.283505644942223e-07,
"logits/chosen": -1.1680512428283691,
"logits/rejected": -1.172341227531433,
"logps/chosen": -39.898555755615234,
"logps/rejected": -61.25082778930664,
"loss": 0.1888,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.8999897837638855,
"rewards/margins": 4.018680095672607,
"rewards/rejected": -3.118690252304077,
"step": 498
},
{
"epoch": 1.153994101807924,
"grad_norm": 22.54617945872361,
"learning_rate": 2.2634243886344781e-07,
"logits/chosen": -1.1353996992111206,
"logits/rejected": -1.1466150283813477,
"logps/chosen": -47.095890045166016,
"logps/rejected": -63.17418670654297,
"loss": 0.1944,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.0475714206695557,
"rewards/margins": 4.362390518188477,
"rewards/rejected": -3.314818859100342,
"step": 500
},
{
"epoch": 1.153994101807924,
"eval_logits/chosen": -1.1096464395523071,
"eval_logits/rejected": -1.1153897047042847,
"eval_logps/chosen": -46.75843811035156,
"eval_logps/rejected": -61.96721649169922,
"eval_loss": 0.24556967616081238,
"eval_rewards/accuracies": 0.8312212228775024,
"eval_rewards/chosen": 0.9374985098838806,
"eval_rewards/margins": 3.8745529651641846,
"eval_rewards/rejected": -2.9370551109313965,
"eval_runtime": 227.4325,
"eval_samples_per_second": 7.624,
"eval_steps_per_second": 1.908,
"step": 500
},
{
"epoch": 1.1586100782151558,
"grad_norm": 27.18085844791722,
"learning_rate": 2.2433585228164115e-07,
"logits/chosen": -1.1491751670837402,
"logits/rejected": -1.1580781936645508,
"logps/chosen": -50.64473342895508,
"logps/rejected": -75.20685577392578,
"loss": 0.2218,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.8257203698158264,
"rewards/margins": 5.088500022888184,
"rewards/rejected": -4.26278018951416,
"step": 502
},
{
"epoch": 1.1632260546223874,
"grad_norm": 23.89654504779422,
"learning_rate": 2.2233093528782938e-07,
"logits/chosen": -1.1577401161193848,
"logits/rejected": -1.1688594818115234,
"logps/chosen": -54.145572662353516,
"logps/rejected": -66.3442611694336,
"loss": 0.1751,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 1.085695743560791,
"rewards/margins": 4.018051624298096,
"rewards/rejected": -2.9323554039001465,
"step": 504
},
{
"epoch": 1.1678420310296191,
"grad_norm": 23.769575263027864,
"learning_rate": 2.2032781831242367e-07,
"logits/chosen": -1.1783199310302734,
"logits/rejected": -1.182464838027954,
"logps/chosen": -41.669734954833984,
"logps/rejected": -51.315006256103516,
"loss": 0.2418,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.9586374759674072,
"rewards/margins": 3.437027931213379,
"rewards/rejected": -2.4783899784088135,
"step": 506
},
{
"epoch": 1.172458007436851,
"grad_norm": 36.28094297883332,
"learning_rate": 2.183266316687347e-07,
"logits/chosen": -1.1632755994796753,
"logits/rejected": -1.1601239442825317,
"logps/chosen": -47.738983154296875,
"logps/rejected": -49.51511001586914,
"loss": 0.2641,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 1.0999305248260498,
"rewards/margins": 2.7960052490234375,
"rewards/rejected": -1.6960747241973877,
"step": 508
},
{
"epoch": 1.1770739838440827,
"grad_norm": 18.77553292711027,
"learning_rate": 2.16327505544495e-07,
"logits/chosen": -1.1429252624511719,
"logits/rejected": -1.1522622108459473,
"logps/chosen": -50.42318344116211,
"logps/rejected": -66.82962036132812,
"loss": 0.1438,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 1.0773425102233887,
"rewards/margins": 4.845379829406738,
"rewards/rejected": -3.7680368423461914,
"step": 510
},
{
"epoch": 1.1816899602513142,
"grad_norm": 17.949048045246396,
"learning_rate": 2.143305699933892e-07,
"logits/chosen": -1.1755365133285522,
"logits/rejected": -1.180452823638916,
"logps/chosen": -43.959930419921875,
"logps/rejected": -64.32354736328125,
"loss": 0.2051,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.7747830748558044,
"rewards/margins": 4.00773811340332,
"rewards/rejected": -3.23295521736145,
"step": 512
},
{
"epoch": 1.186305936658546,
"grad_norm": 29.609702959008317,
"learning_rate": 2.1233595492659382e-07,
"logits/chosen": -1.0717233419418335,
"logits/rejected": -1.0747785568237305,
"logps/chosen": -56.3906135559082,
"logps/rejected": -58.85280990600586,
"loss": 0.1651,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.6519337892532349,
"rewards/margins": 3.8963236808776855,
"rewards/rejected": -3.244389772415161,
"step": 514
},
{
"epoch": 1.1909219130657775,
"grad_norm": 20.918336765308602,
"learning_rate": 2.1034379010432542e-07,
"logits/chosen": -1.1730085611343384,
"logits/rejected": -1.1727977991104126,
"logps/chosen": -44.0880126953125,
"logps/rejected": -56.85806655883789,
"loss": 0.1976,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.4944622218608856,
"rewards/margins": 3.6419970989227295,
"rewards/rejected": -3.1475343704223633,
"step": 516
},
{
"epoch": 1.1955378894730093,
"grad_norm": 25.288305102390122,
"learning_rate": 2.0835420512739957e-07,
"logits/chosen": -1.1438689231872559,
"logits/rejected": -1.1519646644592285,
"logps/chosen": -47.1004638671875,
"logps/rejected": -83.05133819580078,
"loss": 0.1839,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2153804749250412,
"rewards/margins": 5.248907089233398,
"rewards/rejected": -5.03352689743042,
"step": 518
},
{
"epoch": 1.200153865880241,
"grad_norm": 19.59880042292574,
"learning_rate": 2.0636732942879917e-07,
"logits/chosen": -1.1264581680297852,
"logits/rejected": -1.1307095289230347,
"logps/chosen": -50.47024917602539,
"logps/rejected": -64.73220825195312,
"loss": 0.1619,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.3765060007572174,
"rewards/margins": 4.3557209968566895,
"rewards/rejected": -3.979214906692505,
"step": 520
},
{
"epoch": 1.200153865880241,
"eval_logits/chosen": -1.1257847547531128,
"eval_logits/rejected": -1.1301593780517578,
"eval_logps/chosen": -48.62382507324219,
"eval_logps/rejected": -63.695838928222656,
"eval_loss": 0.24041977524757385,
"eval_rewards/accuracies": 0.8335253596305847,
"eval_rewards/chosen": 0.004803389776498079,
"eval_rewards/margins": 3.8061721324920654,
"eval_rewards/rejected": -3.8013687133789062,
"eval_runtime": 227.3215,
"eval_samples_per_second": 7.628,
"eval_steps_per_second": 1.909,
"step": 520
},
{
"epoch": 1.2047698422874729,
"grad_norm": 17.44277679097044,
"learning_rate": 2.0438329226525415e-07,
"logits/chosen": -1.144020915031433,
"logits/rejected": -1.1453847885131836,
"logps/chosen": -49.49299621582031,
"logps/rejected": -53.57789611816406,
"loss": 0.2025,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.42205584049224854,
"rewards/margins": 3.364236831665039,
"rewards/rejected": -2.942180871963501,
"step": 522
},
{
"epoch": 1.2093858186947044,
"grad_norm": 37.73486109040497,
"learning_rate": 2.0240222270883288e-07,
"logits/chosen": -1.1556731462478638,
"logits/rejected": -1.1702880859375,
"logps/chosen": -50.98395538330078,
"logps/rejected": -76.1040267944336,
"loss": 0.2225,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.07603338360786438,
"rewards/margins": 4.920933246612549,
"rewards/rejected": -4.9969658851623535,
"step": 524
},
{
"epoch": 1.2140017951019362,
"grad_norm": 25.377653028594498,
"learning_rate": 2.0042424963854542e-07,
"logits/chosen": -1.196961760520935,
"logits/rejected": -1.2145916223526,
"logps/chosen": -47.896732330322266,
"logps/rejected": -83.96469116210938,
"loss": 0.1463,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.26778745651245117,
"rewards/margins": 5.1378092765808105,
"rewards/rejected": -5.40559720993042,
"step": 526
},
{
"epoch": 1.2186177715091677,
"grad_norm": 21.037533939572622,
"learning_rate": 1.9844950173195883e-07,
"logits/chosen": -1.2031718492507935,
"logits/rejected": -1.207302212715149,
"logps/chosen": -48.16632843017578,
"logps/rejected": -63.03504943847656,
"loss": 0.1798,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.3718983829021454,
"rewards/margins": 3.580104351043701,
"rewards/rejected": -3.95200252532959,
"step": 528
},
{
"epoch": 1.2232337479163995,
"grad_norm": 22.283357312346677,
"learning_rate": 1.964781074568265e-07,
"logits/chosen": -1.2361119985580444,
"logits/rejected": -1.2355589866638184,
"logps/chosen": -48.786136627197266,
"logps/rejected": -53.23166275024414,
"loss": 0.1959,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.26044440269470215,
"rewards/margins": 3.046844482421875,
"rewards/rejected": -3.307288885116577,
"step": 530
},
{
"epoch": 1.2278497243236313,
"grad_norm": 32.696983725912126,
"learning_rate": 1.9451019506273018e-07,
"logits/chosen": -1.1622250080108643,
"logits/rejected": -1.16159987449646,
"logps/chosen": -42.64177322387695,
"logps/rejected": -55.659339904785156,
"loss": 0.2379,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.17224609851837158,
"rewards/margins": 3.0260884761810303,
"rewards/rejected": -3.1983346939086914,
"step": 532
},
{
"epoch": 1.232465700730863,
"grad_norm": 31.81561570512381,
"learning_rate": 1.9254589257273712e-07,
"logits/chosen": -1.1568024158477783,
"logits/rejected": -1.1644660234451294,
"logps/chosen": -43.20817565917969,
"logps/rejected": -66.71856689453125,
"loss": 0.1648,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.03500910475850105,
"rewards/margins": 4.864539623260498,
"rewards/rejected": -4.899548530578613,
"step": 534
},
{
"epoch": 1.2370816771380946,
"grad_norm": 28.042577352949,
"learning_rate": 1.9058532777507141e-07,
"logits/chosen": -1.1810351610183716,
"logits/rejected": -1.186877727508545,
"logps/chosen": -46.86735534667969,
"logps/rejected": -58.00865173339844,
"loss": 0.1946,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.07706524431705475,
"rewards/margins": 3.6953649520874023,
"rewards/rejected": -3.772430896759033,
"step": 536
},
{
"epoch": 1.2416976535453264,
"grad_norm": 31.06776524909045,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -1.1419155597686768,
"logits/rejected": -1.1518969535827637,
"logps/chosen": -48.36051559448242,
"logps/rejected": -68.71621704101562,
"loss": 0.2628,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.3396787643432617,
"rewards/margins": 4.181530952453613,
"rewards/rejected": -4.521209716796875,
"step": 538
},
{
"epoch": 1.246313629952558,
"grad_norm": 25.267305267108494,
"learning_rate": 1.8667592118553693e-07,
"logits/chosen": -1.2024831771850586,
"logits/rejected": -1.2076871395111084,
"logps/chosen": -52.623023986816406,
"logps/rejected": -61.89004898071289,
"loss": 0.2083,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.21523982286453247,
"rewards/margins": 3.845079183578491,
"rewards/rejected": -4.060319423675537,
"step": 540
},
{
"epoch": 1.246313629952558,
"eval_logits/chosen": -1.1086678504943848,
"eval_logits/rejected": -1.11427903175354,
"eval_logps/chosen": -48.652427673339844,
"eval_logps/rejected": -63.7818717956543,
"eval_loss": 0.2395094931125641,
"eval_rewards/accuracies": 0.8323732614517212,
"eval_rewards/chosen": -0.009500053711235523,
"eval_rewards/margins": 3.8348822593688965,
"eval_rewards/rejected": -3.8443822860717773,
"eval_runtime": 227.3478,
"eval_samples_per_second": 7.627,
"eval_steps_per_second": 1.909,
"step": 540
},
{
"epoch": 1.2509296063597897,
"grad_norm": 29.79858133121253,
"learning_rate": 1.8472733372115956e-07,
"logits/chosen": -1.2199275493621826,
"logits/rejected": -1.2279648780822754,
"logps/chosen": -50.78293228149414,
"logps/rejected": -72.13998413085938,
"loss": 0.2006,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.18516860902309418,
"rewards/margins": 4.692964553833008,
"rewards/rejected": -4.878133773803711,
"step": 542
},
{
"epoch": 1.2555455827670214,
"grad_norm": 18.335692774068683,
"learning_rate": 1.8278299258754692e-07,
"logits/chosen": -1.1382191181182861,
"logits/rejected": -1.1532717943191528,
"logps/chosen": -51.21404266357422,
"logps/rejected": -84.46864318847656,
"loss": 0.2507,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.002586497226729989,
"rewards/margins": 6.08302116394043,
"rewards/rejected": -6.085607528686523,
"step": 544
},
{
"epoch": 1.2601615591742532,
"grad_norm": 16.812848293352094,
"learning_rate": 1.808430242743316e-07,
"logits/chosen": -1.1635518074035645,
"logits/rejected": -1.1678366661071777,
"logps/chosen": -50.37785339355469,
"logps/rejected": -63.68332290649414,
"loss": 0.207,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.4088381230831146,
"rewards/margins": 4.470311641693115,
"rewards/rejected": -4.0614728927612305,
"step": 546
},
{
"epoch": 1.2647775355814848,
"grad_norm": 15.038796138409147,
"learning_rate": 1.7890755498667104e-07,
"logits/chosen": -1.1664639711380005,
"logits/rejected": -1.1719496250152588,
"logps/chosen": -42.69700622558594,
"logps/rejected": -64.51258850097656,
"loss": 0.1614,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.17392092943191528,
"rewards/margins": 4.139625549316406,
"rewards/rejected": -3.9657046794891357,
"step": 548
},
{
"epoch": 1.2693935119887165,
"grad_norm": 27.84348089734983,
"learning_rate": 1.7697671063703756e-07,
"logits/chosen": -1.1588454246520996,
"logits/rejected": -1.1661314964294434,
"logps/chosen": -44.9870719909668,
"logps/rejected": -65.78500366210938,
"loss": 0.2201,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.46526622772216797,
"rewards/margins": 4.418604373931885,
"rewards/rejected": -3.953338623046875,
"step": 550
},
{
"epoch": 1.274009488395948,
"grad_norm": 44.28075638457142,
"learning_rate": 1.750506168370267e-07,
"logits/chosen": -1.1754214763641357,
"logits/rejected": -1.175520896911621,
"logps/chosen": -46.97793960571289,
"logps/rejected": -54.8778076171875,
"loss": 0.2392,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.9450171589851379,
"rewards/margins": 3.314897298812866,
"rewards/rejected": -2.369880199432373,
"step": 552
},
{
"epoch": 1.2786254648031798,
"grad_norm": 17.20774183339189,
"learning_rate": 1.7312939888918594e-07,
"logits/chosen": -1.1337149143218994,
"logits/rejected": -1.1420766115188599,
"logps/chosen": -49.699161529541016,
"logps/rejected": -73.22956085205078,
"loss": 0.1671,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": 0.23295314610004425,
"rewards/margins": 4.598634243011475,
"rewards/rejected": -4.36568021774292,
"step": 554
},
{
"epoch": 1.2832414412104116,
"grad_norm": 12.879730723232214,
"learning_rate": 1.712131817788628e-07,
"logits/chosen": -1.1121582984924316,
"logits/rejected": -1.1107348203659058,
"logps/chosen": -46.21583938598633,
"logps/rejected": -56.827064514160156,
"loss": 0.2081,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.3201579451560974,
"rewards/margins": 3.4782567024230957,
"rewards/rejected": -3.1580984592437744,
"step": 556
},
{
"epoch": 1.2878574176176434,
"grad_norm": 18.379299257318767,
"learning_rate": 1.693020901660738e-07,
"logits/chosen": -1.1363815069198608,
"logits/rejected": -1.1399273872375488,
"logps/chosen": -53.33127212524414,
"logps/rejected": -65.93392181396484,
"loss": 0.1484,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.24407842755317688,
"rewards/margins": 4.509576320648193,
"rewards/rejected": -4.265497207641602,
"step": 558
},
{
"epoch": 1.292473394024875,
"grad_norm": 30.348807794181617,
"learning_rate": 1.6739624837739518e-07,
"logits/chosen": -1.1836738586425781,
"logits/rejected": -1.188474416732788,
"logps/chosen": -53.792484283447266,
"logps/rejected": -60.70852279663086,
"loss": 0.2063,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.07742541283369064,
"rewards/margins": 3.257985830307007,
"rewards/rejected": -3.335411310195923,
"step": 560
},
{
"epoch": 1.292473394024875,
"eval_logits/chosen": -1.1171412467956543,
"eval_logits/rejected": -1.1220649480819702,
"eval_logps/chosen": -48.55030059814453,
"eval_logps/rejected": -63.84280014038086,
"eval_loss": 0.2384917438030243,
"eval_rewards/accuracies": 0.8329492807388306,
"eval_rewards/chosen": 0.04156512767076492,
"eval_rewards/margins": 3.9164135456085205,
"eval_rewards/rejected": -3.8748483657836914,
"eval_runtime": 227.4385,
"eval_samples_per_second": 7.624,
"eval_steps_per_second": 1.908,
"step": 560
},
{
"epoch": 1.2970893704321067,
"grad_norm": 16.156045882880065,
"learning_rate": 1.6549578039787434e-07,
"logits/chosen": -1.174346923828125,
"logits/rejected": -1.1770930290222168,
"logps/chosen": -50.916481018066406,
"logps/rejected": -73.8545913696289,
"loss": 0.2379,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.09411442279815674,
"rewards/margins": 4.1428680419921875,
"rewards/rejected": -4.04875373840332,
"step": 562
},
{
"epoch": 1.3017053468393383,
"grad_norm": 16.73959305434838,
"learning_rate": 1.6360080986296384e-07,
"logits/chosen": -1.14383065700531,
"logits/rejected": -1.1593358516693115,
"logps/chosen": -43.3416862487793,
"logps/rejected": -74.88224029541016,
"loss": 0.1743,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.07543455064296722,
"rewards/margins": 5.390232563018799,
"rewards/rejected": -5.465667247772217,
"step": 564
},
{
"epoch": 1.30632132324657,
"grad_norm": 19.01188883701719,
"learning_rate": 1.6171146005047894e-07,
"logits/chosen": -1.1023046970367432,
"logits/rejected": -1.1084368228912354,
"logps/chosen": -55.277671813964844,
"logps/rejected": -72.66205596923828,
"loss": 0.1773,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.12563864886760712,
"rewards/margins": 4.530452728271484,
"rewards/rejected": -4.404813766479492,
"step": 566
},
{
"epoch": 1.3109372996538018,
"grad_norm": 21.87767395078719,
"learning_rate": 1.5982785387257694e-07,
"logits/chosen": -1.1128134727478027,
"logits/rejected": -1.1113499402999878,
"logps/chosen": -49.72315979003906,
"logps/rejected": -56.22350311279297,
"loss": 0.1993,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.21243299543857574,
"rewards/margins": 3.1131999492645264,
"rewards/rejected": -3.3256328105926514,
"step": 568
},
{
"epoch": 1.3155532760610336,
"grad_norm": 23.959413620362366,
"learning_rate": 1.5795011386776159e-07,
"logits/chosen": -1.2445893287658691,
"logits/rejected": -1.2455319166183472,
"logps/chosen": -49.374000549316406,
"logps/rejected": -54.24154281616211,
"loss": 0.2022,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.03143635019659996,
"rewards/margins": 3.3321659564971924,
"rewards/rejected": -3.300729751586914,
"step": 570
},
{
"epoch": 1.320169252468265,
"grad_norm": 16.120326025420926,
"learning_rate": 1.560783621929113e-07,
"logits/chosen": -1.216759204864502,
"logits/rejected": -1.2203327417373657,
"logps/chosen": -57.26462936401367,
"logps/rejected": -62.25014877319336,
"loss": 0.1895,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.36375537514686584,
"rewards/margins": 3.7947163581848145,
"rewards/rejected": -3.4309606552124023,
"step": 572
},
{
"epoch": 1.3247852288754969,
"grad_norm": 48.11745636864845,
"learning_rate": 1.5421272061533177e-07,
"logits/chosen": -1.1405613422393799,
"logits/rejected": -1.1527469158172607,
"logps/chosen": -43.10445785522461,
"logps/rejected": -70.5496826171875,
"loss": 0.3131,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.3498223125934601,
"rewards/margins": 4.530541896820068,
"rewards/rejected": -4.180719375610352,
"step": 574
},
{
"epoch": 1.3294012052827284,
"grad_norm": 36.29330680340726,
"learning_rate": 1.5235331050483513e-07,
"logits/chosen": -1.110296607017517,
"logits/rejected": -1.1132102012634277,
"logps/chosen": -50.03364944458008,
"logps/rejected": -65.54788208007812,
"loss": 0.2241,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.058246809989213943,
"rewards/margins": 3.8965775966644287,
"rewards/rejected": -3.954824209213257,
"step": 576
},
{
"epoch": 1.3340171816899602,
"grad_norm": 12.515890509319174,
"learning_rate": 1.5050025282584327e-07,
"logits/chosen": -1.1224780082702637,
"logits/rejected": -1.1311124563217163,
"logps/chosen": -56.89671325683594,
"logps/rejected": -74.29792785644531,
"loss": 0.1357,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": 0.14432966709136963,
"rewards/margins": 4.722050666809082,
"rewards/rejected": -4.577720642089844,
"step": 578
},
{
"epoch": 1.338633158097192,
"grad_norm": 16.31324930298768,
"learning_rate": 1.4865366812951921e-07,
"logits/chosen": -1.0956053733825684,
"logits/rejected": -1.0948615074157715,
"logps/chosen": -44.66694641113281,
"logps/rejected": -53.35417175292969,
"loss": 0.186,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.01459770742803812,
"rewards/margins": 3.5722241401672363,
"rewards/rejected": -3.586822032928467,
"step": 580
},
{
"epoch": 1.338633158097192,
"eval_logits/chosen": -1.1196925640106201,
"eval_logits/rejected": -1.1241984367370605,
"eval_logps/chosen": -49.20341873168945,
"eval_logps/rejected": -64.5632095336914,
"eval_loss": 0.23692870140075684,
"eval_rewards/accuracies": 0.8352534770965576,
"eval_rewards/chosen": -0.28499341011047363,
"eval_rewards/margins": 3.9500606060028076,
"eval_rewards/rejected": -4.2350544929504395,
"eval_runtime": 227.2284,
"eval_samples_per_second": 7.631,
"eval_steps_per_second": 1.91,
"step": 580
},
{
"epoch": 1.3432491345044237,
"grad_norm": 20.781517687408783,
"learning_rate": 1.4681367654592446e-07,
"logits/chosen": -1.1334997415542603,
"logits/rejected": -1.1324316263198853,
"logps/chosen": -51.77867889404297,
"logps/rejected": -59.6033935546875,
"loss": 0.1616,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": -0.17863652110099792,
"rewards/margins": 3.280036449432373,
"rewards/rejected": -3.4586730003356934,
"step": 582
},
{
"epoch": 1.3478651109116553,
"grad_norm": 23.414987494186462,
"learning_rate": 1.4498039777620353e-07,
"logits/chosen": -1.1315072774887085,
"logits/rejected": -1.1402992010116577,
"logps/chosen": -56.817928314208984,
"logps/rejected": -78.69415283203125,
"loss": 0.1822,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.08201665431261063,
"rewards/margins": 4.948797702789307,
"rewards/rejected": -4.866781234741211,
"step": 584
},
{
"epoch": 1.352481087318887,
"grad_norm": 29.704044318217893,
"learning_rate": 1.4315395108479728e-07,
"logits/chosen": -1.170173168182373,
"logits/rejected": -1.1773362159729004,
"logps/chosen": -49.731021881103516,
"logps/rejected": -67.60116577148438,
"loss": 0.1818,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.5396385788917542,
"rewards/margins": 3.7138872146606445,
"rewards/rejected": -4.253525733947754,
"step": 586
},
{
"epoch": 1.3570970637261186,
"grad_norm": 26.692588714788616,
"learning_rate": 1.4133445529168365e-07,
"logits/chosen": -1.1388497352600098,
"logits/rejected": -1.1425156593322754,
"logps/chosen": -54.95615005493164,
"logps/rejected": -69.78382873535156,
"loss": 0.1626,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.5585896968841553,
"rewards/margins": 4.065035820007324,
"rewards/rejected": -4.623625755310059,
"step": 588
},
{
"epoch": 1.3617130401333504,
"grad_norm": 17.240579232471294,
"learning_rate": 1.395220287646483e-07,
"logits/chosen": -1.1489366292953491,
"logits/rejected": -1.1541228294372559,
"logps/chosen": -52.8204345703125,
"logps/rejected": -65.34224700927734,
"loss": 0.1726,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.6690115332603455,
"rewards/margins": 3.6602673530578613,
"rewards/rejected": -4.329278945922852,
"step": 590
},
{
"epoch": 1.3663290165405821,
"grad_norm": 26.542897065871216,
"learning_rate": 1.377167894115837e-07,
"logits/chosen": -1.092912197113037,
"logits/rejected": -1.1030445098876953,
"logps/chosen": -45.3072509765625,
"logps/rejected": -80.26785278320312,
"loss": 0.1874,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.4750543236732483,
"rewards/margins": 4.889781475067139,
"rewards/rejected": -5.3648362159729,
"step": 592
},
{
"epoch": 1.370944992947814,
"grad_norm": 14.292615359684781,
"learning_rate": 1.3591885467281877e-07,
"logits/chosen": -1.2319241762161255,
"logits/rejected": -1.2370344400405884,
"logps/chosen": -47.54234313964844,
"logps/rejected": -69.16310119628906,
"loss": 0.175,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.6050369739532471,
"rewards/margins": 4.743907451629639,
"rewards/rejected": -5.348944664001465,
"step": 594
},
{
"epoch": 1.3755609693550455,
"grad_norm": 24.44530683273813,
"learning_rate": 1.3412834151347896e-07,
"logits/chosen": -1.1558417081832886,
"logits/rejected": -1.1581149101257324,
"logps/chosen": -51.461524963378906,
"logps/rejected": -66.95028686523438,
"loss": 0.1852,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.5921568274497986,
"rewards/margins": 4.088617324829102,
"rewards/rejected": -4.680773735046387,
"step": 596
},
{
"epoch": 1.3801769457622772,
"grad_norm": 20.235949045232623,
"learning_rate": 1.323453664158769e-07,
"logits/chosen": -1.15675687789917,
"logits/rejected": -1.1713188886642456,
"logps/chosen": -47.177574157714844,
"logps/rejected": -76.56927490234375,
"loss": 0.2271,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.8099576234817505,
"rewards/margins": 4.676289081573486,
"rewards/rejected": -5.486246585845947,
"step": 598
},
{
"epoch": 1.3847929221695088,
"grad_norm": 17.33444042750883,
"learning_rate": 1.3057004537193422e-07,
"logits/chosen": -1.178117036819458,
"logits/rejected": -1.1799274682998657,
"logps/chosen": -53.55020523071289,
"logps/rejected": -62.39503860473633,
"loss": 0.1845,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.5546784996986389,
"rewards/margins": 4.285789966583252,
"rewards/rejected": -4.840468883514404,
"step": 600
},
{
"epoch": 1.3847929221695088,
"eval_logits/chosen": -1.1150095462799072,
"eval_logits/rejected": -1.1200028657913208,
"eval_logps/chosen": -49.0994873046875,
"eval_logps/rejected": -64.55924987792969,
"eval_loss": 0.23688365519046783,
"eval_rewards/accuracies": 0.8312212228775024,
"eval_rewards/chosen": -0.2330285757780075,
"eval_rewards/margins": 4.000042915344238,
"eval_rewards/rejected": -4.2330708503723145,
"eval_runtime": 227.4951,
"eval_samples_per_second": 7.622,
"eval_steps_per_second": 1.908,
"step": 600
},
{
"epoch": 1.3894088985767405,
"grad_norm": 37.7777346930926,
"learning_rate": 1.2880249387563662e-07,
"logits/chosen": -1.1187705993652344,
"logits/rejected": -1.1262003183364868,
"logps/chosen": -51.49440002441406,
"logps/rejected": -74.16474914550781,
"loss": 0.1865,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.5179623365402222,
"rewards/margins": 4.877338409423828,
"rewards/rejected": -5.395299911499023,
"step": 602
},
{
"epoch": 1.3940248749839723,
"grad_norm": 9.235672719669628,
"learning_rate": 1.2704282691551938e-07,
"logits/chosen": -1.116797924041748,
"logits/rejected": -1.1314423084259033,
"logps/chosen": -47.851707458496094,
"logps/rejected": -79.61589813232422,
"loss": 0.1526,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.13351619243621826,
"rewards/margins": 5.681245803833008,
"rewards/rejected": -5.5477294921875,
"step": 604
},
{
"epoch": 1.398640851391204,
"grad_norm": 22.69345823971204,
"learning_rate": 1.2529115896718714e-07,
"logits/chosen": -1.1550525426864624,
"logits/rejected": -1.158216118812561,
"logps/chosen": -52.753231048583984,
"logps/rejected": -61.50723648071289,
"loss": 0.1924,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.3491640090942383,
"rewards/margins": 3.678175926208496,
"rewards/rejected": -4.027340412139893,
"step": 606
},
{
"epoch": 1.4032568277984356,
"grad_norm": 18.832219757797976,
"learning_rate": 1.2354760398586708e-07,
"logits/chosen": -1.0966627597808838,
"logits/rejected": -1.108534574508667,
"logps/chosen": -55.49364471435547,
"logps/rejected": -82.76998901367188,
"loss": 0.1459,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.10722073167562485,
"rewards/margins": 5.6869659423828125,
"rewards/rejected": -5.794186592102051,
"step": 608
},
{
"epoch": 1.4078728042056674,
"grad_norm": 14.9525121487636,
"learning_rate": 1.2181227539899468e-07,
"logits/chosen": -1.1296883821487427,
"logits/rejected": -1.1345244646072388,
"logps/chosen": -52.32283020019531,
"logps/rejected": -67.40359497070312,
"loss": 0.1857,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.009007109329104424,
"rewards/margins": 4.094158172607422,
"rewards/rejected": -4.103165626525879,
"step": 610
},
{
"epoch": 1.412488780612899,
"grad_norm": 14.336076086644082,
"learning_rate": 1.2008528609883557e-07,
"logits/chosen": -1.1148794889450073,
"logits/rejected": -1.125455379486084,
"logps/chosen": -54.79003143310547,
"logps/rejected": -75.44340515136719,
"loss": 0.1438,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.29885244369506836,
"rewards/margins": 5.630979537963867,
"rewards/rejected": -5.332127571105957,
"step": 612
},
{
"epoch": 1.4171047570201307,
"grad_norm": 19.871071765511616,
"learning_rate": 1.1836674843514042e-07,
"logits/chosen": -1.1470178365707397,
"logits/rejected": -1.1546311378479004,
"logps/chosen": -43.762474060058594,
"logps/rejected": -64.04544830322266,
"loss": 0.1737,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.18334037065505981,
"rewards/margins": 4.691874027252197,
"rewards/rejected": -4.508533477783203,
"step": 614
},
{
"epoch": 1.4217207334273625,
"grad_norm": 12.336389870346276,
"learning_rate": 1.1665677420783671e-07,
"logits/chosen": -1.1334505081176758,
"logits/rejected": -1.1354079246520996,
"logps/chosen": -48.85646057128906,
"logps/rejected": -58.98961639404297,
"loss": 0.1774,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.5762940645217896,
"rewards/margins": 4.42154598236084,
"rewards/rejected": -3.84525203704834,
"step": 616
},
{
"epoch": 1.4263367098345943,
"grad_norm": 26.99839545918975,
"learning_rate": 1.149554746597553e-07,
"logits/chosen": -1.153773546218872,
"logits/rejected": -1.1617780923843384,
"logps/chosen": -51.52302551269531,
"logps/rejected": -71.93586730957031,
"loss": 0.2209,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.06947656720876694,
"rewards/margins": 4.659074783325195,
"rewards/rejected": -4.728551387786865,
"step": 618
},
{
"epoch": 1.4309526862418258,
"grad_norm": 23.120670434819147,
"learning_rate": 1.1326296046939333e-07,
"logits/chosen": -1.2297728061676025,
"logits/rejected": -1.2327196598052979,
"logps/chosen": -46.15282440185547,
"logps/rejected": -57.13591766357422,
"loss": 0.2511,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.22867631912231445,
"rewards/margins": 3.8101589679718018,
"rewards/rejected": -3.5814828872680664,
"step": 620
},
{
"epoch": 1.4309526862418258,
"eval_logits/chosen": -1.1074106693267822,
"eval_logits/rejected": -1.1129966974258423,
"eval_logps/chosen": -48.4957389831543,
"eval_logps/rejected": -64.15252685546875,
"eval_loss": 0.23753519356250763,
"eval_rewards/accuracies": 0.8341013789176941,
"eval_rewards/chosen": 0.06884526461362839,
"eval_rewards/margins": 4.098559379577637,
"eval_rewards/rejected": -4.029714107513428,
"eval_runtime": 227.1817,
"eval_samples_per_second": 7.633,
"eval_steps_per_second": 1.91,
"step": 620
},
{
"epoch": 1.4355686626490576,
"grad_norm": 19.372274896588006,
"learning_rate": 1.1157934174371413e-07,
"logits/chosen": -1.1408151388168335,
"logits/rejected": -1.15198814868927,
"logps/chosen": -50.5245361328125,
"logps/rejected": -73.73757934570312,
"loss": 0.1935,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.12286674976348877,
"rewards/margins": 4.926203727722168,
"rewards/rejected": -4.803337574005127,
"step": 622
},
{
"epoch": 1.4401846390562894,
"grad_norm": 17.679599039225316,
"learning_rate": 1.0990472801098419e-07,
"logits/chosen": -1.1729627847671509,
"logits/rejected": -1.1764029264450073,
"logps/chosen": -45.59742736816406,
"logps/rejected": -67.27173614501953,
"loss": 0.1504,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.4040929973125458,
"rewards/margins": 4.669921398162842,
"rewards/rejected": -4.265828609466553,
"step": 624
},
{
"epoch": 1.444800615463521,
"grad_norm": 21.626789388307476,
"learning_rate": 1.0823922821364795e-07,
"logits/chosen": -1.0937749147415161,
"logits/rejected": -1.0970505475997925,
"logps/chosen": -57.45598602294922,
"logps/rejected": -64.92636108398438,
"loss": 0.1922,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.20599400997161865,
"rewards/margins": 4.274693489074707,
"rewards/rejected": -4.068699836730957,
"step": 626
},
{
"epoch": 1.4494165918707527,
"grad_norm": 22.846453164687816,
"learning_rate": 1.0658295070124026e-07,
"logits/chosen": -1.1624855995178223,
"logits/rejected": -1.1643033027648926,
"logps/chosen": -54.79329299926758,
"logps/rejected": -61.6580810546875,
"loss": 0.2111,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.43274542689323425,
"rewards/margins": 4.024546146392822,
"rewards/rejected": -3.5918006896972656,
"step": 628
},
{
"epoch": 1.4540325682779844,
"grad_norm": 35.15148920852289,
"learning_rate": 1.0493600322333762e-07,
"logits/chosen": -1.1498773097991943,
"logits/rejected": -1.1634445190429688,
"logps/chosen": -50.85679626464844,
"logps/rejected": -84.39616394042969,
"loss": 0.167,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.023858733475208282,
"rewards/margins": 6.123683929443359,
"rewards/rejected": -6.147542953491211,
"step": 630
},
{
"epoch": 1.458648544685216,
"grad_norm": 30.044775332799237,
"learning_rate": 1.0329849292254883e-07,
"logits/chosen": -1.0466785430908203,
"logits/rejected": -1.0532869100570679,
"logps/chosen": -50.80934143066406,
"logps/rejected": -70.91539001464844,
"loss": 0.2205,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.08874227106571198,
"rewards/margins": 4.631314277648926,
"rewards/rejected": -4.542571544647217,
"step": 632
},
{
"epoch": 1.4632645210924478,
"grad_norm": 27.395602840016966,
"learning_rate": 1.0167052632754458e-07,
"logits/chosen": -1.1735262870788574,
"logits/rejected": -1.1718860864639282,
"logps/chosen": -46.975406646728516,
"logps/rejected": -56.96874237060547,
"loss": 0.2108,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.012903611175715923,
"rewards/margins": 3.0641860961914062,
"rewards/rejected": -3.0512828826904297,
"step": 634
},
{
"epoch": 1.4678804974996795,
"grad_norm": 19.703963008444365,
"learning_rate": 1.0005220934612713e-07,
"logits/chosen": -1.0792059898376465,
"logits/rejected": -1.0779961347579956,
"logps/chosen": -54.955718994140625,
"logps/rejected": -60.88299560546875,
"loss": 0.1733,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.10138123482465744,
"rewards/margins": 3.779219627380371,
"rewards/rejected": -3.6778385639190674,
"step": 636
},
{
"epoch": 1.472496473906911,
"grad_norm": 26.332520125277515,
"learning_rate": 9.844364725834056e-08,
"logits/chosen": -1.1881197690963745,
"logits/rejected": -1.201986312866211,
"logps/chosen": -51.59233093261719,
"logps/rejected": -88.54263305664062,
"loss": 0.1426,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.19230936467647552,
"rewards/margins": 6.799825668334961,
"rewards/rejected": -6.607515811920166,
"step": 638
},
{
"epoch": 1.4771124503141428,
"grad_norm": 28.45092893361144,
"learning_rate": 9.68449447096217e-08,
"logits/chosen": -1.2595468759536743,
"logits/rejected": -1.2635498046875,
"logps/chosen": -45.43921661376953,
"logps/rejected": -59.12704849243164,
"loss": 0.3573,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.18244637548923492,
"rewards/margins": 3.4541211128234863,
"rewards/rejected": -3.271674394607544,
"step": 640
},
{
"epoch": 1.4771124503141428,
"eval_logits/chosen": -1.1004152297973633,
"eval_logits/rejected": -1.1063543558120728,
"eval_logps/chosen": -48.19259262084961,
"eval_logps/rejected": -63.969642639160156,
"eval_loss": 0.23769782483577728,
"eval_rewards/accuracies": 0.8346773982048035,
"eval_rewards/chosen": 0.22041727602481842,
"eval_rewards/margins": 4.15868616104126,
"eval_rewards/rejected": -3.9382688999176025,
"eval_runtime": 227.6304,
"eval_samples_per_second": 7.618,
"eval_steps_per_second": 1.907,
"step": 640
},
{
"epoch": 1.4817284267213746,
"grad_norm": 24.31958981085731,
"learning_rate": 9.525620570399259e-08,
"logits/chosen": -1.1648564338684082,
"logits/rejected": -1.1766667366027832,
"logps/chosen": -50.69769287109375,
"logps/rejected": -73.31265258789062,
"loss": 0.138,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.1818230152130127,
"rewards/margins": 4.770647048950195,
"rewards/rejected": -4.5888237953186035,
"step": 642
},
{
"epoch": 1.4863444031286062,
"grad_norm": 16.147422378148644,
"learning_rate": 9.36775335972943e-08,
"logits/chosen": -1.1746997833251953,
"logits/rejected": -1.2103776931762695,
"logps/chosen": -45.329200744628906,
"logps/rejected": -116.44515228271484,
"loss": 0.166,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2069842666387558,
"rewards/margins": 9.259431838989258,
"rewards/rejected": -9.052447319030762,
"step": 644
},
{
"epoch": 1.490960379535838,
"grad_norm": 26.623831420383585,
"learning_rate": 9.210903109046284e-08,
"logits/chosen": -1.186785340309143,
"logits/rejected": -1.1978414058685303,
"logps/chosen": -49.64828872680664,
"logps/rejected": -75.10488891601562,
"loss": 0.1953,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.27270856499671936,
"rewards/margins": 5.708976745605469,
"rewards/rejected": -5.981686115264893,
"step": 646
},
{
"epoch": 1.4955763559430697,
"grad_norm": 20.261642357856545,
"learning_rate": 9.05508002228485e-08,
"logits/chosen": -1.1485164165496826,
"logits/rejected": -1.1560351848602295,
"logps/chosen": -43.76812744140625,
"logps/rejected": -61.48163604736328,
"loss": 0.2165,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.30232900381088257,
"rewards/margins": 4.651912689208984,
"rewards/rejected": -4.349584579467773,
"step": 648
},
{
"epoch": 1.5001923323503012,
"grad_norm": 16.434029065803497,
"learning_rate": 8.900294236557707e-08,
"logits/chosen": -1.1660001277923584,
"logits/rejected": -1.1689536571502686,
"logps/chosen": -43.022518157958984,
"logps/rejected": -55.89129638671875,
"loss": 0.2104,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.252067506313324,
"rewards/margins": 3.607164144515991,
"rewards/rejected": -3.3550968170166016,
"step": 650
},
{
"epoch": 1.504808308757533,
"grad_norm": 17.153074627752197,
"learning_rate": 8.746555821495561e-08,
"logits/chosen": -1.1337089538574219,
"logits/rejected": -1.149064540863037,
"logps/chosen": -49.65050506591797,
"logps/rejected": -73.29889678955078,
"loss": 0.168,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.3097386956214905,
"rewards/margins": 5.331449031829834,
"rewards/rejected": -5.0217108726501465,
"step": 652
},
{
"epoch": 1.5094242851647648,
"grad_norm": 23.56988440345321,
"learning_rate": 8.593874778592122e-08,
"logits/chosen": -1.1784387826919556,
"logits/rejected": -1.1783757209777832,
"logps/chosen": -43.02362823486328,
"logps/rejected": -56.035400390625,
"loss": 0.1716,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.2998148500919342,
"rewards/margins": 3.706061840057373,
"rewards/rejected": -3.4062466621398926,
"step": 654
},
{
"epoch": 1.5140402615719966,
"grad_norm": 24.44217268761521,
"learning_rate": 8.442261040553472e-08,
"logits/chosen": -1.1735872030258179,
"logits/rejected": -1.1768840551376343,
"logps/chosen": -50.59683609008789,
"logps/rejected": -56.11674880981445,
"loss": 0.1561,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": 0.28288352489471436,
"rewards/margins": 3.881596088409424,
"rewards/rejected": -3.59871244430542,
"step": 656
},
{
"epoch": 1.518656237979228,
"grad_norm": 32.02127603424348,
"learning_rate": 8.291724470651903e-08,
"logits/chosen": -1.1492629051208496,
"logits/rejected": -1.1569753885269165,
"logps/chosen": -51.28215026855469,
"logps/rejected": -65.28805541992188,
"loss": 0.2696,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.23993118107318878,
"rewards/margins": 3.802645444869995,
"rewards/rejected": -4.042576313018799,
"step": 658
},
{
"epoch": 1.5232722143864597,
"grad_norm": 16.1030974632108,
"learning_rate": 8.14227486208423e-08,
"logits/chosen": -1.2377209663391113,
"logits/rejected": -1.2406808137893677,
"logps/chosen": -45.322593688964844,
"logps/rejected": -61.13237762451172,
"loss": 0.1727,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.44821304082870483,
"rewards/margins": 4.551301956176758,
"rewards/rejected": -4.103089332580566,
"step": 660
},
{
"epoch": 1.5232722143864597,
"eval_logits/chosen": -1.106886386871338,
"eval_logits/rejected": -1.1123414039611816,
"eval_logps/chosen": -48.28040313720703,
"eval_logps/rejected": -64.126708984375,
"eval_loss": 0.23655745387077332,
"eval_rewards/accuracies": 0.8346773982048035,
"eval_rewards/chosen": 0.17651285231113434,
"eval_rewards/margins": 4.193314552307129,
"eval_rewards/rejected": -4.016801834106445,
"eval_runtime": 227.3008,
"eval_samples_per_second": 7.629,
"eval_steps_per_second": 1.909,
"step": 660
},
{
"epoch": 1.5278881907936914,
"grad_norm": 26.89012407667588,
"learning_rate": 7.993921937334716e-08,
"logits/chosen": -1.1547244787216187,
"logits/rejected": -1.1549021005630493,
"logps/chosen": -48.07485580444336,
"logps/rejected": -55.9273567199707,
"loss": 0.2437,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.16911821067333221,
"rewards/margins": 3.7287042140960693,
"rewards/rejected": -3.5595860481262207,
"step": 662
},
{
"epoch": 1.5325041672009232,
"grad_norm": 9.277145884274647,
"learning_rate": 7.846675347542578e-08,
"logits/chosen": -1.0987221002578735,
"logits/rejected": -1.0982441902160645,
"logps/chosen": -44.44068908691406,
"logps/rejected": -55.658538818359375,
"loss": 0.1239,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.7777649760246277,
"rewards/margins": 4.145383834838867,
"rewards/rejected": -3.3676185607910156,
"step": 664
},
{
"epoch": 1.537120143608155,
"grad_norm": 23.47472041668218,
"learning_rate": 7.700544671874079e-08,
"logits/chosen": -1.1225018501281738,
"logits/rejected": -1.1206145286560059,
"logps/chosen": -55.324520111083984,
"logps/rejected": -59.72848892211914,
"loss": 0.1971,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.16201752424240112,
"rewards/margins": 3.718522071838379,
"rewards/rejected": -3.556504487991333,
"step": 666
},
{
"epoch": 1.5417361200153867,
"grad_norm": 36.079809765281745,
"learning_rate": 7.555539416899437e-08,
"logits/chosen": -1.1977320909500122,
"logits/rejected": -1.2057994604110718,
"logps/chosen": -42.88581848144531,
"logps/rejected": -62.64973831176758,
"loss": 0.2275,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.011536385864019394,
"rewards/margins": 4.496410369873047,
"rewards/rejected": -4.484873294830322,
"step": 668
},
{
"epoch": 1.5463520964226183,
"grad_norm": 24.671849788858164,
"learning_rate": 7.41166901597429e-08,
"logits/chosen": -1.115236520767212,
"logits/rejected": -1.117641806602478,
"logps/chosen": -48.77861785888672,
"logps/rejected": -63.02436065673828,
"loss": 0.1757,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.31542646884918213,
"rewards/margins": 4.368900775909424,
"rewards/rejected": -4.053474426269531,
"step": 670
},
{
"epoch": 1.5509680728298498,
"grad_norm": 22.124151348706462,
"learning_rate": 7.268942828626046e-08,
"logits/chosen": -1.201170802116394,
"logits/rejected": -1.2061718702316284,
"logps/chosen": -46.17606735229492,
"logps/rejected": -60.99459457397461,
"loss": 0.2017,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.4047514796257019,
"rewards/margins": 4.501495361328125,
"rewards/rejected": -4.096743583679199,
"step": 672
},
{
"epoch": 1.5555840492370816,
"grad_norm": 24.405076004524442,
"learning_rate": 7.127370139945018e-08,
"logits/chosen": -1.1625523567199707,
"logits/rejected": -1.1654243469238281,
"logps/chosen": -48.26844024658203,
"logps/rejected": -65.458740234375,
"loss": 0.1711,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.4592117667198181,
"rewards/margins": 4.522068500518799,
"rewards/rejected": -4.062856674194336,
"step": 674
},
{
"epoch": 1.5602000256443134,
"grad_norm": 18.612456705571446,
"learning_rate": 6.986960159980326e-08,
"logits/chosen": -1.1604725122451782,
"logits/rejected": -1.16167414188385,
"logps/chosen": -50.374324798583984,
"logps/rejected": -61.40808868408203,
"loss": 0.1842,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.5715973377227783,
"rewards/margins": 3.9924135208129883,
"rewards/rejected": -3.42081618309021,
"step": 676
},
{
"epoch": 1.5648160020515451,
"grad_norm": 43.48667538563028,
"learning_rate": 6.847722023140776e-08,
"logits/chosen": -1.2388062477111816,
"logits/rejected": -1.234206199645996,
"logps/chosen": -44.29112243652344,
"logps/rejected": -51.78242492675781,
"loss": 0.2001,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.6832877993583679,
"rewards/margins": 3.618730306625366,
"rewards/rejected": -2.9354422092437744,
"step": 678
},
{
"epoch": 1.569431978458777,
"grad_norm": 28.795198035373833,
"learning_rate": 6.709664787600616e-08,
"logits/chosen": -1.209147572517395,
"logits/rejected": -1.2093759775161743,
"logps/chosen": -42.70883560180664,
"logps/rejected": -50.379886627197266,
"loss": 0.2779,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.21883456408977509,
"rewards/margins": 2.795126438140869,
"rewards/rejected": -2.576291799545288,
"step": 680
},
{
"epoch": 1.569431978458777,
"eval_logits/chosen": -1.118242621421814,
"eval_logits/rejected": -1.1229368448257446,
"eval_logps/chosen": -47.97200012207031,
"eval_logps/rejected": -63.929046630859375,
"eval_loss": 0.2367607206106186,
"eval_rewards/accuracies": 0.8312212228775024,
"eval_rewards/chosen": 0.33071503043174744,
"eval_rewards/margins": 4.248687744140625,
"eval_rewards/rejected": -3.9179720878601074,
"eval_runtime": 227.4731,
"eval_samples_per_second": 7.623,
"eval_steps_per_second": 1.908,
"step": 680
},
{
"epoch": 1.5740479548660085,
"grad_norm": 22.618944291226985,
"learning_rate": 6.572797434710219e-08,
"logits/chosen": -1.2124552726745605,
"logits/rejected": -1.2253483533859253,
"logps/chosen": -45.591331481933594,
"logps/rejected": -77.16690063476562,
"loss": 0.1948,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.740224301815033,
"rewards/margins": 5.656050682067871,
"rewards/rejected": -4.91582727432251,
"step": 682
},
{
"epoch": 1.57866393127324,
"grad_norm": 17.206603905842275,
"learning_rate": 6.437128868411856e-08,
"logits/chosen": -1.1554473638534546,
"logits/rejected": -1.1552696228027344,
"logps/chosen": -45.49040222167969,
"logps/rejected": -53.40934753417969,
"loss": 0.2204,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.516470193862915,
"rewards/margins": 3.7008235454559326,
"rewards/rejected": -3.1843535900115967,
"step": 684
},
{
"epoch": 1.5832799076804718,
"grad_norm": 14.245504042199357,
"learning_rate": 6.302667914660384e-08,
"logits/chosen": -1.1610139608383179,
"logits/rejected": -1.1691855192184448,
"logps/chosen": -41.98570251464844,
"logps/rejected": -61.45694351196289,
"loss": 0.2166,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.34538733959198,
"rewards/margins": 4.076152324676514,
"rewards/rejected": -3.7307653427124023,
"step": 686
},
{
"epoch": 1.5878958840877035,
"grad_norm": 25.659568066978697,
"learning_rate": 6.169423320849112e-08,
"logits/chosen": -1.139572024345398,
"logits/rejected": -1.1325372457504272,
"logps/chosen": -52.09668731689453,
"logps/rejected": -52.88841247558594,
"loss": 0.1753,
"rewards/accuracies": 0.9583333134651184,
"rewards/chosen": 0.5496838092803955,
"rewards/margins": 3.8007943630218506,
"rewards/rejected": -3.251110553741455,
"step": 688
},
{
"epoch": 1.5925118604949353,
"grad_norm": 23.501661963559606,
"learning_rate": 6.037403755240748e-08,
"logits/chosen": -1.1798467636108398,
"logits/rejected": -1.1862027645111084,
"logps/chosen": -51.716487884521484,
"logps/rejected": -67.60077667236328,
"loss": 0.175,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.23453055322170258,
"rewards/margins": 4.55530309677124,
"rewards/rejected": -4.320772171020508,
"step": 690
},
{
"epoch": 1.597127836902167,
"grad_norm": 17.275740083868094,
"learning_rate": 5.9066178064034326e-08,
"logits/chosen": -1.1528249979019165,
"logits/rejected": -1.1694761514663696,
"logps/chosen": -38.17667770385742,
"logps/rejected": -83.85364532470703,
"loss": 0.2264,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.28412505984306335,
"rewards/margins": 6.359869003295898,
"rewards/rejected": -6.075743675231934,
"step": 692
},
{
"epoch": 1.6017438133093986,
"grad_norm": 26.873806928959727,
"learning_rate": 5.777073982652064e-08,
"logits/chosen": -1.1307650804519653,
"logits/rejected": -1.132928490638733,
"logps/chosen": -41.0783576965332,
"logps/rejected": -60.499267578125,
"loss": 0.2291,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.24884945154190063,
"rewards/margins": 4.286365509033203,
"rewards/rejected": -4.037516117095947,
"step": 694
},
{
"epoch": 1.6063597897166302,
"grad_norm": 32.76563481878735,
"learning_rate": 5.6487807114947325e-08,
"logits/chosen": -1.133086085319519,
"logits/rejected": -1.149233102798462,
"logps/chosen": -49.21199035644531,
"logps/rejected": -82.91252899169922,
"loss": 0.1834,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.2540854215621948,
"rewards/margins": 5.426255226135254,
"rewards/rejected": -5.172169208526611,
"step": 696
},
{
"epoch": 1.610975766123862,
"grad_norm": 30.041135081079663,
"learning_rate": 5.521746339084532e-08,
"logits/chosen": -1.1215559244155884,
"logits/rejected": -1.130250334739685,
"logps/chosen": -54.02631759643555,
"logps/rejected": -67.99839782714844,
"loss": 0.2581,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.2139756679534912,
"rewards/margins": 4.321435928344727,
"rewards/rejected": -4.107460021972656,
"step": 698
},
{
"epoch": 1.6155917425310937,
"grad_norm": 21.854488700327504,
"learning_rate": 5.39597912967652e-08,
"logits/chosen": -1.131272315979004,
"logits/rejected": -1.1449000835418701,
"logps/chosen": -45.570159912109375,
"logps/rejected": -73.73873901367188,
"loss": 0.1902,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.329825758934021,
"rewards/margins": 5.084650039672852,
"rewards/rejected": -4.754825115203857,
"step": 700
},
{
"epoch": 1.6155917425310937,
"eval_logits/chosen": -1.1056500673294067,
"eval_logits/rejected": -1.1112511157989502,
"eval_logps/chosen": -48.541831970214844,
"eval_logps/rejected": -64.42839050292969,
"eval_loss": 0.23584917187690735,
"eval_rewards/accuracies": 0.8323732614517212,
"eval_rewards/chosen": 0.04579799994826317,
"eval_rewards/margins": 4.213436603546143,
"eval_rewards/rejected": -4.167638778686523,
"eval_runtime": 227.3674,
"eval_samples_per_second": 7.626,
"eval_steps_per_second": 1.909,
"step": 700
}
],
"logging_steps": 2,
"max_steps": 866,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}