diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,8389 @@
+{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9913544668587897, + "eval_steps": 173, + "global_step": 519, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0038424591738712775, + "grad_norm": 201.0, + "learning_rate": 0.0, + "logits/chosen": -3.849609375, + "logits/rejected": -3.916015625, + "logps/chosen": -331.75, + "logps/rejected": -160.3125, + "loss": 1.998, + "nll_loss": 1.03076171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.007684918347742555, + "grad_norm": 552.0, + "learning_rate": 1.923076923076923e-08, + "logits/chosen": -3.83984375, + "logits/rejected": -3.849609375, + "logps/chosen": -375.5, + "logps/rejected": -239.6875, + "loss": 2.002, + "nll_loss": 1.03271484375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.011527377521613832, + "grad_norm": 984.0, + "learning_rate": 3.846153846153846e-08, + "logits/chosen": -3.7890625, + "logits/rejected": -3.87890625, + "logps/chosen": -531.125, + "logps/rejected": -266.25, + "loss": 1.918, + "nll_loss": 0.93359375, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.05517578125, + "rewards/margins": -0.00506591796875, + "rewards/rejected": 0.060302734375, + "step": 3 + }, + { + "epoch": 0.01536983669548511, + "grad_norm": 6496.0, + "learning_rate": 5.7692307692307695e-08, + "logits/chosen": -3.890625, + "logits/rejected": -3.974609375, + "logps/chosen": -466.0, + "logps/rejected": -251.625, + "loss": 1.958, + "nll_loss": 0.90087890625, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.31854248046875, + "rewards/margins": -0.2250823974609375, + "rewards/rejected": -0.0929412841796875, + "step": 4 + }, + { + "epoch": 0.01921229586935639, + "grad_norm": 370.0, + "learning_rate": 7.692307692307692e-08, + "logits/chosen": -3.88671875, + "logits/rejected": -3.91796875, + "logps/chosen": -429.25, + "logps/rejected": -275.375, + "loss": 1.8857, + "nll_loss": 0.9248046875, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1102294921875, + "rewards/margins": 0.070526123046875, + "rewards/rejected": 0.03985595703125, + "step": 5 + }, + { + "epoch": 0.023054755043227664, + "grad_norm": 356.0, + "learning_rate": 9.615384615384616e-08, + "logits/chosen": -3.830078125, + "logits/rejected": -3.861328125, + "logps/chosen": -415.0, + "logps/rejected": -236.875, + "loss": 2.0488, + "nll_loss": 1.05712890625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.07568359375, + "rewards/margins": -0.044677734375, + "rewards/rejected": -0.03106689453125, + "step": 6 + }, + { + "epoch": 0.026897214217098942, + "grad_norm": 464.0, + "learning_rate": 1.1538461538461539e-07, + "logits/chosen": -3.81640625, + "logits/rejected": -3.90625, + "logps/chosen": -433.0, + "logps/rejected": -237.0, + "loss": 1.7891, + "nll_loss": 0.83935546875, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.079345703125, + "rewards/margins": 0.11199951171875, + "rewards/rejected": -0.1910400390625, + "step": 7 + }, + { + "epoch": 0.03073967339097022, + "grad_norm": 274.0, + "learning_rate": 1.346153846153846e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.908203125, + "logps/chosen": 
-424.125, + "logps/rejected": -225.6875, + "loss": 1.9102, + "nll_loss": 0.9326171875, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03765869140625, + "rewards/margins": -0.007080078125, + "rewards/rejected": 0.0447845458984375, + "step": 8 + }, + { + "epoch": 0.0345821325648415, + "grad_norm": 262.0, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -3.89453125, + "logits/rejected": -3.916015625, + "logps/chosen": -319.25, + "logps/rejected": -228.0, + "loss": 2.0381, + "nll_loss": 1.0556640625, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.0657958984375, + "rewards/margins": -0.0295257568359375, + "rewards/rejected": -0.0361328125, + "step": 9 + }, + { + "epoch": 0.03842459173871278, + "grad_norm": 844.0, + "learning_rate": 1.7307692307692305e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.76953125, + "logps/chosen": -418.25, + "logps/rejected": -229.4375, + "loss": 1.7871, + "nll_loss": 0.8037109375, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.020751953125, + "rewards/margins": -0.005279541015625, + "rewards/rejected": 0.025634765625, + "step": 10 + }, + { + "epoch": 0.04226705091258405, + "grad_norm": 660.0, + "learning_rate": 1.9230769230769231e-07, + "logits/chosen": -3.849609375, + "logits/rejected": -3.916015625, + "logps/chosen": -482.625, + "logps/rejected": -236.5625, + "loss": 1.9746, + "nll_loss": 1.0166015625, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.08978271484375, + "rewards/margins": 0.062255859375, + "rewards/rejected": 0.02752685546875, + "step": 11 + }, + { + "epoch": 0.04610951008645533, + "grad_norm": 294.0, + "learning_rate": 2.1153846153846152e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.951171875, + "logps/chosen": -485.75, + "logps/rejected": -241.5, + "loss": 1.876, + "nll_loss": 0.908203125, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.075927734375, + "rewards/margins": 0.03619384765625, + "rewards/rejected": 0.039642333984375, + "step": 12 + }, + { + "epoch": 0.049951969260326606, + "grad_norm": 406.0, + "learning_rate": 2.3076923076923078e-07, + "logits/chosen": -3.865234375, + "logits/rejected": -3.896484375, + "logps/chosen": -420.875, + "logps/rejected": -273.875, + "loss": 1.7861, + "nll_loss": 0.76904296875, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.12744140625, + "rewards/margins": -0.10357666015625, + "rewards/rejected": -0.0240478515625, + "step": 13 + }, + { + "epoch": 0.053794428434197884, + "grad_norm": 344.0, + "learning_rate": 2.5e-07, + "logits/chosen": -3.86328125, + "logits/rejected": -3.919921875, + "logps/chosen": -423.0, + "logps/rejected": -220.0, + "loss": 1.9404, + "nll_loss": 0.93115234375, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.072509765625, + "rewards/margins": -0.134765625, + "rewards/rejected": 0.062103271484375, + "step": 14 + }, + { + "epoch": 0.05763688760806916, + "grad_norm": 450.0, + "learning_rate": 2.692307692307692e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.865234375, + "logps/chosen": -510.5, + "logps/rejected": -235.375, + "loss": 1.8848, + "nll_loss": 0.83935546875, + "rewards/accuracies": 0.21875, + "rewards/chosen": -0.23681640625, + "rewards/margins": -0.24786376953125, + "rewards/rejected": 0.010345458984375, + "step": 15 + }, + { + "epoch": 0.06147934678194044, + "grad_norm": 350.0, + "learning_rate": 2.884615384615384e-07, + "logits/chosen": -3.859375, + "logits/rejected": -3.88671875, + "logps/chosen": -424.625, + "logps/rejected": -245.625, + "loss": 2.1211, + "nll_loss": 
1.111572265625, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.001739501953125, + "rewards/margins": -0.14471435546875, + "rewards/rejected": 0.1431427001953125, + "step": 16 + }, + { + "epoch": 0.06532180595581172, + "grad_norm": 652.0, + "learning_rate": 3.076923076923077e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.845703125, + "logps/chosen": -433.25, + "logps/rejected": -224.25, + "loss": 1.9551, + "nll_loss": 0.95361328125, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.075927734375, + "rewards/margins": -0.0660400390625, + "rewards/rejected": -0.0103759765625, + "step": 17 + }, + { + "epoch": 0.069164265129683, + "grad_norm": 712.0, + "learning_rate": 3.269230769230769e-07, + "logits/chosen": -3.8359375, + "logits/rejected": -3.873046875, + "logps/chosen": -456.625, + "logps/rejected": -248.0625, + "loss": 1.918, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0758056640625, + "rewards/margins": 0.027679443359375, + "rewards/rejected": 0.048309326171875, + "step": 18 + }, + { + "epoch": 0.07300672430355427, + "grad_norm": 496.0, + "learning_rate": 3.461538461538461e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.90625, + "logps/chosen": -441.0, + "logps/rejected": -268.75, + "loss": 1.8633, + "nll_loss": 0.91162109375, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.02435302734375, + "rewards/margins": 0.09130859375, + "rewards/rejected": -0.0675048828125, + "step": 19 + }, + { + "epoch": 0.07684918347742556, + "grad_norm": 260.0, + "learning_rate": 3.6538461538461534e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.8828125, + "logps/chosen": -429.625, + "logps/rejected": -198.125, + "loss": 1.9121, + "nll_loss": 0.978515625, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.072509765625, + "rewards/margins": 0.1571044921875, + "rewards/rejected": -0.084503173828125, + "step": 20 + }, + { + "epoch": 0.08069164265129683, + "grad_norm": 500.0, + "learning_rate": 3.8461538461538463e-07, + "logits/chosen": -3.810546875, + "logits/rejected": -3.94140625, + "logps/chosen": -469.875, + "logps/rejected": -213.25, + "loss": 1.8916, + "nll_loss": 0.9443359375, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0640411376953125, + "rewards/margins": 0.11895751953125, + "rewards/rejected": -0.18243408203125, + "step": 21 + }, + { + "epoch": 0.0845341018251681, + "grad_norm": 402.0, + "learning_rate": 4.0384615384615386e-07, + "logits/chosen": -3.873046875, + "logits/rejected": -3.8984375, + "logps/chosen": -452.625, + "logps/rejected": -239.625, + "loss": 1.9355, + "nll_loss": 0.96923828125, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04156494140625, + "rewards/margins": 0.04156494140625, + "rewards/rejected": -0.00048828125, + "step": 22 + }, + { + "epoch": 0.08837656099903939, + "grad_norm": 284.0, + "learning_rate": 4.2307692307692304e-07, + "logits/chosen": -3.705078125, + "logits/rejected": -3.82421875, + "logps/chosen": -423.875, + "logps/rejected": -177.375, + "loss": 1.9336, + "nll_loss": 0.93896484375, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1396484375, + "rewards/margins": -0.009246826171875, + "rewards/rejected": 0.1484375, + "step": 23 + }, + { + "epoch": 0.09221902017291066, + "grad_norm": 255.0, + "learning_rate": 4.423076923076923e-07, + "logits/chosen": -3.865234375, + "logits/rejected": -3.919921875, + "logps/chosen": -422.25, + "logps/rejected": -182.9375, + "loss": 2.0938, + "nll_loss": 1.12353515625, + "rewards/accuracies": 0.375, + "rewards/chosen": 
0.127349853515625, + "rewards/margins": 0.0706329345703125, + "rewards/rejected": 0.056884765625, + "step": 24 + }, + { + "epoch": 0.09606147934678194, + "grad_norm": 824.0, + "learning_rate": 4.6153846153846156e-07, + "logits/chosen": -3.78125, + "logits/rejected": -3.81640625, + "logps/chosen": -460.5, + "logps/rejected": -213.125, + "loss": 1.9287, + "nll_loss": 0.91748046875, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.000244140625, + "rewards/margins": -0.02911376953125, + "rewards/rejected": 0.029205322265625, + "step": 25 + }, + { + "epoch": 0.09990393852065321, + "grad_norm": 318.0, + "learning_rate": 4.807692307692307e-07, + "logits/chosen": -3.890625, + "logits/rejected": -3.912109375, + "logps/chosen": -542.25, + "logps/rejected": -294.75, + "loss": 1.9258, + "nll_loss": 1.013671875, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2484130859375, + "rewards/margins": 0.26861572265625, + "rewards/rejected": -0.0208740234375, + "step": 26 + }, + { + "epoch": 0.1037463976945245, + "grad_norm": 430.0, + "learning_rate": 5e-07, + "logits/chosen": -3.828125, + "logits/rejected": -3.939453125, + "logps/chosen": -460.25, + "logps/rejected": -222.375, + "loss": 1.9092, + "nll_loss": 0.94140625, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.120635986328125, + "rewards/margins": 0.02752685546875, + "rewards/rejected": 0.0932769775390625, + "step": 27 + }, + { + "epoch": 0.10758885686839577, + "grad_norm": 216.0, + "learning_rate": 5.192307692307692e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.80078125, + "logps/chosen": -400.5, + "logps/rejected": -222.75, + "loss": 1.834, + "nll_loss": 0.8955078125, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.14508056640625, + "rewards/margins": 0.10870361328125, + "rewards/rejected": 0.03643798828125, + "step": 28 + }, + { + "epoch": 0.11143131604226705, + "grad_norm": 362.0, + "learning_rate": 5.384615384615384e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.88671875, + "logps/chosen": -394.25, + "logps/rejected": -207.625, + "loss": 1.7959, + "nll_loss": 0.8515625, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1758270263671875, + "rewards/margins": 0.11029052734375, + "rewards/rejected": 0.0654296875, + "step": 29 + }, + { + "epoch": 0.11527377521613832, + "grad_norm": 300.0, + "learning_rate": 5.576923076923077e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.865234375, + "logps/chosen": -437.25, + "logps/rejected": -228.875, + "loss": 1.7559, + "nll_loss": 0.794921875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.1015625, + "rewards/margins": 0.09283447265625, + "rewards/rejected": 0.0085296630859375, + "step": 30 + }, + { + "epoch": 0.11911623439000961, + "grad_norm": 348.0, + "learning_rate": 5.769230769230768e-07, + "logits/chosen": -3.953125, + "logits/rejected": -3.982421875, + "logps/chosen": -377.875, + "logps/rejected": -225.5625, + "loss": 1.9229, + "nll_loss": 0.97314453125, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2431640625, + "rewards/margins": 0.094879150390625, + "rewards/rejected": 0.148651123046875, + "step": 31 + }, + { + "epoch": 0.12295869356388088, + "grad_norm": 316.0, + "learning_rate": 5.961538461538461e-07, + "logits/chosen": -3.841796875, + "logits/rejected": -3.94140625, + "logps/chosen": -485.5, + "logps/rejected": -237.625, + "loss": 1.8027, + "nll_loss": 0.87060546875, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.20697021484375, + "rewards/margins": 0.1358642578125, + "rewards/rejected": 0.070709228515625, + "step": 
32 + }, + { + "epoch": 0.12680115273775217, + "grad_norm": 356.0, + "learning_rate": 6.153846153846154e-07, + "logits/chosen": -3.8984375, + "logits/rejected": -3.93359375, + "logps/chosen": -491.75, + "logps/rejected": -238.0, + "loss": 1.8301, + "nll_loss": 0.85791015625, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.098388671875, + "rewards/margins": 0.00518798828125, + "rewards/rejected": 0.093170166015625, + "step": 33 + }, + { + "epoch": 0.13064361191162344, + "grad_norm": 568.0, + "learning_rate": 6.346153846153845e-07, + "logits/chosen": -3.8671875, + "logits/rejected": -3.935546875, + "logps/chosen": -476.25, + "logps/rejected": -244.125, + "loss": 1.9102, + "nll_loss": 0.9716796875, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.383544921875, + "rewards/margins": 0.195648193359375, + "rewards/rejected": 0.187713623046875, + "step": 34 + }, + { + "epoch": 0.1344860710854947, + "grad_norm": 792.0, + "learning_rate": 6.538461538461538e-07, + "logits/chosen": -3.798828125, + "logits/rejected": -3.7578125, + "logps/chosen": -390.0, + "logps/rejected": -229.875, + "loss": 1.7861, + "nll_loss": 0.79736328125, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.12078857421875, + "rewards/margins": -0.0350341796875, + "rewards/rejected": 0.155364990234375, + "step": 35 + }, + { + "epoch": 0.138328530259366, + "grad_norm": 268.0, + "learning_rate": 6.730769230769231e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.87109375, + "logps/chosen": -525.5, + "logps/rejected": -291.125, + "loss": 1.7363, + "nll_loss": 0.82958984375, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.4359130859375, + "rewards/margins": 0.311248779296875, + "rewards/rejected": 0.124420166015625, + "step": 36 + }, + { + "epoch": 0.14217098943323728, + "grad_norm": 330.0, + "learning_rate": 6.923076923076922e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.94921875, + "logps/chosen": -417.25, + "logps/rejected": -210.0, + "loss": 1.8779, + "nll_loss": 0.94140625, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.2935791015625, + "rewards/margins": 0.176239013671875, + "rewards/rejected": 0.11712646484375, + "step": 37 + }, + { + "epoch": 0.14601344860710855, + "grad_norm": 424.0, + "learning_rate": 7.115384615384616e-07, + "logits/chosen": -3.8515625, + "logits/rejected": -3.84765625, + "logps/chosen": -450.375, + "logps/rejected": -243.5, + "loss": 1.7471, + "nll_loss": 0.834716796875, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.6263427734375, + "rewards/margins": 0.33740234375, + "rewards/rejected": 0.289764404296875, + "step": 38 + }, + { + "epoch": 0.14985590778097982, + "grad_norm": 444.0, + "learning_rate": 7.307692307692307e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.916015625, + "logps/chosen": -401.0, + "logps/rejected": -203.125, + "loss": 1.8438, + "nll_loss": 0.907470703125, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.343109130859375, + "rewards/margins": 0.17431640625, + "rewards/rejected": 0.169036865234375, + "step": 39 + }, + { + "epoch": 0.15369836695485112, + "grad_norm": 6432.0, + "learning_rate": 7.5e-07, + "logits/chosen": -3.828125, + "logits/rejected": -3.927734375, + "logps/chosen": -446.625, + "logps/rejected": -183.875, + "loss": 1.8359, + "nll_loss": 0.865478515625, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.12066650390625, + "rewards/margins": 0.0828857421875, + "rewards/rejected": 0.03790283203125, + "step": 40 + }, + { + "epoch": 0.1575408261287224, + "grad_norm": 256.0, + "learning_rate": 
7.692307692307693e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.93359375, + "logps/chosen": -499.0, + "logps/rejected": -236.625, + "loss": 1.8291, + "nll_loss": 0.90771484375, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.394775390625, + "rewards/margins": 0.2154541015625, + "rewards/rejected": 0.1793212890625, + "step": 41 + }, + { + "epoch": 0.16138328530259366, + "grad_norm": 680.0, + "learning_rate": 7.884615384615384e-07, + "logits/chosen": -3.798828125, + "logits/rejected": -3.80859375, + "logps/chosen": -465.5, + "logps/rejected": -310.875, + "loss": 1.8613, + "nll_loss": 0.83837890625, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4775390625, + "rewards/margins": -0.01446533203125, + "rewards/rejected": 0.49249267578125, + "step": 42 + }, + { + "epoch": 0.16522574447646493, + "grad_norm": 211.0, + "learning_rate": 8.076923076923077e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.8203125, + "logps/chosen": -378.25, + "logps/rejected": -229.0, + "loss": 1.8418, + "nll_loss": 0.9267578125, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.384033203125, + "rewards/margins": 0.2515869140625, + "rewards/rejected": 0.132049560546875, + "step": 43 + }, + { + "epoch": 0.1690682036503362, + "grad_norm": 245.0, + "learning_rate": 8.269230769230768e-07, + "logits/chosen": -3.869140625, + "logits/rejected": -3.94140625, + "logps/chosen": -429.5, + "logps/rejected": -245.625, + "loss": 1.9141, + "nll_loss": 1.01123046875, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.4970703125, + "rewards/margins": 0.300994873046875, + "rewards/rejected": 0.196319580078125, + "step": 44 + }, + { + "epoch": 0.1729106628242075, + "grad_norm": 332.0, + "learning_rate": 8.461538461538461e-07, + "logits/chosen": -3.720703125, + "logits/rejected": -3.751953125, + "logps/chosen": -445.0, + "logps/rejected": -230.5, + "loss": 1.6924, + "nll_loss": 0.80322265625, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.4691162109375, + "rewards/margins": 0.35498046875, + "rewards/rejected": 0.11376953125, + "step": 45 + }, + { + "epoch": 0.17675312199807877, + "grad_norm": 872.0, + "learning_rate": 8.653846153846154e-07, + "logits/chosen": -3.8359375, + "logits/rejected": -3.91796875, + "logps/chosen": -362.875, + "logps/rejected": -232.25, + "loss": 1.791, + "nll_loss": 0.8330078125, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.17926025390625, + "rewards/margins": 0.0855712890625, + "rewards/rejected": 0.094085693359375, + "step": 46 + }, + { + "epoch": 0.18059558117195004, + "grad_norm": 214.0, + "learning_rate": 8.846153846153846e-07, + "logits/chosen": -3.845703125, + "logits/rejected": -3.84375, + "logps/chosen": -408.625, + "logps/rejected": -277.9375, + "loss": 1.7949, + "nll_loss": 0.8955078125, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.552734375, + "rewards/margins": 0.309478759765625, + "rewards/rejected": 0.2427978515625, + "step": 47 + }, + { + "epoch": 0.1844380403458213, + "grad_norm": 414.0, + "learning_rate": 9.038461538461538e-07, + "logits/chosen": -3.908203125, + "logits/rejected": -4.05859375, + "logps/chosen": -471.5, + "logps/rejected": -273.75, + "loss": 1.9482, + "nll_loss": 1.03076171875, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.517578125, + "rewards/margins": 0.333740234375, + "rewards/rejected": 0.183837890625, + "step": 48 + }, + { + "epoch": 0.1882804995196926, + "grad_norm": 230.0, + "learning_rate": 9.230769230769231e-07, + "logits/chosen": -3.873046875, + "logits/rejected": -3.873046875, + 
"logps/chosen": -438.75, + "logps/rejected": -196.3125, + "loss": 1.7773, + "nll_loss": 0.91552734375, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.92529296875, + "rewards/margins": 0.56561279296875, + "rewards/rejected": 0.360107421875, + "step": 49 + }, + { + "epoch": 0.19212295869356388, + "grad_norm": 268.0, + "learning_rate": 9.423076923076923e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.90234375, + "logps/chosen": -409.75, + "logps/rejected": -201.375, + "loss": 1.9004, + "nll_loss": 1.02197265625, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.558837890625, + "rewards/margins": 0.357513427734375, + "rewards/rejected": 0.2017822265625, + "step": 50 + }, + { + "epoch": 0.19596541786743515, + "grad_norm": 222.0, + "learning_rate": 9.615384615384615e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.8125, + "logps/chosen": -400.75, + "logps/rejected": -203.5, + "loss": 1.8877, + "nll_loss": 0.97314453125, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.578857421875, + "rewards/margins": 0.289703369140625, + "rewards/rejected": 0.289459228515625, + "step": 51 + }, + { + "epoch": 0.19980787704130643, + "grad_norm": 328.0, + "learning_rate": 9.807692307692306e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.91015625, + "logps/chosen": -535.25, + "logps/rejected": -238.625, + "loss": 1.7109, + "nll_loss": 0.853515625, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.81005859375, + "rewards/margins": 0.5323486328125, + "rewards/rejected": 0.277587890625, + "step": 52 + }, + { + "epoch": 0.20365033621517772, + "grad_norm": 204.0, + "learning_rate": 1e-06, + "logits/chosen": -3.865234375, + "logits/rejected": -3.869140625, + "logps/chosen": -518.0, + "logps/rejected": -265.25, + "loss": 1.7949, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7431640625, + "rewards/margins": 0.526611328125, + "rewards/rejected": 0.2174072265625, + "step": 53 + }, + { + "epoch": 0.207492795389049, + "grad_norm": 157.0, + "learning_rate": 9.978723404255318e-07, + "logits/chosen": -3.8046875, + "logits/rejected": -3.80859375, + "logps/chosen": -365.25, + "logps/rejected": -191.625, + "loss": 1.6094, + "nll_loss": 0.763916015625, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.649169921875, + "rewards/margins": 0.5145263671875, + "rewards/rejected": 0.1346435546875, + "step": 54 + }, + { + "epoch": 0.21133525456292027, + "grad_norm": 330.0, + "learning_rate": 9.957446808510637e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.95703125, + "logps/chosen": -488.375, + "logps/rejected": -217.75, + "loss": 1.7256, + "nll_loss": 0.8583984375, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.7841796875, + "rewards/margins": 0.453125, + "rewards/rejected": 0.3309326171875, + "step": 55 + }, + { + "epoch": 0.21517771373679154, + "grad_norm": 212.0, + "learning_rate": 9.936170212765958e-07, + "logits/chosen": -3.958984375, + "logits/rejected": -3.94140625, + "logps/chosen": -594.0, + "logps/rejected": -280.75, + "loss": 1.7373, + "nll_loss": 0.97705078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.64453125, + "rewards/margins": 1.099853515625, + "rewards/rejected": 0.545318603515625, + "step": 56 + }, + { + "epoch": 0.21902017291066284, + "grad_norm": 306.0, + "learning_rate": 9.914893617021276e-07, + "logits/chosen": -3.7734375, + "logits/rejected": -3.833984375, + "logps/chosen": -504.25, + "logps/rejected": -262.4375, + "loss": 1.7422, + "nll_loss": 0.93359375, + "rewards/accuracies": 0.65625, + 
"rewards/chosen": 1.24462890625, + "rewards/margins": 0.76220703125, + "rewards/rejected": 0.483642578125, + "step": 57 + }, + { + "epoch": 0.2228626320845341, + "grad_norm": 300.0, + "learning_rate": 9.893617021276595e-07, + "logits/chosen": -3.80859375, + "logits/rejected": -3.904296875, + "logps/chosen": -441.75, + "logps/rejected": -294.25, + "loss": 1.8174, + "nll_loss": 0.9765625, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0126953125, + "rewards/margins": 0.60302734375, + "rewards/rejected": 0.4107666015625, + "step": 58 + }, + { + "epoch": 0.22670509125840538, + "grad_norm": 532.0, + "learning_rate": 9.872340425531914e-07, + "logits/chosen": -3.89453125, + "logits/rejected": -3.998046875, + "logps/chosen": -554.5, + "logps/rejected": -272.75, + "loss": 1.7188, + "nll_loss": 0.95068359375, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.412109375, + "rewards/margins": 0.9609375, + "rewards/rejected": 0.449951171875, + "step": 59 + }, + { + "epoch": 0.23054755043227665, + "grad_norm": 246.0, + "learning_rate": 9.851063829787235e-07, + "logits/chosen": -3.787109375, + "logits/rejected": -3.921875, + "logps/chosen": -421.125, + "logps/rejected": -206.125, + "loss": 1.7148, + "nll_loss": 0.9423828125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0615234375, + "rewards/margins": 0.94189453125, + "rewards/rejected": 0.1209716796875, + "step": 60 + }, + { + "epoch": 0.23439000960614795, + "grad_norm": 402.0, + "learning_rate": 9.829787234042553e-07, + "logits/chosen": -3.73046875, + "logits/rejected": -3.8828125, + "logps/chosen": -379.375, + "logps/rejected": -232.75, + "loss": 1.7725, + "nll_loss": 0.916015625, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9814453125, + "rewards/margins": 0.55731201171875, + "rewards/rejected": 0.4244384765625, + "step": 61 + }, + { + "epoch": 0.23823246878001922, + "grad_norm": 225.0, + "learning_rate": 9.808510638297872e-07, + "logits/chosen": -3.775390625, + "logits/rejected": -3.865234375, + "logps/chosen": -482.875, + "logps/rejected": -251.5, + "loss": 1.8809, + "nll_loss": 1.0751953125, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.24072265625, + "rewards/margins": 0.7862548828125, + "rewards/rejected": 0.454345703125, + "step": 62 + }, + { + "epoch": 0.2420749279538905, + "grad_norm": 194.0, + "learning_rate": 9.78723404255319e-07, + "logits/chosen": -3.83984375, + "logits/rejected": -3.86328125, + "logps/chosen": -449.0, + "logps/rejected": -204.6875, + "loss": 1.6621, + "nll_loss": 0.8916015625, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.21044921875, + "rewards/margins": 0.93170166015625, + "rewards/rejected": 0.2799072265625, + "step": 63 + }, + { + "epoch": 0.24591738712776176, + "grad_norm": 648.0, + "learning_rate": 9.765957446808511e-07, + "logits/chosen": -3.8828125, + "logits/rejected": -3.951171875, + "logps/chosen": -424.75, + "logps/rejected": -255.375, + "loss": 1.6562, + "nll_loss": 0.80712890625, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.93212890625, + "rewards/margins": 0.546630859375, + "rewards/rejected": 0.38623046875, + "step": 64 + }, + { + "epoch": 0.24975984630163303, + "grad_norm": 338.0, + "learning_rate": 9.74468085106383e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.861328125, + "logps/chosen": -459.5, + "logps/rejected": -254.625, + "loss": 1.7275, + "nll_loss": 0.94287109375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.396484375, + "rewards/margins": 0.883819580078125, + "rewards/rejected": 0.514404296875, + "step": 65 + }, + { + 
"epoch": 0.25360230547550433, + "grad_norm": 272.0, + "learning_rate": 9.723404255319149e-07, + "logits/chosen": -3.84765625, + "logits/rejected": -3.833984375, + "logps/chosen": -393.375, + "logps/rejected": -264.625, + "loss": 1.7061, + "nll_loss": 0.92138671875, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.35595703125, + "rewards/margins": 0.9586181640625, + "rewards/rejected": 0.3973388671875, + "step": 66 + }, + { + "epoch": 0.2574447646493756, + "grad_norm": 290.0, + "learning_rate": 9.702127659574467e-07, + "logits/chosen": -3.8046875, + "logits/rejected": -3.904296875, + "logps/chosen": -546.25, + "logps/rejected": -251.375, + "loss": 1.5322, + "nll_loss": 0.8447265625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.0361328125, + "rewards/margins": 1.5693359375, + "rewards/rejected": 0.4716796875, + "step": 67 + }, + { + "epoch": 0.2612872238232469, + "grad_norm": 141.0, + "learning_rate": 9.680851063829786e-07, + "logits/chosen": -3.765625, + "logits/rejected": -3.748046875, + "logps/chosen": -440.5, + "logps/rejected": -211.875, + "loss": 1.6191, + "nll_loss": 0.84814453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.22216796875, + "rewards/margins": 1.02178955078125, + "rewards/rejected": 0.2000732421875, + "step": 68 + }, + { + "epoch": 0.26512968299711814, + "grad_norm": 334.0, + "learning_rate": 9.659574468085105e-07, + "logits/chosen": -3.90234375, + "logits/rejected": -3.9140625, + "logps/chosen": -458.0, + "logps/rejected": -236.125, + "loss": 1.7119, + "nll_loss": 0.91748046875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.617919921875, + "rewards/margins": 0.8857421875, + "rewards/rejected": 0.7354736328125, + "step": 69 + }, + { + "epoch": 0.2689721421709894, + "grad_norm": 446.0, + "learning_rate": 9.638297872340426e-07, + "logits/chosen": -3.794921875, + "logits/rejected": -3.796875, + "logps/chosen": -415.625, + "logps/rejected": -207.375, + "loss": 1.8086, + "nll_loss": 0.98046875, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.29638671875, + "rewards/margins": 0.8109130859375, + "rewards/rejected": 0.486328125, + "step": 70 + }, + { + "epoch": 0.2728146013448607, + "grad_norm": 270.0, + "learning_rate": 9.617021276595744e-07, + "logits/chosen": -3.869140625, + "logits/rejected": -3.9453125, + "logps/chosen": -434.25, + "logps/rejected": -247.8125, + "loss": 1.7041, + "nll_loss": 0.8876953125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.5185546875, + "rewards/margins": 0.822265625, + "rewards/rejected": 0.69708251953125, + "step": 71 + }, + { + "epoch": 0.276657060518732, + "grad_norm": 520.0, + "learning_rate": 9.595744680851063e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.90234375, + "logps/chosen": -479.25, + "logps/rejected": -250.875, + "loss": 1.7227, + "nll_loss": 0.9169921875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.48583984375, + "rewards/margins": 1.148681640625, + "rewards/rejected": 0.33642578125, + "step": 72 + }, + { + "epoch": 0.2804995196926033, + "grad_norm": 223.0, + "learning_rate": 9.574468085106384e-07, + "logits/chosen": -3.859375, + "logits/rejected": -3.896484375, + "logps/chosen": -463.75, + "logps/rejected": -237.9375, + "loss": 1.6895, + "nll_loss": 0.9013671875, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5615234375, + "rewards/margins": 0.919921875, + "rewards/rejected": 0.64312744140625, + "step": 73 + }, + { + "epoch": 0.28434197886647455, + "grad_norm": 170.0, + "learning_rate": 9.553191489361702e-07, + "logits/chosen": -3.701171875, + 
"logits/rejected": -3.8046875, + "logps/chosen": -411.375, + "logps/rejected": -223.25, + "loss": 1.7793, + "nll_loss": 0.978515625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.32763671875, + "rewards/margins": 0.823486328125, + "rewards/rejected": 0.50537109375, + "step": 74 + }, + { + "epoch": 0.2881844380403458, + "grad_norm": 624.0, + "learning_rate": 9.531914893617021e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.8984375, + "logps/chosen": -453.25, + "logps/rejected": -228.5625, + "loss": 1.6973, + "nll_loss": 0.91943359375, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.42236328125, + "rewards/margins": 0.9970703125, + "rewards/rejected": 0.42431640625, + "step": 75 + }, + { + "epoch": 0.2920268972142171, + "grad_norm": 262.0, + "learning_rate": 9.51063829787234e-07, + "logits/chosen": -3.83984375, + "logits/rejected": -3.8515625, + "logps/chosen": -447.5, + "logps/rejected": -230.125, + "loss": 1.6963, + "nll_loss": 0.93115234375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.6591796875, + "rewards/margins": 1.05126953125, + "rewards/rejected": 0.609130859375, + "step": 76 + }, + { + "epoch": 0.29586935638808837, + "grad_norm": 247.0, + "learning_rate": 9.489361702127659e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.888671875, + "logps/chosen": -424.0, + "logps/rejected": -272.0, + "loss": 1.8164, + "nll_loss": 1.01904296875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.693359375, + "rewards/margins": 0.8424072265625, + "rewards/rejected": 0.854736328125, + "step": 77 + }, + { + "epoch": 0.29971181556195964, + "grad_norm": 240.0, + "learning_rate": 9.468085106382978e-07, + "logits/chosen": -3.869140625, + "logits/rejected": -4.013671875, + "logps/chosen": -486.75, + "logps/rejected": -240.75, + "loss": 1.708, + "nll_loss": 1.0048828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4267578125, + "rewards/margins": 1.7763671875, + "rewards/rejected": 0.65087890625, + "step": 78 + }, + { + "epoch": 0.3035542747358309, + "grad_norm": 131.0, + "learning_rate": 9.446808510638298e-07, + "logits/chosen": -3.80078125, + "logits/rejected": -3.88671875, + "logps/chosen": -398.75, + "logps/rejected": -195.625, + "loss": 1.6045, + "nll_loss": 0.86669921875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.63720703125, + "rewards/margins": 1.23681640625, + "rewards/rejected": 0.4012451171875, + "step": 79 + }, + { + "epoch": 0.30739673390970224, + "grad_norm": 233.0, + "learning_rate": 9.425531914893617e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.919921875, + "logps/chosen": -408.25, + "logps/rejected": -239.0, + "loss": 1.6426, + "nll_loss": 0.87939453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8408203125, + "rewards/margins": 1.06689453125, + "rewards/rejected": 0.776123046875, + "step": 80 + }, + { + "epoch": 0.3112391930835735, + "grad_norm": 176.0, + "learning_rate": 9.404255319148936e-07, + "logits/chosen": -3.880859375, + "logits/rejected": -3.9296875, + "logps/chosen": -485.25, + "logps/rejected": -259.125, + "loss": 1.5156, + "nll_loss": 0.84033203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.0283203125, + "rewards/margins": 1.4931640625, + "rewards/rejected": 0.53704833984375, + "step": 81 + }, + { + "epoch": 0.3150816522574448, + "grad_norm": 402.0, + "learning_rate": 9.382978723404255e-07, + "logits/chosen": -3.8359375, + "logits/rejected": -3.923828125, + "logps/chosen": -415.375, + "logps/rejected": -200.125, + "loss": 1.707, + "nll_loss": 0.97998046875, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 1.74658203125, + "rewards/margins": 1.249267578125, + "rewards/rejected": 0.497161865234375, + "step": 82 + }, + { + "epoch": 0.31892411143131605, + "grad_norm": 222.0, + "learning_rate": 9.361702127659575e-07, + "logits/chosen": -3.8671875, + "logits/rejected": -3.87109375, + "logps/chosen": -493.125, + "logps/rejected": -297.0, + "loss": 1.7607, + "nll_loss": 0.9794921875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.927734375, + "rewards/margins": 1.022216796875, + "rewards/rejected": 0.91009521484375, + "step": 83 + }, + { + "epoch": 0.3227665706051873, + "grad_norm": 2992.0, + "learning_rate": 9.340425531914892e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.916015625, + "logps/chosen": -495.5, + "logps/rejected": -232.625, + "loss": 1.5625, + "nll_loss": 0.8759765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7822265625, + "rewards/margins": 1.5244140625, + "rewards/rejected": 0.2606201171875, + "step": 84 + }, + { + "epoch": 0.3266090297790586, + "grad_norm": 240.0, + "learning_rate": 9.319148936170212e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.83203125, + "logps/chosen": -569.75, + "logps/rejected": -259.75, + "loss": 1.5625, + "nll_loss": 0.8759765625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.078125, + "rewards/margins": 1.6181640625, + "rewards/rejected": 0.4600830078125, + "step": 85 + }, + { + "epoch": 0.33045148895292986, + "grad_norm": 324.0, + "learning_rate": 9.297872340425531e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.828125, + "logps/chosen": -386.0, + "logps/rejected": -245.625, + "loss": 1.8457, + "nll_loss": 1.02783203125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.209228515625, + "rewards/margins": 0.81768798828125, + "rewards/rejected": 0.39013671875, + "step": 86 + }, + { + "epoch": 0.33429394812680113, + "grad_norm": 278.0, + "learning_rate": 9.27659574468085e-07, + "logits/chosen": -3.88671875, + "logits/rejected": -3.94140625, + "logps/chosen": -611.75, + "logps/rejected": -295.375, + "loss": 1.5273, + "nll_loss": 0.853515625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8388671875, + "rewards/margins": 1.8564453125, + "rewards/rejected": 0.9833984375, + "step": 87 + }, + { + "epoch": 0.3381364073006724, + "grad_norm": 260.0, + "learning_rate": 9.255319148936169e-07, + "logits/chosen": -3.80078125, + "logits/rejected": -3.822265625, + "logps/chosen": -362.0, + "logps/rejected": -228.0, + "loss": 1.6787, + "nll_loss": 0.87060546875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.53759765625, + "rewards/margins": 0.8037109375, + "rewards/rejected": 0.7355804443359375, + "step": 88 + }, + { + "epoch": 0.34197886647454373, + "grad_norm": 225.0, + "learning_rate": 9.234042553191489e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.8828125, + "logps/chosen": -440.75, + "logps/rejected": -244.75, + "loss": 1.6602, + "nll_loss": 0.92333984375, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.98095703125, + "rewards/margins": 1.285888671875, + "rewards/rejected": 0.6971435546875, + "step": 89 + }, + { + "epoch": 0.345821325648415, + "grad_norm": 178.0, + "learning_rate": 9.212765957446809e-07, + "logits/chosen": -3.841796875, + "logits/rejected": -3.869140625, + "logps/chosen": -425.25, + "logps/rejected": -206.0, + "loss": 1.7734, + "nll_loss": 1.0419921875, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.939453125, + "rewards/margins": 1.39013671875, + "rewards/rejected": 0.552001953125, 
+ "step": 90 + }, + { + "epoch": 0.34966378482228627, + "grad_norm": 832.0, + "learning_rate": 9.191489361702127e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.8828125, + "logps/chosen": -468.0, + "logps/rejected": -242.75, + "loss": 1.7354, + "nll_loss": 1.03955078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5537109375, + "rewards/margins": 1.72607421875, + "rewards/rejected": 0.8270263671875, + "step": 91 + }, + { + "epoch": 0.35350624399615754, + "grad_norm": 195.0, + "learning_rate": 9.170212765957447e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.810546875, + "logps/chosen": -395.875, + "logps/rejected": -225.75, + "loss": 1.5938, + "nll_loss": 0.85400390625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.58544921875, + "rewards/margins": 1.1875, + "rewards/rejected": 0.400390625, + "step": 92 + }, + { + "epoch": 0.3573487031700288, + "grad_norm": 832.0, + "learning_rate": 9.148936170212766e-07, + "logits/chosen": -3.798828125, + "logits/rejected": -3.869140625, + "logps/chosen": -458.75, + "logps/rejected": -206.5625, + "loss": 1.6338, + "nll_loss": 0.88916015625, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.89453125, + "rewards/margins": 1.24462890625, + "rewards/rejected": 0.64971923828125, + "step": 93 + }, + { + "epoch": 0.3611911623439001, + "grad_norm": 153.0, + "learning_rate": 9.127659574468085e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.83203125, + "logps/chosen": -364.875, + "logps/rejected": -184.75, + "loss": 1.4531, + "nll_loss": 0.72216796875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.5322265625, + "rewards/margins": 1.21240234375, + "rewards/rejected": 0.31890869140625, + "step": 94 + }, + { + "epoch": 0.36503362151777136, + "grad_norm": 177.0, + "learning_rate": 9.106382978723404e-07, + "logits/chosen": -3.8671875, + "logits/rejected": -3.890625, + "logps/chosen": -464.375, + "logps/rejected": -212.5, + "loss": 1.6123, + "nll_loss": 0.92822265625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.1669921875, + "rewards/margins": 1.567138671875, + "rewards/rejected": 0.601806640625, + "step": 95 + }, + { + "epoch": 0.3688760806916426, + "grad_norm": 142.0, + "learning_rate": 9.085106382978724e-07, + "logits/chosen": -3.744140625, + "logits/rejected": -3.7578125, + "logps/chosen": -415.5, + "logps/rejected": -202.875, + "loss": 1.5967, + "nll_loss": 0.876953125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.84375, + "rewards/margins": 1.455078125, + "rewards/rejected": 0.3896484375, + "step": 96 + }, + { + "epoch": 0.37271853986551395, + "grad_norm": 688.0, + "learning_rate": 9.063829787234041e-07, + "logits/chosen": -3.81640625, + "logits/rejected": -3.830078125, + "logps/chosen": -421.25, + "logps/rejected": -219.625, + "loss": 1.5859, + "nll_loss": 0.83154296875, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.98876953125, + "rewards/margins": 1.2889404296875, + "rewards/rejected": 0.704345703125, + "step": 97 + }, + { + "epoch": 0.3765609990393852, + "grad_norm": 139.0, + "learning_rate": 9.042553191489361e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.873046875, + "logps/chosen": -358.5, + "logps/rejected": -222.5, + "loss": 1.5762, + "nll_loss": 0.8349609375, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.83935546875, + "rewards/margins": 1.26416015625, + "rewards/rejected": 0.576080322265625, + "step": 98 + }, + { + "epoch": 0.3804034582132565, + "grad_norm": 260.0, + "learning_rate": 9.02127659574468e-07, + "logits/chosen": -3.73046875, + 
"logits/rejected": -3.78125, + "logps/chosen": -516.5, + "logps/rejected": -262.125, + "loss": 1.6494, + "nll_loss": 0.9033203125, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.11328125, + "rewards/margins": 1.32177734375, + "rewards/rejected": 0.79345703125, + "step": 99 + }, + { + "epoch": 0.38424591738712777, + "grad_norm": 206.0, + "learning_rate": 9e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.90234375, + "logps/chosen": -482.5, + "logps/rejected": -189.1875, + "loss": 1.6758, + "nll_loss": 1.04296875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2978515625, + "rewards/margins": 1.8857421875, + "rewards/rejected": 0.412139892578125, + "step": 100 + }, + { + "epoch": 0.38808837656099904, + "grad_norm": 185.0, + "learning_rate": 8.978723404255318e-07, + "logits/chosen": -3.7734375, + "logits/rejected": -3.845703125, + "logps/chosen": -412.75, + "logps/rejected": -194.8125, + "loss": 1.6689, + "nll_loss": 0.98193359375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.828125, + "rewards/margins": 1.462890625, + "rewards/rejected": 0.3673095703125, + "step": 101 + }, + { + "epoch": 0.3919308357348703, + "grad_norm": 280.0, + "learning_rate": 8.957446808510638e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.904296875, + "logps/chosen": -420.25, + "logps/rejected": -193.875, + "loss": 1.6377, + "nll_loss": 0.9443359375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.99169921875, + "rewards/margins": 1.43017578125, + "rewards/rejected": 0.5598297119140625, + "step": 102 + }, + { + "epoch": 0.3957732949087416, + "grad_norm": 253.0, + "learning_rate": 8.936170212765957e-07, + "logits/chosen": -3.78515625, + "logits/rejected": -3.80859375, + "logps/chosen": -433.75, + "logps/rejected": -236.5, + "loss": 1.5654, + "nll_loss": 0.85791015625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.103515625, + "rewards/margins": 1.560546875, + "rewards/rejected": 0.54339599609375, + "step": 103 + }, + { + "epoch": 0.39961575408261285, + "grad_norm": 163.0, + "learning_rate": 8.914893617021276e-07, + "logits/chosen": -3.6953125, + "logits/rejected": -3.7734375, + "logps/chosen": -376.5, + "logps/rejected": -212.5, + "loss": 1.5381, + "nll_loss": 0.79052734375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.59912109375, + "rewards/margins": 1.1611328125, + "rewards/rejected": 0.439605712890625, + "step": 104 + }, + { + "epoch": 0.4034582132564842, + "grad_norm": 260.0, + "learning_rate": 8.893617021276595e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.814453125, + "logps/chosen": -436.125, + "logps/rejected": -217.75, + "loss": 1.5127, + "nll_loss": 0.83203125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3046875, + "rewards/margins": 1.68896484375, + "rewards/rejected": 0.618316650390625, + "step": 105 + }, + { + "epoch": 0.40730067243035545, + "grad_norm": 179.0, + "learning_rate": 8.872340425531915e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.919921875, + "logps/chosen": -437.875, + "logps/rejected": -230.125, + "loss": 1.5703, + "nll_loss": 0.88525390625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.314453125, + "rewards/margins": 1.50537109375, + "rewards/rejected": 0.8076171875, + "step": 106 + }, + { + "epoch": 0.4111431316042267, + "grad_norm": 568.0, + "learning_rate": 8.851063829787234e-07, + "logits/chosen": -3.8046875, + "logits/rejected": -3.83203125, + "logps/chosen": -331.75, + "logps/rejected": -245.0, + "loss": 1.8213, + "nll_loss": 0.9931640625, + 
"rewards/accuracies": 0.71875, + "rewards/chosen": 1.51513671875, + "rewards/margins": 0.80279541015625, + "rewards/rejected": 0.71295166015625, + "step": 107 + }, + { + "epoch": 0.414985590778098, + "grad_norm": 262.0, + "learning_rate": 8.829787234042553e-07, + "logits/chosen": -3.845703125, + "logits/rejected": -3.845703125, + "logps/chosen": -526.5, + "logps/rejected": -339.125, + "loss": 1.6846, + "nll_loss": 0.95263671875, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5693359375, + "rewards/margins": 1.49658203125, + "rewards/rejected": 1.076416015625, + "step": 108 + }, + { + "epoch": 0.41882804995196926, + "grad_norm": 239.0, + "learning_rate": 8.808510638297872e-07, + "logits/chosen": -3.86328125, + "logits/rejected": -3.890625, + "logps/chosen": -380.0, + "logps/rejected": -252.0, + "loss": 1.583, + "nll_loss": 0.806640625, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0732421875, + "rewards/margins": 1.227813720703125, + "rewards/rejected": 0.84814453125, + "step": 109 + }, + { + "epoch": 0.42267050912584053, + "grad_norm": 181.0, + "learning_rate": 8.787234042553191e-07, + "logits/chosen": -3.8203125, + "logits/rejected": -3.84765625, + "logps/chosen": -431.0, + "logps/rejected": -215.5, + "loss": 1.5312, + "nll_loss": 0.83447265625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2900390625, + "rewards/margins": 1.55859375, + "rewards/rejected": 0.7327880859375, + "step": 110 + }, + { + "epoch": 0.4265129682997118, + "grad_norm": 216.0, + "learning_rate": 8.76595744680851e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.787109375, + "logps/chosen": -496.5, + "logps/rejected": -243.0, + "loss": 1.4531, + "nll_loss": 0.82568359375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6123046875, + "rewards/margins": 2.0751953125, + "rewards/rejected": 0.53875732421875, + "step": 111 + }, + { + "epoch": 0.4303554274735831, + "grad_norm": 334.0, + "learning_rate": 8.744680851063829e-07, + "logits/chosen": -3.8359375, + "logits/rejected": -3.85546875, + "logps/chosen": -498.0, + "logps/rejected": -272.125, + "loss": 1.6562, + "nll_loss": 0.93603515625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3095703125, + "rewards/margins": 1.46484375, + "rewards/rejected": 0.842041015625, + "step": 112 + }, + { + "epoch": 0.43419788664745435, + "grad_norm": 156.0, + "learning_rate": 8.723404255319149e-07, + "logits/chosen": -3.86328125, + "logits/rejected": -3.857421875, + "logps/chosen": -418.0, + "logps/rejected": -274.375, + "loss": 1.5752, + "nll_loss": 0.84716796875, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.09619140625, + "rewards/margins": 1.38818359375, + "rewards/rejected": 0.7090301513671875, + "step": 113 + }, + { + "epoch": 0.43804034582132567, + "grad_norm": 272.0, + "learning_rate": 8.702127659574467e-07, + "logits/chosen": -3.7734375, + "logits/rejected": -3.837890625, + "logps/chosen": -408.25, + "logps/rejected": -222.0, + "loss": 1.5449, + "nll_loss": 0.82666015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9853515625, + "rewards/margins": 1.350830078125, + "rewards/rejected": 0.634185791015625, + "step": 114 + }, + { + "epoch": 0.44188280499519694, + "grad_norm": 280.0, + "learning_rate": 8.680851063829787e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.849609375, + "logps/chosen": -415.75, + "logps/rejected": -229.875, + "loss": 1.7617, + "nll_loss": 1.0244140625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1650390625, + "rewards/margins": 1.3291015625, + "rewards/rejected": 
0.83917236328125, + "step": 115 + }, + { + "epoch": 0.4457252641690682, + "grad_norm": 203.0, + "learning_rate": 8.659574468085106e-07, + "logits/chosen": -3.81640625, + "logits/rejected": -3.912109375, + "logps/chosen": -489.75, + "logps/rejected": -260.25, + "loss": 1.5342, + "nll_loss": 0.83984375, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8349609375, + "rewards/margins": 2.1044921875, + "rewards/rejected": 0.730224609375, + "step": 116 + }, + { + "epoch": 0.4495677233429395, + "grad_norm": 221.0, + "learning_rate": 8.638297872340426e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.90625, + "logps/chosen": -452.0, + "logps/rejected": -225.6875, + "loss": 1.6465, + "nll_loss": 1.00146484375, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2255859375, + "rewards/margins": 1.85791015625, + "rewards/rejected": 0.367401123046875, + "step": 117 + }, + { + "epoch": 0.45341018251681076, + "grad_norm": 193.0, + "learning_rate": 8.617021276595744e-07, + "logits/chosen": -3.84375, + "logits/rejected": -3.875, + "logps/chosen": -476.5, + "logps/rejected": -241.375, + "loss": 1.5322, + "nll_loss": 0.88427734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4482421875, + "rewards/margins": 1.818359375, + "rewards/rejected": 0.6304931640625, + "step": 118 + }, + { + "epoch": 0.457252641690682, + "grad_norm": 244.0, + "learning_rate": 8.595744680851064e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.837890625, + "logps/chosen": -415.25, + "logps/rejected": -211.375, + "loss": 1.5801, + "nll_loss": 0.89697265625, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9833984375, + "rewards/margins": 1.55419921875, + "rewards/rejected": 0.43115234375, + "step": 119 + }, + { + "epoch": 0.4610951008645533, + "grad_norm": 156.0, + "learning_rate": 8.574468085106383e-07, + "logits/chosen": -3.775390625, + "logits/rejected": -3.9296875, + "logps/chosen": -435.5, + "logps/rejected": -182.9375, + "loss": 1.6582, + "nll_loss": 1.03955078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.2822265625, + "rewards/margins": 1.94140625, + "rewards/rejected": 0.343353271484375, + "step": 120 + }, + { + "epoch": 0.46493756003842457, + "grad_norm": 130.0, + "learning_rate": 8.553191489361702e-07, + "logits/chosen": -3.857421875, + "logits/rejected": -3.822265625, + "logps/chosen": -374.0, + "logps/rejected": -227.0625, + "loss": 1.5195, + "nll_loss": 0.79638671875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.83447265625, + "rewards/margins": 1.39111328125, + "rewards/rejected": 0.441497802734375, + "step": 121 + }, + { + "epoch": 0.4687800192122959, + "grad_norm": 196.0, + "learning_rate": 8.53191489361702e-07, + "logits/chosen": -3.853515625, + "logits/rejected": -3.833984375, + "logps/chosen": -524.5, + "logps/rejected": -263.875, + "loss": 1.5488, + "nll_loss": 0.900390625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.013671875, + "rewards/margins": 2.142333984375, + "rewards/rejected": 0.869903564453125, + "step": 122 + }, + { + "epoch": 0.47262247838616717, + "grad_norm": 716.0, + "learning_rate": 8.51063829787234e-07, + "logits/chosen": -3.80859375, + "logits/rejected": -3.841796875, + "logps/chosen": -375.75, + "logps/rejected": -198.75, + "loss": 1.6484, + "nll_loss": 0.9248046875, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9375, + "rewards/margins": 1.326171875, + "rewards/rejected": 0.6134033203125, + "step": 123 + }, + { + "epoch": 0.47646493756003844, + "grad_norm": 644.0, + "learning_rate": 8.489361702127658e-07, + 
"logits/chosen": -3.82421875, + "logits/rejected": -3.8359375, + "logps/chosen": -335.0, + "logps/rejected": -193.625, + "loss": 1.6699, + "nll_loss": 0.92333984375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.8984375, + "rewards/margins": 1.21142578125, + "rewards/rejected": 0.6873779296875, + "step": 124 + }, + { + "epoch": 0.4803073967339097, + "grad_norm": 1040.0, + "learning_rate": 8.468085106382978e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.935546875, + "logps/chosen": -372.5, + "logps/rejected": -206.625, + "loss": 1.582, + "nll_loss": 0.8134765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.759521484375, + "rewards/margins": 1.14892578125, + "rewards/rejected": 0.610595703125, + "step": 125 + }, + { + "epoch": 0.484149855907781, + "grad_norm": 306.0, + "learning_rate": 8.446808510638298e-07, + "logits/chosen": -3.841796875, + "logits/rejected": -3.8671875, + "logps/chosen": -513.5, + "logps/rejected": -284.75, + "loss": 1.5312, + "nll_loss": 0.8662109375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8369140625, + "rewards/margins": 1.92578125, + "rewards/rejected": 0.9114990234375, + "step": 126 + }, + { + "epoch": 0.48799231508165225, + "grad_norm": 528.0, + "learning_rate": 8.425531914893617e-07, + "logits/chosen": -3.76171875, + "logits/rejected": -3.802734375, + "logps/chosen": -482.0, + "logps/rejected": -220.875, + "loss": 1.7197, + "nll_loss": 0.92041015625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3447265625, + "rewards/margins": 1.479736328125, + "rewards/rejected": 0.8623046875, + "step": 127 + }, + { + "epoch": 0.4918347742555235, + "grad_norm": 222.0, + "learning_rate": 8.404255319148936e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.865234375, + "logps/chosen": -451.25, + "logps/rejected": -213.0, + "loss": 1.71, + "nll_loss": 1.0576171875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.408203125, + "rewards/margins": 1.86767578125, + "rewards/rejected": 0.5413818359375, + "step": 128 + }, + { + "epoch": 0.4956772334293948, + "grad_norm": 166.0, + "learning_rate": 8.382978723404255e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.84765625, + "logps/chosen": -480.5, + "logps/rejected": -206.0625, + "loss": 1.5322, + "nll_loss": 0.8984375, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5556640625, + "rewards/margins": 2.03125, + "rewards/rejected": 0.525115966796875, + "step": 129 + }, + { + "epoch": 0.49951969260326606, + "grad_norm": 242.0, + "learning_rate": 8.361702127659575e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.81640625, + "logps/chosen": -407.5, + "logps/rejected": -200.375, + "loss": 1.5566, + "nll_loss": 0.8857421875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.3193359375, + "rewards/margins": 1.712158203125, + "rewards/rejected": 0.608642578125, + "step": 130 + }, + { + "epoch": 0.5033621517771374, + "grad_norm": 199.0, + "learning_rate": 8.340425531914893e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.828125, + "logps/chosen": -436.75, + "logps/rejected": -217.75, + "loss": 1.6855, + "nll_loss": 1.021484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.21875, + "rewards/margins": 1.65380859375, + "rewards/rejected": 0.5675048828125, + "step": 131 + }, + { + "epoch": 0.5072046109510087, + "grad_norm": 201.0, + "learning_rate": 8.319148936170213e-07, + "logits/chosen": -3.93359375, + "logits/rejected": -3.87109375, + "logps/chosen": -496.75, + "logps/rejected": -270.875, + "loss": 1.6523, + "nll_loss": 
1.0185546875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2138671875, + "rewards/margins": 2.18994140625, + "rewards/rejected": 1.02911376953125, + "step": 132 + }, + { + "epoch": 0.5110470701248799, + "grad_norm": 356.0, + "learning_rate": 8.297872340425532e-07, + "logits/chosen": -3.87890625, + "logits/rejected": -3.845703125, + "logps/chosen": -429.0, + "logps/rejected": -217.9375, + "loss": 1.7441, + "nll_loss": 1.0126953125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2900390625, + "rewards/margins": 1.745849609375, + "rewards/rejected": 0.5435791015625, + "step": 133 + }, + { + "epoch": 0.5148895292987512, + "grad_norm": 916.0, + "learning_rate": 8.27659574468085e-07, + "logits/chosen": -3.8046875, + "logits/rejected": -3.8046875, + "logps/chosen": -443.75, + "logps/rejected": -265.375, + "loss": 1.5186, + "nll_loss": 0.869384765625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.380859375, + "rewards/margins": 1.884765625, + "rewards/rejected": 0.49267578125, + "step": 134 + }, + { + "epoch": 0.5187319884726225, + "grad_norm": 245.0, + "learning_rate": 8.255319148936169e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.794921875, + "logps/chosen": -478.0, + "logps/rejected": -258.125, + "loss": 1.4658, + "nll_loss": 0.86181640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0078125, + "rewards/margins": 2.462890625, + "rewards/rejected": 0.547607421875, + "step": 135 + }, + { + "epoch": 0.5225744476464937, + "grad_norm": 135.0, + "learning_rate": 8.234042553191489e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.794921875, + "logps/chosen": -411.75, + "logps/rejected": -227.5, + "loss": 1.4697, + "nll_loss": 0.78857421875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4384765625, + "rewards/margins": 1.73291015625, + "rewards/rejected": 0.7071533203125, + "step": 136 + }, + { + "epoch": 0.526416906820365, + "grad_norm": 272.0, + "learning_rate": 8.212765957446808e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.908203125, + "logps/chosen": -428.5, + "logps/rejected": -201.5, + "loss": 1.4756, + "nll_loss": 0.8212890625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.50390625, + "rewards/margins": 1.955810546875, + "rewards/rejected": 0.5482177734375, + "step": 137 + }, + { + "epoch": 0.5302593659942363, + "grad_norm": 362.0, + "learning_rate": 8.191489361702127e-07, + "logits/chosen": -3.7265625, + "logits/rejected": -3.787109375, + "logps/chosen": -477.625, + "logps/rejected": -222.25, + "loss": 1.5947, + "nll_loss": 0.9228515625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.46875, + "rewards/margins": 1.81732177734375, + "rewards/rejected": 0.6524658203125, + "step": 138 + }, + { + "epoch": 0.5341018251681076, + "grad_norm": 136.0, + "learning_rate": 8.170212765957446e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.806640625, + "logps/chosen": -334.0, + "logps/rejected": -194.0, + "loss": 1.6787, + "nll_loss": 0.97802734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7744140625, + "rewards/margins": 1.4150390625, + "rewards/rejected": 0.36053466796875, + "step": 139 + }, + { + "epoch": 0.5379442843419788, + "grad_norm": 326.0, + "learning_rate": 8.148936170212766e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.83984375, + "logps/chosen": -524.0, + "logps/rejected": -260.0, + "loss": 1.4521, + "nll_loss": 0.86328125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.01171875, + "rewards/margins": 2.3251953125, + "rewards/rejected": 
0.69134521484375, + "step": 140 + }, + { + "epoch": 0.5417867435158501, + "grad_norm": 184.0, + "learning_rate": 8.127659574468084e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.771484375, + "logps/chosen": -489.75, + "logps/rejected": -273.625, + "loss": 1.4404, + "nll_loss": 0.81787109375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.955078125, + "rewards/margins": 2.04150390625, + "rewards/rejected": 0.913299560546875, + "step": 141 + }, + { + "epoch": 0.5456292026897214, + "grad_norm": 154.0, + "learning_rate": 8.106382978723404e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.8203125, + "logps/chosen": -298.75, + "logps/rejected": -211.125, + "loss": 1.7109, + "nll_loss": 0.96142578125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.9013671875, + "rewards/margins": 1.1875, + "rewards/rejected": 0.7139892578125, + "step": 142 + }, + { + "epoch": 0.5494716618635928, + "grad_norm": 524.0, + "learning_rate": 8.085106382978723e-07, + "logits/chosen": -3.87109375, + "logits/rejected": -3.95703125, + "logps/chosen": -451.25, + "logps/rejected": -257.875, + "loss": 1.7227, + "nll_loss": 0.9521484375, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.51171875, + "rewards/margins": 1.38330078125, + "rewards/rejected": 1.128662109375, + "step": 143 + }, + { + "epoch": 0.553314121037464, + "grad_norm": 167.0, + "learning_rate": 8.063829787234043e-07, + "logits/chosen": -3.8046875, + "logits/rejected": -3.833984375, + "logps/chosen": -449.375, + "logps/rejected": -251.375, + "loss": 1.541, + "nll_loss": 0.94580078125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7060546875, + "rewards/margins": 2.216796875, + "rewards/rejected": 0.494140625, + "step": 144 + }, + { + "epoch": 0.5571565802113353, + "grad_norm": 219.0, + "learning_rate": 8.042553191489362e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.841796875, + "logps/chosen": -576.0, + "logps/rejected": -244.625, + "loss": 1.4922, + "nll_loss": 0.8291015625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2265625, + "rewards/margins": 2.53125, + "rewards/rejected": 0.6923828125, + "step": 145 + }, + { + "epoch": 0.5609990393852066, + "grad_norm": 193.0, + "learning_rate": 8.02127659574468e-07, + "logits/chosen": -3.7734375, + "logits/rejected": -3.763671875, + "logps/chosen": -420.0, + "logps/rejected": -241.875, + "loss": 1.6504, + "nll_loss": 0.9228515625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7861328125, + "rewards/margins": 1.5780029296875, + "rewards/rejected": 1.208892822265625, + "step": 146 + }, + { + "epoch": 0.5648414985590778, + "grad_norm": 192.0, + "learning_rate": 8e-07, + "logits/chosen": -3.783203125, + "logits/rejected": -3.9140625, + "logps/chosen": -393.5, + "logps/rejected": -205.875, + "loss": 1.5732, + "nll_loss": 0.92822265625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.501953125, + "rewards/margins": 1.74072265625, + "rewards/rejected": 0.760040283203125, + "step": 147 + }, + { + "epoch": 0.5686839577329491, + "grad_norm": 149.0, + "learning_rate": 7.978723404255318e-07, + "logits/chosen": -3.7734375, + "logits/rejected": -3.79296875, + "logps/chosen": -394.75, + "logps/rejected": -230.25, + "loss": 1.5508, + "nll_loss": 0.86376953125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.474609375, + "rewards/margins": 1.82373046875, + "rewards/rejected": 0.6494140625, + "step": 148 + }, + { + "epoch": 0.5725264169068204, + "grad_norm": 235.0, + "learning_rate": 7.957446808510638e-07, + "logits/chosen": -3.7890625, + 
"logits/rejected": -3.74609375, + "logps/chosen": -389.875, + "logps/rejected": -216.875, + "loss": 1.583, + "nll_loss": 0.86181640625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.1279296875, + "rewards/margins": 1.52880859375, + "rewards/rejected": 0.60052490234375, + "step": 149 + }, + { + "epoch": 0.5763688760806917, + "grad_norm": 255.0, + "learning_rate": 7.936170212765957e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.76953125, + "logps/chosen": -428.75, + "logps/rejected": -234.875, + "loss": 1.6641, + "nll_loss": 0.98828125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5654296875, + "rewards/margins": 1.751953125, + "rewards/rejected": 0.81561279296875, + "step": 150 + }, + { + "epoch": 0.5802113352545629, + "grad_norm": 264.0, + "learning_rate": 7.914893617021276e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.97265625, + "logps/chosen": -431.75, + "logps/rejected": -231.0, + "loss": 1.6074, + "nll_loss": 0.9482421875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5546875, + "rewards/margins": 1.7177734375, + "rewards/rejected": 0.83740234375, + "step": 151 + }, + { + "epoch": 0.5840537944284342, + "grad_norm": 207.0, + "learning_rate": 7.893617021276595e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.833984375, + "logps/chosen": -501.25, + "logps/rejected": -279.0, + "loss": 1.4629, + "nll_loss": 0.83984375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1220703125, + "rewards/margins": 2.240234375, + "rewards/rejected": 0.882568359375, + "step": 152 + }, + { + "epoch": 0.5878962536023055, + "grad_norm": 704.0, + "learning_rate": 7.872340425531915e-07, + "logits/chosen": -3.833984375, + "logits/rejected": -3.875, + "logps/chosen": -543.5, + "logps/rejected": -278.5, + "loss": 1.6807, + "nll_loss": 0.96923828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.64501953125, + "rewards/margins": 1.675048828125, + "rewards/rejected": 0.9708251953125, + "step": 153 + }, + { + "epoch": 0.5917387127761767, + "grad_norm": 157.0, + "learning_rate": 7.851063829787234e-07, + "logits/chosen": -3.71875, + "logits/rejected": -3.8359375, + "logps/chosen": -485.625, + "logps/rejected": -243.5, + "loss": 1.5781, + "nll_loss": 0.96435546875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8017578125, + "rewards/margins": 2.04345703125, + "rewards/rejected": 0.7607421875, + "step": 154 + }, + { + "epoch": 0.595581171950048, + "grad_norm": 756.0, + "learning_rate": 7.829787234042553e-07, + "logits/chosen": -3.7578125, + "logits/rejected": -3.80859375, + "logps/chosen": -449.75, + "logps/rejected": -244.0, + "loss": 1.6514, + "nll_loss": 0.81591796875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.001953125, + "rewards/margins": 1.68798828125, + "rewards/rejected": 1.31463623046875, + "step": 155 + }, + { + "epoch": 0.5994236311239193, + "grad_norm": 214.0, + "learning_rate": 7.808510638297872e-07, + "logits/chosen": -3.732421875, + "logits/rejected": -3.837890625, + "logps/chosen": -518.625, + "logps/rejected": -285.25, + "loss": 1.5527, + "nll_loss": 0.91845703125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2080078125, + "rewards/margins": 2.19140625, + "rewards/rejected": 1.0189208984375, + "step": 156 + }, + { + "epoch": 0.6032660902977905, + "grad_norm": 162.0, + "learning_rate": 7.787234042553192e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.849609375, + "logps/chosen": -495.75, + "logps/rejected": -224.5, + "loss": 1.5039, + "nll_loss": 0.8828125, + "rewards/accuracies": 
0.875, + "rewards/chosen": 2.91796875, + "rewards/margins": 2.1611328125, + "rewards/rejected": 0.7611083984375, + "step": 157 + }, + { + "epoch": 0.6071085494716618, + "grad_norm": 280.0, + "learning_rate": 7.765957446808509e-07, + "logits/chosen": -3.73046875, + "logits/rejected": -3.73828125, + "logps/chosen": -422.625, + "logps/rejected": -256.5, + "loss": 1.5732, + "nll_loss": 0.873046875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.63671875, + "rewards/margins": 1.763671875, + "rewards/rejected": 0.87139892578125, + "step": 158 + }, + { + "epoch": 0.6109510086455331, + "grad_norm": 141.0, + "learning_rate": 7.744680851063829e-07, + "logits/chosen": -3.787109375, + "logits/rejected": -3.93359375, + "logps/chosen": -437.75, + "logps/rejected": -230.125, + "loss": 1.5537, + "nll_loss": 0.89453125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.953125, + "rewards/margins": 1.92724609375, + "rewards/rejected": 1.03125, + "step": 159 + }, + { + "epoch": 0.6147934678194045, + "grad_norm": 169.0, + "learning_rate": 7.723404255319148e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.8046875, + "logps/chosen": -481.25, + "logps/rejected": -259.625, + "loss": 1.4082, + "nll_loss": 0.78955078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7470703125, + "rewards/margins": 2.177978515625, + "rewards/rejected": 0.57177734375, + "step": 160 + }, + { + "epoch": 0.6186359269932757, + "grad_norm": 197.0, + "learning_rate": 7.702127659574467e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.75390625, + "logps/chosen": -438.875, + "logps/rejected": -212.4375, + "loss": 1.5586, + "nll_loss": 0.8818359375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.44921875, + "rewards/margins": 1.86083984375, + "rewards/rejected": 0.590545654296875, + "step": 161 + }, + { + "epoch": 0.622478386167147, + "grad_norm": 165.0, + "learning_rate": 7.680851063829787e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.88671875, + "logps/chosen": -457.625, + "logps/rejected": -241.125, + "loss": 1.6582, + "nll_loss": 1.0185546875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.80078125, + "rewards/margins": 2.031982421875, + "rewards/rejected": 0.771240234375, + "step": 162 + }, + { + "epoch": 0.6263208453410183, + "grad_norm": 236.0, + "learning_rate": 7.659574468085106e-07, + "logits/chosen": -3.861328125, + "logits/rejected": -3.84375, + "logps/chosen": -469.0, + "logps/rejected": -246.6875, + "loss": 1.7158, + "nll_loss": 0.99267578125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.564453125, + "rewards/margins": 1.6572265625, + "rewards/rejected": 0.90728759765625, + "step": 163 + }, + { + "epoch": 0.6301633045148896, + "grad_norm": 189.0, + "learning_rate": 7.638297872340426e-07, + "logits/chosen": -3.783203125, + "logits/rejected": -3.826171875, + "logps/chosen": -413.0, + "logps/rejected": -217.5625, + "loss": 1.5645, + "nll_loss": 0.9404296875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8408203125, + "rewards/margins": 2.01220703125, + "rewards/rejected": 0.83270263671875, + "step": 164 + }, + { + "epoch": 0.6340057636887608, + "grad_norm": 205.0, + "learning_rate": 7.617021276595744e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.841796875, + "logps/chosen": -413.5, + "logps/rejected": -237.625, + "loss": 1.5508, + "nll_loss": 0.89404296875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5185546875, + "rewards/margins": 1.78564453125, + "rewards/rejected": 0.734466552734375, + "step": 165 + }, + { + "epoch": 
0.6378482228626321, + "grad_norm": 134.0, + "learning_rate": 7.595744680851064e-07, + "logits/chosen": -3.70703125, + "logits/rejected": -3.740234375, + "logps/chosen": -388.75, + "logps/rejected": -207.625, + "loss": 1.5195, + "nll_loss": 0.86376953125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3662109375, + "rewards/margins": 1.86181640625, + "rewards/rejected": 0.5078125, + "step": 166 + }, + { + "epoch": 0.6416906820365034, + "grad_norm": 179.0, + "learning_rate": 7.574468085106383e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.875, + "logps/chosen": -396.5, + "logps/rejected": -201.0, + "loss": 1.4912, + "nll_loss": 0.87060546875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5078125, + "rewards/margins": 2.03515625, + "rewards/rejected": 0.47314453125, + "step": 167 + }, + { + "epoch": 0.6455331412103746, + "grad_norm": 115.0, + "learning_rate": 7.553191489361702e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.83984375, + "logps/chosen": -441.375, + "logps/rejected": -247.625, + "loss": 1.5264, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.04296875, + "rewards/margins": 2.189453125, + "rewards/rejected": 0.8546142578125, + "step": 168 + }, + { + "epoch": 0.6493756003842459, + "grad_norm": 145.0, + "learning_rate": 7.531914893617021e-07, + "logits/chosen": -3.779296875, + "logits/rejected": -3.830078125, + "logps/chosen": -353.125, + "logps/rejected": -221.5625, + "loss": 1.4199, + "nll_loss": 0.734375, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7216796875, + "rewards/margins": 1.8662109375, + "rewards/rejected": 0.858154296875, + "step": 169 + }, + { + "epoch": 0.6532180595581172, + "grad_norm": 420.0, + "learning_rate": 7.510638297872341e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.958984375, + "logps/chosen": -459.625, + "logps/rejected": -244.3125, + "loss": 1.5928, + "nll_loss": 0.97314453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.11181640625, + "rewards/margins": 2.3369140625, + "rewards/rejected": 0.77044677734375, + "step": 170 + }, + { + "epoch": 0.6570605187319885, + "grad_norm": 193.0, + "learning_rate": 7.489361702127658e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.841796875, + "logps/chosen": -501.0, + "logps/rejected": -260.9375, + "loss": 1.5576, + "nll_loss": 0.92529296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.12109375, + "rewards/margins": 2.169677734375, + "rewards/rejected": 0.94866943359375, + "step": 171 + }, + { + "epoch": 0.6609029779058597, + "grad_norm": 232.0, + "learning_rate": 7.468085106382978e-07, + "logits/chosen": -3.810546875, + "logits/rejected": -3.83984375, + "logps/chosen": -419.5, + "logps/rejected": -269.0, + "loss": 1.5303, + "nll_loss": 0.8681640625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.69140625, + "rewards/margins": 1.7353515625, + "rewards/rejected": 0.9569091796875, + "step": 172 + }, + { + "epoch": 0.664745437079731, + "grad_norm": 159.0, + "learning_rate": 7.446808510638297e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.91015625, + "logps/chosen": -473.0, + "logps/rejected": -190.75, + "loss": 1.5039, + "nll_loss": 0.92822265625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.083984375, + "rewards/margins": 2.6005859375, + "rewards/rejected": 0.48876953125, + "step": 173 + }, + { + "epoch": 0.664745437079731, + "eval_logits/chosen": -3.7869317531585693, + "eval_logits/rejected": -3.843465805053711, + "eval_logps/chosen": -452.8636474609375, + 
"eval_logps/rejected": -244.22727966308594, + "eval_loss": 1.534204125404358, + "eval_nll_loss": 0.8988281488418579, + "eval_rewards/accuracies": 0.8719696998596191, + "eval_rewards/chosen": 2.9147372245788574, + "eval_rewards/margins": 2.097864866256714, + "eval_rewards/rejected": 0.8157692551612854, + "eval_runtime": 99.0053, + "eval_samples_per_second": 4.434, + "eval_steps_per_second": 1.111, + "step": 173 + }, + { + "epoch": 0.6685878962536023, + "grad_norm": 142.0, + "learning_rate": 7.425531914893617e-07, + "logits/chosen": -3.673828125, + "logits/rejected": -3.712890625, + "logps/chosen": -340.625, + "logps/rejected": -205.75, + "loss": 1.5381, + "nll_loss": 0.83642578125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.04296875, + "rewards/margins": 1.58740234375, + "rewards/rejected": 0.453948974609375, + "step": 174 + }, + { + "epoch": 0.6724303554274735, + "grad_norm": 164.0, + "learning_rate": 7.404255319148935e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.8515625, + "logps/chosen": -495.5, + "logps/rejected": -224.0, + "loss": 1.4424, + "nll_loss": 0.8544921875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.958984375, + "rewards/margins": 2.544921875, + "rewards/rejected": 0.41546630859375, + "step": 175 + }, + { + "epoch": 0.6762728146013448, + "grad_norm": 124.0, + "learning_rate": 7.382978723404255e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.939453125, + "logps/chosen": -448.625, + "logps/rejected": -227.0, + "loss": 1.5391, + "nll_loss": 0.95166015625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0302734375, + "rewards/margins": 2.4423828125, + "rewards/rejected": 0.58929443359375, + "step": 176 + }, + { + "epoch": 0.6801152737752162, + "grad_norm": 165.0, + "learning_rate": 7.361702127659574e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.775390625, + "logps/chosen": -469.5, + "logps/rejected": -245.125, + "loss": 1.5039, + "nll_loss": 0.88037109375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.03125, + "rewards/margins": 2.251953125, + "rewards/rejected": 0.782012939453125, + "step": 177 + }, + { + "epoch": 0.6839577329490875, + "grad_norm": 189.0, + "learning_rate": 7.340425531914893e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.912109375, + "logps/chosen": -597.25, + "logps/rejected": -320.625, + "loss": 1.5342, + "nll_loss": 0.9814453125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.861328125, + "rewards/margins": 2.94140625, + "rewards/rejected": 0.9156494140625, + "step": 178 + }, + { + "epoch": 0.6878001921229587, + "grad_norm": 126.0, + "learning_rate": 7.319148936170212e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.87890625, + "logps/chosen": -547.5, + "logps/rejected": -252.375, + "loss": 1.46, + "nll_loss": 0.880859375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.619140625, + "rewards/margins": 2.7490234375, + "rewards/rejected": 0.87322998046875, + "step": 179 + }, + { + "epoch": 0.69164265129683, + "grad_norm": 146.0, + "learning_rate": 7.297872340425532e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.857421875, + "logps/chosen": -368.25, + "logps/rejected": -194.0625, + "loss": 1.6357, + "nll_loss": 0.95263671875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2880859375, + "rewards/margins": 1.59130859375, + "rewards/rejected": 0.69732666015625, + "step": 180 + }, + { + "epoch": 0.6954851104707013, + "grad_norm": 161.0, + "learning_rate": 7.276595744680852e-07, + "logits/chosen": -3.8203125, + "logits/rejected": 
-3.90234375, + "logps/chosen": -583.5, + "logps/rejected": -281.25, + "loss": 1.4902, + "nll_loss": 0.93798828125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.755859375, + "rewards/margins": 2.79296875, + "rewards/rejected": 0.96649169921875, + "step": 181 + }, + { + "epoch": 0.6993275696445725, + "grad_norm": 175.0, + "learning_rate": 7.25531914893617e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.8359375, + "logps/chosen": -406.5, + "logps/rejected": -216.3125, + "loss": 1.6201, + "nll_loss": 0.98193359375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.076171875, + "rewards/margins": 2.27734375, + "rewards/rejected": 0.8037109375, + "step": 182 + }, + { + "epoch": 0.7031700288184438, + "grad_norm": 206.0, + "learning_rate": 7.23404255319149e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.833984375, + "logps/chosen": -464.0, + "logps/rejected": -291.75, + "loss": 1.5371, + "nll_loss": 0.85791015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7001953125, + "rewards/margins": 1.8291015625, + "rewards/rejected": 0.871002197265625, + "step": 183 + }, + { + "epoch": 0.7070124879923151, + "grad_norm": 223.0, + "learning_rate": 7.212765957446808e-07, + "logits/chosen": -3.859375, + "logits/rejected": -3.849609375, + "logps/chosen": -461.0, + "logps/rejected": -299.375, + "loss": 1.6543, + "nll_loss": 0.953125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.890625, + "rewards/margins": 1.75732421875, + "rewards/rejected": 1.1343994140625, + "step": 184 + }, + { + "epoch": 0.7108549471661864, + "grad_norm": 189.0, + "learning_rate": 7.191489361702127e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.806640625, + "logps/chosen": -451.25, + "logps/rejected": -214.75, + "loss": 1.7754, + "nll_loss": 1.09326171875, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.876953125, + "rewards/margins": 2.07373046875, + "rewards/rejected": 0.803802490234375, + "step": 185 + }, + { + "epoch": 0.7146974063400576, + "grad_norm": 219.0, + "learning_rate": 7.170212765957446e-07, + "logits/chosen": -3.716796875, + "logits/rejected": -3.701171875, + "logps/chosen": -384.75, + "logps/rejected": -228.9375, + "loss": 1.6006, + "nll_loss": 0.8798828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.453125, + "rewards/margins": 1.5185546875, + "rewards/rejected": 0.93548583984375, + "step": 186 + }, + { + "epoch": 0.7185398655139289, + "grad_norm": 242.0, + "learning_rate": 7.148936170212766e-07, + "logits/chosen": -3.744140625, + "logits/rejected": -3.771484375, + "logps/chosen": -470.25, + "logps/rejected": -244.875, + "loss": 1.5479, + "nll_loss": 0.93505859375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1416015625, + "rewards/margins": 2.41015625, + "rewards/rejected": 0.729248046875, + "step": 187 + }, + { + "epoch": 0.7223823246878002, + "grad_norm": 223.0, + "learning_rate": 7.127659574468084e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.8359375, + "logps/chosen": -395.0, + "logps/rejected": -217.625, + "loss": 1.5762, + "nll_loss": 0.9091796875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.541015625, + "rewards/margins": 1.736328125, + "rewards/rejected": 0.8046875, + "step": 188 + }, + { + "epoch": 0.7262247838616714, + "grad_norm": 560.0, + "learning_rate": 7.106382978723404e-07, + "logits/chosen": -3.76171875, + "logits/rejected": -3.82421875, + "logps/chosen": -377.125, + "logps/rejected": -193.875, + "loss": 1.6006, + "nll_loss": 0.95166015625, + "rewards/accuracies": 0.875, + 
"rewards/chosen": 2.66796875, + "rewards/margins": 1.99267578125, + "rewards/rejected": 0.68115234375, + "step": 189 + }, + { + "epoch": 0.7300672430355427, + "grad_norm": 148.0, + "learning_rate": 7.085106382978723e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.818359375, + "logps/chosen": -476.25, + "logps/rejected": -241.0, + "loss": 1.4458, + "nll_loss": 0.8701171875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.064453125, + "rewards/margins": 2.40869140625, + "rewards/rejected": 0.6568603515625, + "step": 190 + }, + { + "epoch": 0.733909702209414, + "grad_norm": 172.0, + "learning_rate": 7.063829787234043e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.865234375, + "logps/chosen": -434.5, + "logps/rejected": -209.5, + "loss": 1.4482, + "nll_loss": 0.880859375, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.935546875, + "rewards/margins": 2.3828125, + "rewards/rejected": 0.554443359375, + "step": 191 + }, + { + "epoch": 0.7377521613832853, + "grad_norm": 164.0, + "learning_rate": 7.042553191489361e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.833984375, + "logps/chosen": -439.75, + "logps/rejected": -231.0625, + "loss": 1.4863, + "nll_loss": 0.87353515625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2080078125, + "rewards/margins": 2.437255859375, + "rewards/rejected": 0.7763671875, + "step": 192 + }, + { + "epoch": 0.7415946205571565, + "grad_norm": 145.0, + "learning_rate": 7.021276595744681e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.71484375, + "logps/chosen": -447.75, + "logps/rejected": -249.25, + "loss": 1.4814, + "nll_loss": 0.8681640625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.884765625, + "rewards/margins": 2.1962890625, + "rewards/rejected": 0.68975830078125, + "step": 193 + }, + { + "epoch": 0.7454370797310279, + "grad_norm": 159.0, + "learning_rate": 7e-07, + "logits/chosen": -3.732421875, + "logits/rejected": -3.82421875, + "logps/chosen": -457.25, + "logps/rejected": -236.25, + "loss": 1.4473, + "nll_loss": 0.8447265625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.908203125, + "rewards/margins": 2.3173828125, + "rewards/rejected": 0.59228515625, + "step": 194 + }, + { + "epoch": 0.7492795389048992, + "grad_norm": 192.0, + "learning_rate": 6.978723404255319e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.83203125, + "logps/chosen": -432.125, + "logps/rejected": -235.875, + "loss": 1.5566, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7744140625, + "rewards/margins": 1.86865234375, + "rewards/rejected": 0.9033203125, + "step": 195 + }, + { + "epoch": 0.7531219980787704, + "grad_norm": 140.0, + "learning_rate": 6.957446808510637e-07, + "logits/chosen": -3.833984375, + "logits/rejected": -3.873046875, + "logps/chosen": -503.5, + "logps/rejected": -281.0, + "loss": 1.4258, + "nll_loss": 0.869140625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8203125, + "rewards/margins": 2.818359375, + "rewards/rejected": 1.002685546875, + "step": 196 + }, + { + "epoch": 0.7569644572526417, + "grad_norm": 135.0, + "learning_rate": 6.936170212765957e-07, + "logits/chosen": -3.75, + "logits/rejected": -3.8203125, + "logps/chosen": -487.0, + "logps/rejected": -233.875, + "loss": 1.3809, + "nll_loss": 0.7783203125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.23828125, + "rewards/margins": 2.65234375, + "rewards/rejected": 0.58453369140625, + "step": 197 + }, + { + "epoch": 0.760806916426513, + "grad_norm": 179.0, + 
"learning_rate": 6.914893617021277e-07, + "logits/chosen": -3.822265625, + "logits/rejected": -3.8359375, + "logps/chosen": -527.125, + "logps/rejected": -236.375, + "loss": 1.5029, + "nll_loss": 0.93115234375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.384765625, + "rewards/margins": 2.87188720703125, + "rewards/rejected": 0.515869140625, + "step": 198 + }, + { + "epoch": 0.7646493756003843, + "grad_norm": 372.0, + "learning_rate": 6.893617021276595e-07, + "logits/chosen": -3.72265625, + "logits/rejected": -3.75390625, + "logps/chosen": -544.25, + "logps/rejected": -271.875, + "loss": 1.4502, + "nll_loss": 0.8876953125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.447265625, + "rewards/margins": 2.6640625, + "rewards/rejected": 0.782470703125, + "step": 199 + }, + { + "epoch": 0.7684918347742555, + "grad_norm": 150.0, + "learning_rate": 6.872340425531915e-07, + "logits/chosen": -3.828125, + "logits/rejected": -3.841796875, + "logps/chosen": -413.75, + "logps/rejected": -238.0625, + "loss": 1.5127, + "nll_loss": 0.90869140625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.22265625, + "rewards/margins": 2.31689453125, + "rewards/rejected": 0.90435791015625, + "step": 200 + }, + { + "epoch": 0.7723342939481268, + "grad_norm": 175.0, + "learning_rate": 6.851063829787234e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.7578125, + "logps/chosen": -374.25, + "logps/rejected": -212.125, + "loss": 1.4268, + "nll_loss": 0.8056640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.40234375, + "rewards/margins": 1.900390625, + "rewards/rejected": 0.500244140625, + "step": 201 + }, + { + "epoch": 0.7761767531219981, + "grad_norm": 178.0, + "learning_rate": 6.829787234042553e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.8046875, + "logps/chosen": -609.25, + "logps/rejected": -293.25, + "loss": 1.3906, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.078125, + "rewards/margins": 3.3359375, + "rewards/rejected": 0.7420654296875, + "step": 202 + }, + { + "epoch": 0.7800192122958693, + "grad_norm": 1944.0, + "learning_rate": 6.808510638297872e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.875, + "logps/chosen": -476.25, + "logps/rejected": -289.875, + "loss": 1.5664, + "nll_loss": 0.92919921875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.255859375, + "rewards/margins": 2.330078125, + "rewards/rejected": 0.927581787109375, + "step": 203 + }, + { + "epoch": 0.7838616714697406, + "grad_norm": 110.5, + "learning_rate": 6.787234042553192e-07, + "logits/chosen": -3.712890625, + "logits/rejected": -3.833984375, + "logps/chosen": -457.125, + "logps/rejected": -239.375, + "loss": 1.3916, + "nll_loss": 0.81103515625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8671875, + "rewards/margins": 2.27734375, + "rewards/rejected": 0.592254638671875, + "step": 204 + }, + { + "epoch": 0.7877041306436119, + "grad_norm": 238.0, + "learning_rate": 6.76595744680851e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.865234375, + "logps/chosen": -440.875, + "logps/rejected": -240.625, + "loss": 1.6504, + "nll_loss": 0.99853515625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.037109375, + "rewards/margins": 2.1580810546875, + "rewards/rejected": 0.881072998046875, + "step": 205 + }, + { + "epoch": 0.7915465898174832, + "grad_norm": 700.0, + "learning_rate": 6.74468085106383e-07, + "logits/chosen": -3.755859375, + "logits/rejected": -3.865234375, + "logps/chosen": -335.0, + 
"logps/rejected": -204.375, + "loss": 1.5771, + "nll_loss": 0.798095703125, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.99755859375, + "rewards/margins": 1.284423828125, + "rewards/rejected": 0.71728515625, + "step": 206 + }, + { + "epoch": 0.7953890489913544, + "grad_norm": 236.0, + "learning_rate": 6.723404255319149e-07, + "logits/chosen": -3.810546875, + "logits/rejected": -3.84765625, + "logps/chosen": -444.5, + "logps/rejected": -288.0, + "loss": 1.5977, + "nll_loss": 0.90478515625, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.228515625, + "rewards/margins": 1.951171875, + "rewards/rejected": 1.279541015625, + "step": 207 + }, + { + "epoch": 0.7992315081652257, + "grad_norm": 210.0, + "learning_rate": 6.702127659574468e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.8984375, + "logps/chosen": -431.25, + "logps/rejected": -227.875, + "loss": 1.3926, + "nll_loss": 0.84521484375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9912109375, + "rewards/margins": 2.4765625, + "rewards/rejected": 0.5184326171875, + "step": 208 + }, + { + "epoch": 0.803073967339097, + "grad_norm": 288.0, + "learning_rate": 6.680851063829786e-07, + "logits/chosen": -3.798828125, + "logits/rejected": -3.8359375, + "logps/chosen": -355.5, + "logps/rejected": -223.0, + "loss": 1.4746, + "nll_loss": 0.852294921875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.63671875, + "rewards/margins": 1.96875, + "rewards/rejected": 0.664794921875, + "step": 209 + }, + { + "epoch": 0.8069164265129684, + "grad_norm": 280.0, + "learning_rate": 6.659574468085106e-07, + "logits/chosen": -3.849609375, + "logits/rejected": -3.828125, + "logps/chosen": -453.875, + "logps/rejected": -241.125, + "loss": 1.5791, + "nll_loss": 0.94970703125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.060546875, + "rewards/margins": 2.365234375, + "rewards/rejected": 0.692626953125, + "step": 210 + }, + { + "epoch": 0.8107588856868396, + "grad_norm": 189.0, + "learning_rate": 6.638297872340425e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.79296875, + "logps/chosen": -420.0, + "logps/rejected": -242.25, + "loss": 1.4805, + "nll_loss": 0.8671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.93017578125, + "rewards/margins": 2.37841796875, + "rewards/rejected": 0.5509033203125, + "step": 211 + }, + { + "epoch": 0.8146013448607109, + "grad_norm": 374.0, + "learning_rate": 6.617021276595744e-07, + "logits/chosen": -3.80859375, + "logits/rejected": -3.837890625, + "logps/chosen": -417.75, + "logps/rejected": -231.75, + "loss": 1.5596, + "nll_loss": 0.888671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0, + "rewards/margins": 2.12109375, + "rewards/rejected": 0.8765869140625, + "step": 212 + }, + { + "epoch": 0.8184438040345822, + "grad_norm": 2320.0, + "learning_rate": 6.595744680851063e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.9140625, + "logps/chosen": -352.375, + "logps/rejected": -208.5625, + "loss": 1.498, + "nll_loss": 0.82275390625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.630859375, + "rewards/margins": 1.8463134765625, + "rewards/rejected": 0.7886962890625, + "step": 213 + }, + { + "epoch": 0.8222862632084534, + "grad_norm": 160.0, + "learning_rate": 6.574468085106383e-07, + "logits/chosen": -3.75390625, + "logits/rejected": -3.853515625, + "logps/chosen": -433.0, + "logps/rejected": -261.25, + "loss": 1.5693, + "nll_loss": 0.9521484375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.052734375, + "rewards/margins": 
2.2099609375, + "rewards/rejected": 0.843505859375, + "step": 214 + }, + { + "epoch": 0.8261287223823247, + "grad_norm": 264.0, + "learning_rate": 6.553191489361701e-07, + "logits/chosen": -3.8359375, + "logits/rejected": -3.85546875, + "logps/chosen": -412.25, + "logps/rejected": -240.25, + "loss": 1.5322, + "nll_loss": 0.84814453125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.07421875, + "rewards/margins": 1.99176025390625, + "rewards/rejected": 1.083740234375, + "step": 215 + }, + { + "epoch": 0.829971181556196, + "grad_norm": 124.5, + "learning_rate": 6.531914893617021e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.849609375, + "logps/chosen": -383.25, + "logps/rejected": -229.625, + "loss": 1.4883, + "nll_loss": 0.82568359375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2421875, + "rewards/margins": 1.88623046875, + "rewards/rejected": 0.35101318359375, + "step": 216 + }, + { + "epoch": 0.8338136407300673, + "grad_norm": 214.0, + "learning_rate": 6.510638297872341e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.79296875, + "logps/chosen": -447.875, + "logps/rejected": -252.5, + "loss": 1.5195, + "nll_loss": 0.833984375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1201171875, + "rewards/margins": 2.15576171875, + "rewards/rejected": 0.96490478515625, + "step": 217 + }, + { + "epoch": 0.8376560999039385, + "grad_norm": 201.0, + "learning_rate": 6.48936170212766e-07, + "logits/chosen": -3.708984375, + "logits/rejected": -3.767578125, + "logps/chosen": -447.0, + "logps/rejected": -226.75, + "loss": 1.5039, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.3291015625, + "rewards/margins": 2.4716796875, + "rewards/rejected": 0.85400390625, + "step": 218 + }, + { + "epoch": 0.8414985590778098, + "grad_norm": 312.0, + "learning_rate": 6.468085106382979e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.791015625, + "logps/chosen": -484.5, + "logps/rejected": -235.625, + "loss": 1.3857, + "nll_loss": 0.87060546875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.521484375, + "rewards/margins": 3.06884765625, + "rewards/rejected": 0.447265625, + "step": 219 + }, + { + "epoch": 0.8453410182516811, + "grad_norm": 193.0, + "learning_rate": 6.446808510638297e-07, + "logits/chosen": -3.88671875, + "logits/rejected": -3.833984375, + "logps/chosen": -442.875, + "logps/rejected": -253.125, + "loss": 1.4668, + "nll_loss": 0.84228515625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2177734375, + "rewards/margins": 2.234375, + "rewards/rejected": 0.99041748046875, + "step": 220 + }, + { + "epoch": 0.8491834774255523, + "grad_norm": 109.5, + "learning_rate": 6.425531914893617e-07, + "logits/chosen": -3.69921875, + "logits/rejected": -3.748046875, + "logps/chosen": -456.25, + "logps/rejected": -241.75, + "loss": 1.3232, + "nll_loss": 0.8154296875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.26171875, + "rewards/margins": 2.8974609375, + "rewards/rejected": 0.36376953125, + "step": 221 + }, + { + "epoch": 0.8530259365994236, + "grad_norm": 130.0, + "learning_rate": 6.404255319148935e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.83984375, + "logps/chosen": -518.25, + "logps/rejected": -247.0, + "loss": 1.3779, + "nll_loss": 0.83251953125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.48828125, + "rewards/margins": 2.8662109375, + "rewards/rejected": 0.62738037109375, + "step": 222 + }, + { + "epoch": 0.8568683957732949, + "grad_norm": 406.0, + 
"learning_rate": 6.382978723404255e-07, + "logits/chosen": -3.787109375, + "logits/rejected": -3.90625, + "logps/chosen": -402.875, + "logps/rejected": -212.375, + "loss": 1.4863, + "nll_loss": 0.88330078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.34130859375, + "rewards/margins": 2.467529296875, + "rewards/rejected": 0.874755859375, + "step": 223 + }, + { + "epoch": 0.8607108549471661, + "grad_norm": 310.0, + "learning_rate": 6.361702127659574e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.8203125, + "logps/chosen": -348.125, + "logps/rejected": -229.125, + "loss": 1.5693, + "nll_loss": 0.9150390625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8916015625, + "rewards/margins": 1.9091796875, + "rewards/rejected": 0.98388671875, + "step": 224 + }, + { + "epoch": 0.8645533141210374, + "grad_norm": 219.0, + "learning_rate": 6.340425531914893e-07, + "logits/chosen": -3.6953125, + "logits/rejected": -3.779296875, + "logps/chosen": -457.25, + "logps/rejected": -216.625, + "loss": 1.4912, + "nll_loss": 0.93798828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.94140625, + "rewards/margins": 2.544921875, + "rewards/rejected": 0.3935546875, + "step": 225 + }, + { + "epoch": 0.8683957732949087, + "grad_norm": 212.0, + "learning_rate": 6.319148936170212e-07, + "logits/chosen": -3.853515625, + "logits/rejected": -3.90234375, + "logps/chosen": -398.625, + "logps/rejected": -239.0, + "loss": 1.5586, + "nll_loss": 0.92822265625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2861328125, + "rewards/margins": 2.166015625, + "rewards/rejected": 1.119873046875, + "step": 226 + }, + { + "epoch": 0.8722382324687801, + "grad_norm": 214.0, + "learning_rate": 6.297872340425532e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.830078125, + "logps/chosen": -536.75, + "logps/rejected": -261.375, + "loss": 1.4053, + "nll_loss": 0.83740234375, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.732421875, + "rewards/margins": 3.10546875, + "rewards/rejected": 0.6224365234375, + "step": 227 + }, + { + "epoch": 0.8760806916426513, + "grad_norm": 236.0, + "learning_rate": 6.276595744680851e-07, + "logits/chosen": -3.6796875, + "logits/rejected": -3.748046875, + "logps/chosen": -400.25, + "logps/rejected": -227.5, + "loss": 1.5186, + "nll_loss": 0.96240234375, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8203125, + "rewards/margins": 2.5546875, + "rewards/rejected": 0.267059326171875, + "step": 228 + }, + { + "epoch": 0.8799231508165226, + "grad_norm": 179.0, + "learning_rate": 6.25531914893617e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.8203125, + "logps/chosen": -481.5, + "logps/rejected": -249.5, + "loss": 1.5068, + "nll_loss": 0.90625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5986328125, + "rewards/margins": 2.740234375, + "rewards/rejected": 0.86077880859375, + "step": 229 + }, + { + "epoch": 0.8837656099903939, + "grad_norm": 237.0, + "learning_rate": 6.234042553191489e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.8984375, + "logps/chosen": -448.625, + "logps/rejected": -230.125, + "loss": 1.6172, + "nll_loss": 0.94677734375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3154296875, + "rewards/margins": 2.23095703125, + "rewards/rejected": 1.08544921875, + "step": 230 + }, + { + "epoch": 0.8876080691642652, + "grad_norm": 268.0, + "learning_rate": 6.212765957446809e-07, + "logits/chosen": -3.80859375, + "logits/rejected": -3.857421875, + "logps/chosen": -636.25, + "logps/rejected": 
-313.625, + "loss": 1.4727, + "nll_loss": 0.94677734375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.453125, + "rewards/margins": 3.326416015625, + "rewards/rejected": 1.132080078125, + "step": 231 + }, + { + "epoch": 0.8914505283381364, + "grad_norm": 502.0, + "learning_rate": 6.191489361702127e-07, + "logits/chosen": -3.822265625, + "logits/rejected": -3.828125, + "logps/chosen": -427.25, + "logps/rejected": -233.25, + "loss": 1.5098, + "nll_loss": 0.869140625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0439453125, + "rewards/margins": 2.1865234375, + "rewards/rejected": 0.8575439453125, + "step": 232 + }, + { + "epoch": 0.8952929875120077, + "grad_norm": 168.0, + "learning_rate": 6.170212765957446e-07, + "logits/chosen": -3.7265625, + "logits/rejected": -3.84765625, + "logps/chosen": -488.5, + "logps/rejected": -262.25, + "loss": 1.5352, + "nll_loss": 0.92724609375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.283203125, + "rewards/margins": 2.29248046875, + "rewards/rejected": 0.9892578125, + "step": 233 + }, + { + "epoch": 0.899135446685879, + "grad_norm": 172.0, + "learning_rate": 6.148936170212766e-07, + "logits/chosen": -3.765625, + "logits/rejected": -3.8671875, + "logps/chosen": -495.75, + "logps/rejected": -245.75, + "loss": 1.6201, + "nll_loss": 0.95849609375, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.51171875, + "rewards/margins": 2.415283203125, + "rewards/rejected": 1.10302734375, + "step": 234 + }, + { + "epoch": 0.9029779058597502, + "grad_norm": 180.0, + "learning_rate": 6.127659574468084e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.818359375, + "logps/chosen": -412.75, + "logps/rejected": -279.5625, + "loss": 1.4961, + "nll_loss": 0.83837890625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.83203125, + "rewards/margins": 2.001708984375, + "rewards/rejected": 0.83062744140625, + "step": 235 + }, + { + "epoch": 0.9068203650336215, + "grad_norm": 222.0, + "learning_rate": 6.106382978723404e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.7734375, + "logps/chosen": -469.5, + "logps/rejected": -232.125, + "loss": 1.375, + "nll_loss": 0.8193359375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0224609375, + "rewards/margins": 2.7265625, + "rewards/rejected": 0.3011474609375, + "step": 236 + }, + { + "epoch": 0.9106628242074928, + "grad_norm": 223.0, + "learning_rate": 6.085106382978723e-07, + "logits/chosen": -3.78515625, + "logits/rejected": -3.7578125, + "logps/chosen": -414.625, + "logps/rejected": -197.75, + "loss": 1.6211, + "nll_loss": 1.01904296875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.046875, + "rewards/margins": 2.3349609375, + "rewards/rejected": 0.7138671875, + "step": 237 + }, + { + "epoch": 0.914505283381364, + "grad_norm": 150.0, + "learning_rate": 6.063829787234043e-07, + "logits/chosen": -3.736328125, + "logits/rejected": -3.75, + "logps/chosen": -415.625, + "logps/rejected": -228.9375, + "loss": 1.4229, + "nll_loss": 0.84912109375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2373046875, + "rewards/margins": 2.6064453125, + "rewards/rejected": 0.629852294921875, + "step": 238 + }, + { + "epoch": 0.9183477425552353, + "grad_norm": 364.0, + "learning_rate": 6.042553191489361e-07, + "logits/chosen": -3.748046875, + "logits/rejected": -3.857421875, + "logps/chosen": -426.75, + "logps/rejected": -213.9375, + "loss": 1.5977, + "nll_loss": 0.921875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.125, + "rewards/margins": 2.17919921875, + 
"rewards/rejected": 0.9476318359375, + "step": 239 + }, + { + "epoch": 0.9221902017291066, + "grad_norm": 126.5, + "learning_rate": 6.021276595744681e-07, + "logits/chosen": -3.748046875, + "logits/rejected": -3.83203125, + "logps/chosen": -417.75, + "logps/rejected": -238.0, + "loss": 1.3906, + "nll_loss": 0.80126953125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.173828125, + "rewards/margins": 2.333984375, + "rewards/rejected": 0.8408203125, + "step": 240 + }, + { + "epoch": 0.9260326609029779, + "grad_norm": 170.0, + "learning_rate": 6e-07, + "logits/chosen": -3.857421875, + "logits/rejected": -3.822265625, + "logps/chosen": -419.875, + "logps/rejected": -261.5, + "loss": 1.5698, + "nll_loss": 0.9013671875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.12890625, + "rewards/margins": 2.24072265625, + "rewards/rejected": 0.89031982421875, + "step": 241 + }, + { + "epoch": 0.9298751200768491, + "grad_norm": 173.0, + "learning_rate": 5.978723404255319e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.716796875, + "logps/chosen": -546.0, + "logps/rejected": -295.25, + "loss": 1.4941, + "nll_loss": 0.91845703125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.43359375, + "rewards/margins": 2.650390625, + "rewards/rejected": 0.77935791015625, + "step": 242 + }, + { + "epoch": 0.9337175792507204, + "grad_norm": 252.0, + "learning_rate": 5.957446808510638e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.79296875, + "logps/chosen": -439.75, + "logps/rejected": -273.0, + "loss": 1.4941, + "nll_loss": 0.892822265625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.123046875, + "rewards/margins": 2.2939453125, + "rewards/rejected": 0.830169677734375, + "step": 243 + }, + { + "epoch": 0.9375600384245918, + "grad_norm": 348.0, + "learning_rate": 5.936170212765958e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.81640625, + "logps/chosen": -383.75, + "logps/rejected": -236.625, + "loss": 1.5947, + "nll_loss": 0.88916015625, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6728515625, + "rewards/margins": 1.78857421875, + "rewards/rejected": 0.884765625, + "step": 244 + }, + { + "epoch": 0.9414024975984631, + "grad_norm": 272.0, + "learning_rate": 5.914893617021275e-07, + "logits/chosen": -3.765625, + "logits/rejected": -3.873046875, + "logps/chosen": -418.375, + "logps/rejected": -207.25, + "loss": 1.4395, + "nll_loss": 0.78662109375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0009765625, + "rewards/margins": 2.140625, + "rewards/rejected": 0.861083984375, + "step": 245 + }, + { + "epoch": 0.9452449567723343, + "grad_norm": 166.0, + "learning_rate": 5.893617021276595e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.91796875, + "logps/chosen": -445.625, + "logps/rejected": -226.5, + "loss": 1.5156, + "nll_loss": 0.92138671875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.5859375, + "rewards/margins": 2.66552734375, + "rewards/rejected": 0.921875, + "step": 246 + }, + { + "epoch": 0.9490874159462056, + "grad_norm": 114.0, + "learning_rate": 5.872340425531914e-07, + "logits/chosen": -3.70703125, + "logits/rejected": -3.748046875, + "logps/chosen": -413.75, + "logps/rejected": -219.25, + "loss": 1.3955, + "nll_loss": 0.7978515625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.80859375, + "rewards/margins": 2.337158203125, + "rewards/rejected": 0.47113037109375, + "step": 247 + }, + { + "epoch": 0.9529298751200769, + "grad_norm": 173.0, + "learning_rate": 5.851063829787234e-07, + 
"logits/chosen": -3.796875, + "logits/rejected": -3.81640625, + "logps/chosen": -382.5, + "logps/rejected": -208.0, + "loss": 1.4805, + "nll_loss": 0.8876953125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3291015625, + "rewards/margins": 2.2783203125, + "rewards/rejected": 1.05224609375, + "step": 248 + }, + { + "epoch": 0.9567723342939481, + "grad_norm": 350.0, + "learning_rate": 5.829787234042552e-07, + "logits/chosen": -3.771484375, + "logits/rejected": -3.759765625, + "logps/chosen": -426.375, + "logps/rejected": -256.25, + "loss": 1.5254, + "nll_loss": 0.85986328125, + "rewards/accuracies": 0.71875, + "rewards/chosen": 3.10546875, + "rewards/margins": 2.10986328125, + "rewards/rejected": 0.995849609375, + "step": 249 + }, + { + "epoch": 0.9606147934678194, + "grad_norm": 239.0, + "learning_rate": 5.808510638297872e-07, + "logits/chosen": -3.8125, + "logits/rejected": -3.857421875, + "logps/chosen": -463.25, + "logps/rejected": -259.5, + "loss": 1.4238, + "nll_loss": 0.85400390625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.490234375, + "rewards/margins": 2.708984375, + "rewards/rejected": 0.7867431640625, + "step": 250 + }, + { + "epoch": 0.9644572526416907, + "grad_norm": 88.5, + "learning_rate": 5.787234042553191e-07, + "logits/chosen": -3.72265625, + "logits/rejected": -3.763671875, + "logps/chosen": -334.3125, + "logps/rejected": -179.5625, + "loss": 1.4014, + "nll_loss": 0.8017578125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6845703125, + "rewards/margins": 2.279541015625, + "rewards/rejected": 0.408447265625, + "step": 251 + }, + { + "epoch": 0.968299711815562, + "grad_norm": 288.0, + "learning_rate": 5.76595744680851e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.845703125, + "logps/chosen": -394.5, + "logps/rejected": -216.125, + "loss": 1.665, + "nll_loss": 0.9833984375, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.4580078125, + "rewards/margins": 2.388671875, + "rewards/rejected": 1.0673828125, + "step": 252 + }, + { + "epoch": 0.9721421709894332, + "grad_norm": 140.0, + "learning_rate": 5.74468085106383e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.90625, + "logps/chosen": -446.0, + "logps/rejected": -223.125, + "loss": 1.4834, + "nll_loss": 0.916015625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2734375, + "rewards/margins": 2.6962890625, + "rewards/rejected": 0.58251953125, + "step": 253 + }, + { + "epoch": 0.9759846301633045, + "grad_norm": 147.0, + "learning_rate": 5.723404255319149e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.822265625, + "logps/chosen": -508.875, + "logps/rejected": -231.0, + "loss": 1.5293, + "nll_loss": 0.95458984375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.048828125, + "rewards/margins": 2.52734375, + "rewards/rejected": 0.5234375, + "step": 254 + }, + { + "epoch": 0.9798270893371758, + "grad_norm": 106.0, + "learning_rate": 5.702127659574469e-07, + "logits/chosen": -3.701171875, + "logits/rejected": -3.787109375, + "logps/chosen": -416.0, + "logps/rejected": -214.75, + "loss": 1.2871, + "nll_loss": 0.78125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.185546875, + "rewards/margins": 2.8427734375, + "rewards/rejected": 0.345458984375, + "step": 255 + }, + { + "epoch": 0.983669548511047, + "grad_norm": 246.0, + "learning_rate": 5.680851063829787e-07, + "logits/chosen": -3.72265625, + "logits/rejected": -3.794921875, + "logps/chosen": -353.875, + "logps/rejected": -213.625, + "loss": 1.4785, + "nll_loss": 0.84130859375, + 
"rewards/accuracies": 0.9375, + "rewards/chosen": 2.7578125, + "rewards/margins": 1.93896484375, + "rewards/rejected": 0.821075439453125, + "step": 256 + }, + { + "epoch": 0.9875120076849183, + "grad_norm": 231.0, + "learning_rate": 5.659574468085107e-07, + "logits/chosen": -3.76171875, + "logits/rejected": -3.83203125, + "logps/chosen": -402.375, + "logps/rejected": -225.375, + "loss": 1.4424, + "nll_loss": 0.84765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1884765625, + "rewards/margins": 2.45263671875, + "rewards/rejected": 0.73870849609375, + "step": 257 + }, + { + "epoch": 0.9913544668587896, + "grad_norm": 172.0, + "learning_rate": 5.638297872340425e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.748046875, + "logps/chosen": -381.875, + "logps/rejected": -219.5, + "loss": 1.5361, + "nll_loss": 0.88671875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.556640625, + "rewards/margins": 1.82421875, + "rewards/rejected": 0.732421875, + "step": 258 + }, + { + "epoch": 0.9951969260326609, + "grad_norm": 160.0, + "learning_rate": 5.617021276595744e-07, + "logits/chosen": -3.84375, + "logits/rejected": -3.857421875, + "logps/chosen": -449.25, + "logps/rejected": -246.25, + "loss": 1.4092, + "nll_loss": 0.84765625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.162109375, + "rewards/margins": 2.453125, + "rewards/rejected": 0.70648193359375, + "step": 259 + }, + { + "epoch": 0.9990393852065321, + "grad_norm": 130.0, + "learning_rate": 5.595744680851063e-07, + "logits/chosen": -3.75, + "logits/rejected": -3.861328125, + "logps/chosen": -514.375, + "logps/rejected": -217.25, + "loss": 1.4355, + "nll_loss": 0.92822265625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.310546875, + "rewards/margins": 2.84375, + "rewards/rejected": 0.46923828125, + "step": 260 + }, + { + "epoch": 1.0, + "grad_norm": 231.0, + "learning_rate": 5.574468085106383e-07, + "logits/chosen": -3.9609375, + "logits/rejected": -3.765625, + "logps/chosen": -552.0, + "logps/rejected": -335.0, + "loss": 1.4102, + "nll_loss": 0.9296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.40625, + "rewards/margins": 3.0859375, + "rewards/rejected": 1.322265625, + "step": 261 + }, + { + "epoch": 1.0038424591738713, + "grad_norm": 252.0, + "learning_rate": 5.553191489361701e-07, + "logits/chosen": -3.818359375, + "logits/rejected": -3.873046875, + "logps/chosen": -325.625, + "logps/rejected": -180.375, + "loss": 1.5205, + "nll_loss": 0.88525390625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5703125, + "rewards/margins": 1.873046875, + "rewards/rejected": 0.699951171875, + "step": 262 + }, + { + "epoch": 1.0076849183477425, + "grad_norm": 176.0, + "learning_rate": 5.531914893617021e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.86328125, + "logps/chosen": -518.25, + "logps/rejected": -283.75, + "loss": 1.293, + "nll_loss": 0.7685546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5771484375, + "rewards/margins": 3.1064453125, + "rewards/rejected": 0.469970703125, + "step": 263 + }, + { + "epoch": 1.0115273775216138, + "grad_norm": 139.0, + "learning_rate": 5.51063829787234e-07, + "logits/chosen": -3.787109375, + "logits/rejected": -3.806640625, + "logps/chosen": -367.125, + "logps/rejected": -232.75, + "loss": 1.5107, + "nll_loss": 0.869384765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.53515625, + "rewards/margins": 1.85888671875, + "rewards/rejected": 0.681884765625, + "step": 264 + }, + { + "epoch": 1.015369836695485, + "grad_norm": 
142.0, + "learning_rate": 5.48936170212766e-07, + "logits/chosen": -3.841796875, + "logits/rejected": -3.826171875, + "logps/chosen": -461.75, + "logps/rejected": -248.125, + "loss": 1.4277, + "nll_loss": 0.88818359375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.869140625, + "rewards/margins": 2.9560546875, + "rewards/rejected": 0.91015625, + "step": 265 + }, + { + "epoch": 1.0192122958693564, + "grad_norm": 302.0, + "learning_rate": 5.468085106382978e-07, + "logits/chosen": -3.740234375, + "logits/rejected": -3.771484375, + "logps/chosen": -358.0, + "logps/rejected": -237.75, + "loss": 1.6299, + "nll_loss": 0.90478515625, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.98828125, + "rewards/margins": 1.845947265625, + "rewards/rejected": 1.139373779296875, + "step": 266 + }, + { + "epoch": 1.0230547550432276, + "grad_norm": 156.0, + "learning_rate": 5.446808510638298e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.796875, + "logps/chosen": -513.75, + "logps/rejected": -263.625, + "loss": 1.4443, + "nll_loss": 0.884765625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.13671875, + "rewards/margins": 3.052734375, + "rewards/rejected": 1.07733154296875, + "step": 267 + }, + { + "epoch": 1.026897214217099, + "grad_norm": 215.0, + "learning_rate": 5.425531914893617e-07, + "logits/chosen": -3.8046875, + "logits/rejected": -3.833984375, + "logps/chosen": -425.75, + "logps/rejected": -242.625, + "loss": 1.4424, + "nll_loss": 0.88037109375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.7275390625, + "rewards/margins": 2.9228515625, + "rewards/rejected": 0.81048583984375, + "step": 268 + }, + { + "epoch": 1.0307396733909702, + "grad_norm": 151.0, + "learning_rate": 5.404255319148936e-07, + "logits/chosen": -3.76171875, + "logits/rejected": -3.845703125, + "logps/chosen": -392.75, + "logps/rejected": -261.0625, + "loss": 1.6299, + "nll_loss": 0.9404296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9375, + "rewards/margins": 1.89678955078125, + "rewards/rejected": 1.040771484375, + "step": 269 + }, + { + "epoch": 1.0345821325648414, + "grad_norm": 139.0, + "learning_rate": 5.382978723404255e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.814453125, + "logps/chosen": -400.625, + "logps/rejected": -227.125, + "loss": 1.3994, + "nll_loss": 0.8740234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.451171875, + "rewards/margins": 2.810546875, + "rewards/rejected": 0.63812255859375, + "step": 270 + }, + { + "epoch": 1.0384245917387127, + "grad_norm": 179.0, + "learning_rate": 5.361702127659574e-07, + "logits/chosen": -3.783203125, + "logits/rejected": -3.822265625, + "logps/chosen": -460.0, + "logps/rejected": -234.4375, + "loss": 1.4453, + "nll_loss": 0.86962890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.400390625, + "rewards/margins": 2.62744140625, + "rewards/rejected": 0.77667236328125, + "step": 271 + }, + { + "epoch": 1.042267050912584, + "grad_norm": 216.0, + "learning_rate": 5.340425531914894e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.828125, + "logps/chosen": -401.75, + "logps/rejected": -246.375, + "loss": 1.5518, + "nll_loss": 0.92236328125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.27734375, + "rewards/margins": 2.30322265625, + "rewards/rejected": 0.97509765625, + "step": 272 + }, + { + "epoch": 1.0461095100864553, + "grad_norm": 144.0, + "learning_rate": 5.319148936170212e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.82421875, + "logps/chosen": -377.0, + 
"logps/rejected": -213.125, + "loss": 1.5278, + "nll_loss": 0.89501953125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.064453125, + "rewards/margins": 2.4580078125, + "rewards/rejected": 0.606781005859375, + "step": 273 + }, + { + "epoch": 1.0499519692603265, + "grad_norm": 138.0, + "learning_rate": 5.297872340425532e-07, + "logits/chosen": -3.69921875, + "logits/rejected": -3.751953125, + "logps/chosen": -431.0, + "logps/rejected": -229.625, + "loss": 1.4463, + "nll_loss": 0.91943359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4638671875, + "rewards/margins": 2.994140625, + "rewards/rejected": 0.4691162109375, + "step": 274 + }, + { + "epoch": 1.0537944284341978, + "grad_norm": 151.0, + "learning_rate": 5.276595744680851e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.705078125, + "logps/chosen": -467.625, + "logps/rejected": -252.125, + "loss": 1.5674, + "nll_loss": 0.96826171875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.7138671875, + "rewards/margins": 2.79833984375, + "rewards/rejected": 0.9154052734375, + "step": 275 + }, + { + "epoch": 1.057636887608069, + "grad_norm": 328.0, + "learning_rate": 5.25531914893617e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.89453125, + "logps/chosen": -366.75, + "logps/rejected": -191.0625, + "loss": 1.4707, + "nll_loss": 0.91357421875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.318359375, + "rewards/margins": 2.5791015625, + "rewards/rejected": 0.736785888671875, + "step": 276 + }, + { + "epoch": 1.0614793467819403, + "grad_norm": 130.0, + "learning_rate": 5.234042553191489e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.7890625, + "logps/chosen": -468.0, + "logps/rejected": -240.75, + "loss": 1.2998, + "nll_loss": 0.759033203125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.421875, + "rewards/margins": 2.904296875, + "rewards/rejected": 0.518829345703125, + "step": 277 + }, + { + "epoch": 1.0653218059558118, + "grad_norm": 112.0, + "learning_rate": 5.212765957446809e-07, + "logits/chosen": -3.744140625, + "logits/rejected": -3.705078125, + "logps/chosen": -320.375, + "logps/rejected": -189.875, + "loss": 1.5957, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3212890625, + "rewards/margins": 1.68212890625, + "rewards/rejected": 0.639892578125, + "step": 278 + }, + { + "epoch": 1.069164265129683, + "grad_norm": 225.0, + "learning_rate": 5.191489361702127e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.8046875, + "logps/chosen": -415.75, + "logps/rejected": -224.375, + "loss": 1.5049, + "nll_loss": 0.93310546875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.71484375, + "rewards/margins": 2.912109375, + "rewards/rejected": 0.80029296875, + "step": 279 + }, + { + "epoch": 1.0730067243035544, + "grad_norm": 143.0, + "learning_rate": 5.170212765957447e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.91015625, + "logps/chosen": -426.5, + "logps/rejected": -226.1875, + "loss": 1.6201, + "nll_loss": 1.0205078125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0986328125, + "rewards/margins": 2.354736328125, + "rewards/rejected": 0.746337890625, + "step": 280 + }, + { + "epoch": 1.0768491834774256, + "grad_norm": 129.0, + "learning_rate": 5.148936170212766e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.810546875, + "logps/chosen": -347.0, + "logps/rejected": -220.6875, + "loss": 1.4082, + "nll_loss": 0.7880859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8203125, + 
"rewards/margins": 1.935546875, + "rewards/rejected": 0.8857421875, + "step": 281 + }, + { + "epoch": 1.080691642651297, + "grad_norm": 175.0, + "learning_rate": 5.127659574468085e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.82421875, + "logps/chosen": -439.625, + "logps/rejected": -226.625, + "loss": 1.3604, + "nll_loss": 0.845703125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6162109375, + "rewards/margins": 3.140625, + "rewards/rejected": 0.4779052734375, + "step": 282 + }, + { + "epoch": 1.0845341018251682, + "grad_norm": 100.0, + "learning_rate": 5.106382978723403e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.79296875, + "logps/chosen": -408.0, + "logps/rejected": -230.125, + "loss": 1.4941, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.333984375, + "rewards/margins": 2.759765625, + "rewards/rejected": 0.5758056640625, + "step": 283 + }, + { + "epoch": 1.0883765609990395, + "grad_norm": 122.0, + "learning_rate": 5.085106382978723e-07, + "logits/chosen": -3.8828125, + "logits/rejected": -3.873046875, + "logps/chosen": -377.125, + "logps/rejected": -226.125, + "loss": 1.6309, + "nll_loss": 0.9638671875, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.1318359375, + "rewards/margins": 2.3363037109375, + "rewards/rejected": 0.794921875, + "step": 284 + }, + { + "epoch": 1.0922190201729107, + "grad_norm": 153.0, + "learning_rate": 5.063829787234042e-07, + "logits/chosen": -3.69140625, + "logits/rejected": -3.6875, + "logps/chosen": -486.375, + "logps/rejected": -264.25, + "loss": 1.4033, + "nll_loss": 0.90283203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.88671875, + "rewards/margins": 3.4814453125, + "rewards/rejected": 0.39849853515625, + "step": 285 + }, + { + "epoch": 1.096061479346782, + "grad_norm": 142.0, + "learning_rate": 5.042553191489361e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.783203125, + "logps/chosen": -444.625, + "logps/rejected": -213.0, + "loss": 1.4658, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.30078125, + "rewards/margins": 2.635498046875, + "rewards/rejected": 0.66351318359375, + "step": 286 + }, + { + "epoch": 1.0999039385206533, + "grad_norm": 104.5, + "learning_rate": 5.02127659574468e-07, + "logits/chosen": -3.798828125, + "logits/rejected": -3.818359375, + "logps/chosen": -346.5, + "logps/rejected": -181.8125, + "loss": 1.4375, + "nll_loss": 0.8544921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7783203125, + "rewards/margins": 2.35302734375, + "rewards/rejected": 0.423004150390625, + "step": 287 + }, + { + "epoch": 1.1037463976945245, + "grad_norm": 136.0, + "learning_rate": 5e-07, + "logits/chosen": -3.701171875, + "logits/rejected": -3.70703125, + "logps/chosen": -324.25, + "logps/rejected": -179.0, + "loss": 1.6572, + "nll_loss": 0.99755859375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.390625, + "rewards/margins": 1.822509765625, + "rewards/rejected": 0.57208251953125, + "step": 288 + }, + { + "epoch": 1.1075888568683958, + "grad_norm": 326.0, + "learning_rate": 4.978723404255318e-07, + "logits/chosen": -3.875, + "logits/rejected": -3.912109375, + "logps/chosen": -420.0, + "logps/rejected": -219.625, + "loss": 1.4521, + "nll_loss": 0.88525390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0458984375, + "rewards/margins": 2.4677734375, + "rewards/rejected": 0.5817413330078125, + "step": 289 + }, + { + "epoch": 1.111431316042267, + "grad_norm": 126.0, + "learning_rate": 
4.957446808510638e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.818359375, + "logps/chosen": -462.25, + "logps/rejected": -241.125, + "loss": 1.4785, + "nll_loss": 0.9462890625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.751953125, + "rewards/margins": 3.093994140625, + "rewards/rejected": 0.6513671875, + "step": 290 + }, + { + "epoch": 1.1152737752161384, + "grad_norm": 197.0, + "learning_rate": 4.936170212765957e-07, + "logits/chosen": -3.7109375, + "logits/rejected": -3.8203125, + "logps/chosen": -421.125, + "logps/rejected": -228.1875, + "loss": 1.4873, + "nll_loss": 0.87939453125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.64794921875, + "rewards/margins": 2.419921875, + "rewards/rejected": 0.2265625, + "step": 291 + }, + { + "epoch": 1.1191162343900096, + "grad_norm": 160.0, + "learning_rate": 4.914893617021277e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.869140625, + "logps/chosen": -441.5, + "logps/rejected": -235.6875, + "loss": 1.4844, + "nll_loss": 0.9443359375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.609375, + "rewards/margins": 3.1533203125, + "rewards/rejected": 0.4490509033203125, + "step": 292 + }, + { + "epoch": 1.122958693563881, + "grad_norm": 199.0, + "learning_rate": 4.893617021276595e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.7734375, + "logps/chosen": -433.75, + "logps/rejected": -292.5, + "loss": 1.5029, + "nll_loss": 0.90087890625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.3173828125, + "rewards/margins": 2.44921875, + "rewards/rejected": 0.86474609375, + "step": 293 + }, + { + "epoch": 1.1268011527377522, + "grad_norm": 318.0, + "learning_rate": 4.872340425531915e-07, + "logits/chosen": -3.75390625, + "logits/rejected": -3.8359375, + "logps/chosen": -474.5, + "logps/rejected": -275.5, + "loss": 1.4531, + "nll_loss": 0.91064453125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.7109375, + "rewards/margins": 2.7900390625, + "rewards/rejected": 0.9208984375, + "step": 294 + }, + { + "epoch": 1.1306436119116234, + "grad_norm": 202.0, + "learning_rate": 4.851063829787234e-07, + "logits/chosen": -3.861328125, + "logits/rejected": -3.82421875, + "logps/chosen": -448.0, + "logps/rejected": -272.625, + "loss": 1.5283, + "nll_loss": 0.927734375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.861328125, + "rewards/margins": 2.765625, + "rewards/rejected": 1.096923828125, + "step": 295 + }, + { + "epoch": 1.1344860710854947, + "grad_norm": 446.0, + "learning_rate": 4.829787234042552e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.8359375, + "logps/chosen": -384.5, + "logps/rejected": -193.375, + "loss": 1.2949, + "nll_loss": 0.80615234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.546875, + "rewards/margins": 3.169921875, + "rewards/rejected": 0.371612548828125, + "step": 296 + }, + { + "epoch": 1.138328530259366, + "grad_norm": 146.0, + "learning_rate": 4.808510638297872e-07, + "logits/chosen": -3.80078125, + "logits/rejected": -3.82421875, + "logps/chosen": -470.5, + "logps/rejected": -258.25, + "loss": 1.4336, + "nll_loss": 0.9345703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0390625, + "rewards/margins": 3.1181640625, + "rewards/rejected": 0.919677734375, + "step": 297 + }, + { + "epoch": 1.1421709894332372, + "grad_norm": 147.0, + "learning_rate": 4.787234042553192e-07, + "logits/chosen": -3.751953125, + "logits/rejected": -3.8125, + "logps/chosen": -396.25, + "logps/rejected": -219.8125, + "loss": 1.4688, + "nll_loss": 
0.890625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9970703125, + "rewards/margins": 2.521484375, + "rewards/rejected": 0.4781494140625, + "step": 298 + }, + { + "epoch": 1.1460134486071085, + "grad_norm": 149.0, + "learning_rate": 4.7659574468085105e-07, + "logits/chosen": -3.765625, + "logits/rejected": -3.751953125, + "logps/chosen": -413.5, + "logps/rejected": -253.875, + "loss": 1.4736, + "nll_loss": 0.8662109375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8896484375, + "rewards/margins": 2.1279296875, + "rewards/rejected": 0.763916015625, + "step": 299 + }, + { + "epoch": 1.1498559077809798, + "grad_norm": 125.0, + "learning_rate": 4.7446808510638297e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.921875, + "logps/chosen": -456.75, + "logps/rejected": -270.5, + "loss": 1.416, + "nll_loss": 0.85400390625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.8818359375, + "rewards/margins": 2.9375, + "rewards/rejected": 0.945556640625, + "step": 300 + }, + { + "epoch": 1.153698366954851, + "grad_norm": 516.0, + "learning_rate": 4.723404255319149e-07, + "logits/chosen": -3.8203125, + "logits/rejected": -3.84765625, + "logps/chosen": -398.25, + "logps/rejected": -243.25, + "loss": 1.6289, + "nll_loss": 0.96875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3984375, + "rewards/margins": 2.277099609375, + "rewards/rejected": 1.1224365234375, + "step": 301 + }, + { + "epoch": 1.1575408261287223, + "grad_norm": 140.0, + "learning_rate": 4.702127659574468e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.810546875, + "logps/chosen": -525.25, + "logps/rejected": -239.125, + "loss": 1.3584, + "nll_loss": 0.94287109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.474609375, + "rewards/margins": 3.923828125, + "rewards/rejected": 0.553863525390625, + "step": 302 + }, + { + "epoch": 1.1613832853025936, + "grad_norm": 147.0, + "learning_rate": 4.6808510638297873e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.849609375, + "logps/chosen": -424.375, + "logps/rejected": -250.375, + "loss": 1.4697, + "nll_loss": 0.88232421875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6875, + "rewards/margins": 2.640625, + "rewards/rejected": 1.04754638671875, + "step": 303 + }, + { + "epoch": 1.1652257444764649, + "grad_norm": 157.0, + "learning_rate": 4.659574468085106e-07, + "logits/chosen": -3.740234375, + "logits/rejected": -3.77734375, + "logps/chosen": -481.0, + "logps/rejected": -247.9375, + "loss": 1.3711, + "nll_loss": 0.873046875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.05078125, + "rewards/margins": 3.162109375, + "rewards/rejected": 0.8905029296875, + "step": 304 + }, + { + "epoch": 1.1690682036503361, + "grad_norm": 166.0, + "learning_rate": 4.638297872340425e-07, + "logits/chosen": -3.736328125, + "logits/rejected": -3.77734375, + "logps/chosen": -446.0, + "logps/rejected": -236.875, + "loss": 1.3345, + "nll_loss": 0.81884765625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.701171875, + "rewards/margins": 2.9482421875, + "rewards/rejected": 0.755615234375, + "step": 305 + }, + { + "epoch": 1.1729106628242074, + "grad_norm": 118.0, + "learning_rate": 4.6170212765957444e-07, + "logits/chosen": -3.751953125, + "logits/rejected": -3.7578125, + "logps/chosen": -398.75, + "logps/rejected": -220.75, + "loss": 1.4619, + "nll_loss": 0.88427734375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2724609375, + "rewards/margins": 2.5166015625, + "rewards/rejected": 0.7598876953125, + "step": 306 + 
}, + { + "epoch": 1.1767531219980787, + "grad_norm": 153.0, + "learning_rate": 4.5957446808510636e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.80859375, + "logps/chosen": -352.125, + "logps/rejected": -216.625, + "loss": 1.4014, + "nll_loss": 0.80712890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912109375, + "rewards/margins": 2.2841796875, + "rewards/rejected": 0.632568359375, + "step": 307 + }, + { + "epoch": 1.18059558117195, + "grad_norm": 111.5, + "learning_rate": 4.574468085106383e-07, + "logits/chosen": -3.80078125, + "logits/rejected": -3.89453125, + "logps/chosen": -446.0, + "logps/rejected": -210.5, + "loss": 1.3701, + "nll_loss": 0.880859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4189453125, + "rewards/margins": 3.3046875, + "rewards/rejected": 0.1109619140625, + "step": 308 + }, + { + "epoch": 1.1844380403458212, + "grad_norm": 152.0, + "learning_rate": 4.553191489361702e-07, + "logits/chosen": -3.751953125, + "logits/rejected": -3.890625, + "logps/chosen": -375.125, + "logps/rejected": -230.75, + "loss": 1.5977, + "nll_loss": 0.93408203125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8759765625, + "rewards/margins": 1.8955078125, + "rewards/rejected": 0.98260498046875, + "step": 309 + }, + { + "epoch": 1.1882804995196925, + "grad_norm": 652.0, + "learning_rate": 4.5319148936170207e-07, + "logits/chosen": -3.775390625, + "logits/rejected": -3.876953125, + "logps/chosen": -496.0, + "logps/rejected": -240.5, + "loss": 1.5557, + "nll_loss": 1.03759765625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.943359375, + "rewards/margins": 3.06640625, + "rewards/rejected": 0.87890625, + "step": 310 + }, + { + "epoch": 1.192122958693564, + "grad_norm": 242.0, + "learning_rate": 4.51063829787234e-07, + "logits/chosen": -3.716796875, + "logits/rejected": -3.759765625, + "logps/chosen": -466.625, + "logps/rejected": -230.0, + "loss": 1.3955, + "nll_loss": 0.87255859375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6015625, + "rewards/margins": 3.0654296875, + "rewards/rejected": 0.536376953125, + "step": 311 + }, + { + "epoch": 1.195965417867435, + "grad_norm": 390.0, + "learning_rate": 4.489361702127659e-07, + "logits/chosen": -3.828125, + "logits/rejected": -3.900390625, + "logps/chosen": -566.5, + "logps/rejected": -309.625, + "loss": 1.4814, + "nll_loss": 0.86474609375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 5.751953125, + "rewards/margins": 3.49951171875, + "rewards/rejected": 2.24951171875, + "step": 312 + }, + { + "epoch": 1.1998078770413065, + "grad_norm": 452.0, + "learning_rate": 4.4680851063829783e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.833984375, + "logps/chosen": -522.75, + "logps/rejected": -299.75, + "loss": 1.6963, + "nll_loss": 1.01513671875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.76171875, + "rewards/margins": 2.6083984375, + "rewards/rejected": 1.155517578125, + "step": 313 + }, + { + "epoch": 1.2036503362151778, + "grad_norm": 1576.0, + "learning_rate": 4.4468085106382975e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.85546875, + "logps/chosen": -430.875, + "logps/rejected": -230.125, + "loss": 1.4121, + "nll_loss": 0.85986328125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1748046875, + "rewards/margins": 2.5478515625, + "rewards/rejected": 0.6265869140625, + "step": 314 + }, + { + "epoch": 1.207492795389049, + "grad_norm": 133.0, + "learning_rate": 4.425531914893617e-07, + "logits/chosen": -3.82421875, + "logits/rejected": 
-3.923828125, + "logps/chosen": -390.125, + "logps/rejected": -284.0, + "loss": 1.5303, + "nll_loss": 0.92431640625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.013671875, + "rewards/margins": 3.7646484375, + "rewards/rejected": -0.7373046875, + "step": 315 + }, + { + "epoch": 1.2113352545629203, + "grad_norm": 824.0, + "learning_rate": 4.404255319148936e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.712890625, + "logps/chosen": -399.375, + "logps/rejected": -222.125, + "loss": 1.6523, + "nll_loss": 1.03857421875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6875, + "rewards/margins": 1.97265625, + "rewards/rejected": 0.71649169921875, + "step": 316 + }, + { + "epoch": 1.2151777137367916, + "grad_norm": 158.0, + "learning_rate": 4.382978723404255e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.720703125, + "logps/chosen": -334.375, + "logps/rejected": -171.75, + "loss": 1.3071, + "nll_loss": 0.75, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6494140625, + "rewards/margins": 2.4697265625, + "rewards/rejected": 0.17755126953125, + "step": 317 + }, + { + "epoch": 1.219020172910663, + "grad_norm": 100.0, + "learning_rate": 4.3617021276595744e-07, + "logits/chosen": -3.712890625, + "logits/rejected": -3.814453125, + "logps/chosen": -425.75, + "logps/rejected": -225.375, + "loss": 1.4238, + "nll_loss": 0.91650390625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.310546875, + "rewards/margins": 2.978515625, + "rewards/rejected": 0.3358154296875, + "step": 318 + }, + { + "epoch": 1.2228626320845342, + "grad_norm": 172.0, + "learning_rate": 4.3404255319148936e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.8125, + "logps/chosen": -497.375, + "logps/rejected": -220.5625, + "loss": 1.5586, + "nll_loss": 1.03662109375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.90625, + "rewards/margins": 3.109375, + "rewards/rejected": 0.800506591796875, + "step": 319 + }, + { + "epoch": 1.2267050912584054, + "grad_norm": 210.0, + "learning_rate": 4.319148936170213e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.90234375, + "logps/chosen": -482.0, + "logps/rejected": -265.125, + "loss": 1.3574, + "nll_loss": 0.853515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.751953125, + "rewards/margins": 3.07421875, + "rewards/rejected": 0.68212890625, + "step": 320 + }, + { + "epoch": 1.2305475504322767, + "grad_norm": 232.0, + "learning_rate": 4.297872340425532e-07, + "logits/chosen": -3.830078125, + "logits/rejected": -3.8828125, + "logps/chosen": -427.75, + "logps/rejected": -252.625, + "loss": 1.4141, + "nll_loss": 0.83203125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.677734375, + "rewards/margins": 2.798828125, + "rewards/rejected": 0.8818359375, + "step": 321 + }, + { + "epoch": 1.234390009606148, + "grad_norm": 768.0, + "learning_rate": 4.276595744680851e-07, + "logits/chosen": -3.78515625, + "logits/rejected": -3.6796875, + "logps/chosen": -448.75, + "logps/rejected": -296.375, + "loss": 1.4736, + "nll_loss": 0.94091796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.634765625, + "rewards/margins": 3.001953125, + "rewards/rejected": 0.62884521484375, + "step": 322 + }, + { + "epoch": 1.2382324687800192, + "grad_norm": 140.0, + "learning_rate": 4.25531914893617e-07, + "logits/chosen": -3.771484375, + "logits/rejected": -3.79296875, + "logps/chosen": -463.125, + "logps/rejected": -242.5, + "loss": 1.3281, + "nll_loss": 0.830078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 
3.7841796875, + "rewards/margins": 3.16796875, + "rewards/rejected": 0.6173095703125, + "step": 323 + }, + { + "epoch": 1.2420749279538905, + "grad_norm": 199.0, + "learning_rate": 4.234042553191489e-07, + "logits/chosen": -3.84765625, + "logits/rejected": -3.900390625, + "logps/chosen": -424.0, + "logps/rejected": -233.0, + "loss": 1.501, + "nll_loss": 0.9853515625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.41015625, + "rewards/margins": 2.8115234375, + "rewards/rejected": 0.5987548828125, + "step": 324 + }, + { + "epoch": 1.2459173871277618, + "grad_norm": 117.0, + "learning_rate": 4.2127659574468083e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.806640625, + "logps/chosen": -437.75, + "logps/rejected": -218.125, + "loss": 1.3887, + "nll_loss": 0.86279296875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.91015625, + "rewards/margins": 3.0966796875, + "rewards/rejected": 0.812255859375, + "step": 325 + }, + { + "epoch": 1.249759846301633, + "grad_norm": 205.0, + "learning_rate": 4.1914893617021275e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.83203125, + "logps/chosen": -397.125, + "logps/rejected": -258.375, + "loss": 1.5479, + "nll_loss": 0.93994140625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.140625, + "rewards/margins": 2.1328125, + "rewards/rejected": 1.0125732421875, + "step": 326 + }, + { + "epoch": 1.2536023054755043, + "grad_norm": 384.0, + "learning_rate": 4.1702127659574467e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.92578125, + "logps/chosen": -506.25, + "logps/rejected": -238.0625, + "loss": 1.4082, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.923828125, + "rewards/margins": 3.4775390625, + "rewards/rejected": 0.450531005859375, + "step": 327 + }, + { + "epoch": 1.2574447646493756, + "grad_norm": 254.0, + "learning_rate": 4.148936170212766e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.8203125, + "logps/chosen": -413.0, + "logps/rejected": -200.6875, + "loss": 1.3945, + "nll_loss": 0.85400390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.18359375, + "rewards/margins": 2.630859375, + "rewards/rejected": 0.551025390625, + "step": 328 + }, + { + "epoch": 1.2612872238232469, + "grad_norm": 131.0, + "learning_rate": 4.1276595744680846e-07, + "logits/chosen": -3.810546875, + "logits/rejected": -3.787109375, + "logps/chosen": -409.0, + "logps/rejected": -221.375, + "loss": 1.2715, + "nll_loss": 0.8203125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5869140625, + "rewards/margins": 3.2490234375, + "rewards/rejected": 0.33172607421875, + "step": 329 + }, + { + "epoch": 1.2651296829971181, + "grad_norm": 174.0, + "learning_rate": 4.106382978723404e-07, + "logits/chosen": -3.7578125, + "logits/rejected": -3.791015625, + "logps/chosen": -462.5, + "logps/rejected": -227.375, + "loss": 1.3652, + "nll_loss": 0.873046875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.701171875, + "rewards/margins": 3.0458984375, + "rewards/rejected": 0.65032958984375, + "step": 330 + }, + { + "epoch": 1.2689721421709894, + "grad_norm": 187.0, + "learning_rate": 4.085106382978723e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.767578125, + "logps/chosen": -365.125, + "logps/rejected": -272.75, + "loss": 1.7578, + "nll_loss": 1.02001953125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.068359375, + "rewards/margins": 1.67138671875, + "rewards/rejected": 1.393798828125, + "step": 331 + }, + { + "epoch": 1.2728146013448607, + 
"grad_norm": 500.0, + "learning_rate": 4.063829787234042e-07, + "logits/chosen": -3.75390625, + "logits/rejected": -3.857421875, + "logps/chosen": -374.75, + "logps/rejected": -262.375, + "loss": 1.6846, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4794921875, + "rewards/margins": 1.3084716796875, + "rewards/rejected": 1.16796875, + "step": 332 + }, + { + "epoch": 1.276657060518732, + "grad_norm": 131.0, + "learning_rate": 4.0425531914893614e-07, + "logits/chosen": -3.84375, + "logits/rejected": -3.80078125, + "logps/chosen": -398.25, + "logps/rejected": -251.25, + "loss": 1.4033, + "nll_loss": 0.7705078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.2783203125, + "rewards/margins": 2.5927734375, + "rewards/rejected": 0.68408203125, + "step": 333 + }, + { + "epoch": 1.2804995196926032, + "grad_norm": 157.0, + "learning_rate": 4.021276595744681e-07, + "logits/chosen": -3.697265625, + "logits/rejected": -3.716796875, + "logps/chosen": -422.75, + "logps/rejected": -260.375, + "loss": 1.4355, + "nll_loss": 0.8720703125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.359375, + "rewards/margins": 2.59375, + "rewards/rejected": 0.765869140625, + "step": 334 + }, + { + "epoch": 1.2843419788664745, + "grad_norm": 322.0, + "learning_rate": 4e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.83984375, + "logps/chosen": -420.5, + "logps/rejected": -257.5, + "loss": 1.5049, + "nll_loss": 0.91650390625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6259765625, + "rewards/margins": 2.541015625, + "rewards/rejected": 1.08428955078125, + "step": 335 + }, + { + "epoch": 1.2881844380403458, + "grad_norm": 163.0, + "learning_rate": 3.978723404255319e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.759765625, + "logps/chosen": -469.75, + "logps/rejected": -242.875, + "loss": 1.5342, + "nll_loss": 0.97216796875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.38671875, + "rewards/margins": 3.173828125, + "rewards/rejected": 1.216796875, + "step": 336 + }, + { + "epoch": 1.292026897214217, + "grad_norm": 109.0, + "learning_rate": 3.957446808510638e-07, + "logits/chosen": -3.82421875, + "logits/rejected": -3.8046875, + "logps/chosen": -375.375, + "logps/rejected": -231.0, + "loss": 1.4824, + "nll_loss": 0.90478515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.17578125, + "rewards/margins": 2.517578125, + "rewards/rejected": 0.65863037109375, + "step": 337 + }, + { + "epoch": 1.2958693563880883, + "grad_norm": 130.0, + "learning_rate": 3.9361702127659574e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.787109375, + "logps/chosen": -381.875, + "logps/rejected": -210.375, + "loss": 1.499, + "nll_loss": 0.916015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8798828125, + "rewards/margins": 2.272705078125, + "rewards/rejected": 0.607666015625, + "step": 338 + }, + { + "epoch": 1.2997118155619596, + "grad_norm": 226.0, + "learning_rate": 3.9148936170212766e-07, + "logits/chosen": -3.705078125, + "logits/rejected": -3.705078125, + "logps/chosen": -353.75, + "logps/rejected": -218.375, + "loss": 1.5723, + "nll_loss": 0.89892578125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.970703125, + "rewards/margins": 1.94580078125, + "rewards/rejected": 1.0252685546875, + "step": 339 + }, + { + "epoch": 1.3035542747358309, + "grad_norm": 107.0, + "learning_rate": 3.893617021276596e-07, + "logits/chosen": -3.703125, + "logits/rejected": -3.791015625, + "logps/chosen": -460.5, + 
"logps/rejected": -256.0, + "loss": 1.231, + "nll_loss": 0.775390625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.919921875, + "rewards/margins": 3.4453125, + "rewards/rejected": 0.4830322265625, + "step": 340 + }, + { + "epoch": 1.3073967339097021, + "grad_norm": 124.5, + "learning_rate": 3.8723404255319145e-07, + "logits/chosen": -3.755859375, + "logits/rejected": -3.75, + "logps/chosen": -546.25, + "logps/rejected": -261.375, + "loss": 1.4619, + "nll_loss": 0.95166015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.07421875, + "rewards/margins": 3.35546875, + "rewards/rejected": 0.7132568359375, + "step": 341 + }, + { + "epoch": 1.3112391930835736, + "grad_norm": 1004.0, + "learning_rate": 3.8510638297872337e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.826171875, + "logps/chosen": -302.75, + "logps/rejected": -217.3125, + "loss": 1.8906, + "nll_loss": 1.08544921875, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9658203125, + "rewards/margins": 1.1220703125, + "rewards/rejected": 0.84307861328125, + "step": 342 + }, + { + "epoch": 1.3150816522574447, + "grad_norm": 266.0, + "learning_rate": 3.829787234042553e-07, + "logits/chosen": -3.81640625, + "logits/rejected": -3.85546875, + "logps/chosen": -408.5, + "logps/rejected": -242.125, + "loss": 1.499, + "nll_loss": 0.9228515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.435546875, + "rewards/margins": 2.6708984375, + "rewards/rejected": 0.76068115234375, + "step": 343 + }, + { + "epoch": 1.3189241114313162, + "grad_norm": 138.0, + "learning_rate": 3.808510638297872e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.884765625, + "logps/chosen": -417.75, + "logps/rejected": -219.0, + "loss": 1.3809, + "nll_loss": 0.82666015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3515625, + "rewards/margins": 2.716796875, + "rewards/rejected": 0.635009765625, + "step": 344 + }, + { + "epoch": 1.3227665706051872, + "grad_norm": 132.0, + "learning_rate": 3.7872340425531914e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.818359375, + "logps/chosen": -424.875, + "logps/rejected": -196.875, + "loss": 1.4971, + "nll_loss": 0.94189453125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4873046875, + "rewards/margins": 2.918212890625, + "rewards/rejected": 0.56536865234375, + "step": 345 + }, + { + "epoch": 1.3266090297790587, + "grad_norm": 264.0, + "learning_rate": 3.7659574468085106e-07, + "logits/chosen": -3.7578125, + "logits/rejected": -3.81640625, + "logps/chosen": -509.875, + "logps/rejected": -279.875, + "loss": 1.3369, + "nll_loss": 0.80810546875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.484375, + "rewards/margins": 3.0107421875, + "rewards/rejected": 0.4742431640625, + "step": 346 + }, + { + "epoch": 1.3266090297790587, + "eval_logits/chosen": -3.7707386016845703, + "eval_logits/rejected": -3.823153495788574, + "eval_logps/chosen": -449.9454650878906, + "eval_logps/rejected": -244.11817932128906, + "eval_loss": 1.467628836631775, + "eval_nll_loss": 0.8924893736839294, + "eval_rewards/accuracies": 0.8901515007019043, + "eval_rewards/chosen": 3.547159194946289, + "eval_rewards/margins": 2.7079367637634277, + "eval_rewards/rejected": 0.8406782746315002, + "eval_runtime": 105.568, + "eval_samples_per_second": 4.158, + "eval_steps_per_second": 1.042, + "step": 346 + }, + { + "epoch": 1.3304514889529298, + "grad_norm": 132.0, + "learning_rate": 3.744680851063829e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.873046875, + 
"logps/chosen": -434.625, + "logps/rejected": -233.4375, + "loss": 1.5273, + "nll_loss": 0.97607421875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.611328125, + "rewards/margins": 2.88330078125, + "rewards/rejected": 0.726318359375, + "step": 347 + }, + { + "epoch": 1.3342939481268012, + "grad_norm": 212.0, + "learning_rate": 3.7234042553191484e-07, + "logits/chosen": -3.78125, + "logits/rejected": -3.765625, + "logps/chosen": -458.0, + "logps/rejected": -254.375, + "loss": 1.4951, + "nll_loss": 0.9189453125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2900390625, + "rewards/margins": 2.79833984375, + "rewards/rejected": 0.493408203125, + "step": 348 + }, + { + "epoch": 1.3381364073006723, + "grad_norm": 172.0, + "learning_rate": 3.7021276595744676e-07, + "logits/chosen": -3.7265625, + "logits/rejected": -3.75, + "logps/chosen": -403.75, + "logps/rejected": -216.75, + "loss": 1.3994, + "nll_loss": 0.82177734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.111328125, + "rewards/margins": 2.52880859375, + "rewards/rejected": 0.5870361328125, + "step": 349 + }, + { + "epoch": 1.3419788664745438, + "grad_norm": 158.0, + "learning_rate": 3.680851063829787e-07, + "logits/chosen": -3.73046875, + "logits/rejected": -3.80078125, + "logps/chosen": -388.75, + "logps/rejected": -224.875, + "loss": 1.5547, + "nll_loss": 0.9658203125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.193359375, + "rewards/margins": 2.294921875, + "rewards/rejected": 0.89874267578125, + "step": 350 + }, + { + "epoch": 1.345821325648415, + "grad_norm": 199.0, + "learning_rate": 3.659574468085106e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.912109375, + "logps/chosen": -346.375, + "logps/rejected": -180.6875, + "loss": 1.4014, + "nll_loss": 0.7705078125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.30078125, + "rewards/margins": 2.39794921875, + "rewards/rejected": 0.9044189453125, + "step": 351 + }, + { + "epoch": 1.3496637848222863, + "grad_norm": 126.5, + "learning_rate": 3.638297872340426e-07, + "logits/chosen": -3.67578125, + "logits/rejected": -3.73046875, + "logps/chosen": -337.25, + "logps/rejected": -189.5, + "loss": 1.3066, + "nll_loss": 0.759765625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6025390625, + "rewards/margins": 2.5107421875, + "rewards/rejected": 0.0919189453125, + "step": 352 + }, + { + "epoch": 1.3535062439961576, + "grad_norm": 235.0, + "learning_rate": 3.617021276595745e-07, + "logits/chosen": -3.794921875, + "logits/rejected": -3.84765625, + "logps/chosen": -466.25, + "logps/rejected": -270.625, + "loss": 1.3955, + "nll_loss": 0.86181640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.29296875, + "rewards/margins": 2.8525390625, + "rewards/rejected": 0.4505615234375, + "step": 353 + }, + { + "epoch": 1.3573487031700289, + "grad_norm": 1568.0, + "learning_rate": 3.5957446808510637e-07, + "logits/chosen": -3.70703125, + "logits/rejected": -3.78515625, + "logps/chosen": -518.5, + "logps/rejected": -246.375, + "loss": 1.4619, + "nll_loss": 0.8994140625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.501953125, + "rewards/margins": 2.8076171875, + "rewards/rejected": 0.6917724609375, + "step": 354 + }, + { + "epoch": 1.3611911623439001, + "grad_norm": 130.0, + "learning_rate": 3.574468085106383e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.869140625, + "logps/chosen": -482.875, + "logps/rejected": -281.5, + "loss": 1.4849, + "nll_loss": 0.91357421875, + "rewards/accuracies": 0.8125, + 
"rewards/chosen": 3.796875, + "rewards/margins": 2.937255859375, + "rewards/rejected": 0.86181640625, + "step": 355 + }, + { + "epoch": 1.3650336215177714, + "grad_norm": 114.0, + "learning_rate": 3.553191489361702e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.8203125, + "logps/chosen": -576.5, + "logps/rejected": -302.5, + "loss": 1.3799, + "nll_loss": 0.89990234375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.92578125, + "rewards/margins": 4.138671875, + "rewards/rejected": 0.7935791015625, + "step": 356 + }, + { + "epoch": 1.3688760806916427, + "grad_norm": 134.0, + "learning_rate": 3.5319148936170213e-07, + "logits/chosen": -3.708984375, + "logits/rejected": -3.791015625, + "logps/chosen": -375.875, + "logps/rejected": -212.25, + "loss": 1.5215, + "nll_loss": 0.90869140625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8046875, + "rewards/margins": 2.072265625, + "rewards/rejected": 0.736572265625, + "step": 357 + }, + { + "epoch": 1.372718539865514, + "grad_norm": 88.0, + "learning_rate": 3.5106382978723405e-07, + "logits/chosen": -3.78515625, + "logits/rejected": -3.78515625, + "logps/chosen": -546.25, + "logps/rejected": -249.125, + "loss": 1.1406, + "nll_loss": 0.724853515625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.2509765625, + "rewards/margins": 4.0986328125, + "rewards/rejected": 0.158203125, + "step": 358 + }, + { + "epoch": 1.3765609990393852, + "grad_norm": 226.0, + "learning_rate": 3.4893617021276597e-07, + "logits/chosen": -3.748046875, + "logits/rejected": -3.8125, + "logps/chosen": -464.625, + "logps/rejected": -224.75, + "loss": 1.2598, + "nll_loss": 0.81201171875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.9619140625, + "rewards/margins": 3.470703125, + "rewards/rejected": 0.49176025390625, + "step": 359 + }, + { + "epoch": 1.3804034582132565, + "grad_norm": 121.0, + "learning_rate": 3.4680851063829784e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.75390625, + "logps/chosen": -352.875, + "logps/rejected": -189.375, + "loss": 1.502, + "nll_loss": 0.904296875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.603515625, + "rewards/margins": 2.2353515625, + "rewards/rejected": 0.3668060302734375, + "step": 360 + }, + { + "epoch": 1.3842459173871278, + "grad_norm": 106.5, + "learning_rate": 3.4468085106382976e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.7890625, + "logps/chosen": -417.5, + "logps/rejected": -204.5, + "loss": 1.3105, + "nll_loss": 0.8134765625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.7421875, + "rewards/margins": 3.220703125, + "rewards/rejected": 0.51953125, + "step": 361 + }, + { + "epoch": 1.388088376560999, + "grad_norm": 151.0, + "learning_rate": 3.425531914893617e-07, + "logits/chosen": -3.720703125, + "logits/rejected": -3.794921875, + "logps/chosen": -367.25, + "logps/rejected": -226.25, + "loss": 1.4775, + "nll_loss": 0.90966796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.19140625, + "rewards/margins": 2.513671875, + "rewards/rejected": 0.6826171875, + "step": 362 + }, + { + "epoch": 1.3919308357348703, + "grad_norm": 136.0, + "learning_rate": 3.404255319148936e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.853515625, + "logps/chosen": -420.875, + "logps/rejected": -216.25, + "loss": 1.3384, + "nll_loss": 0.84130859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.783203125, + "rewards/margins": 3.1240234375, + "rewards/rejected": 0.6591796875, + "step": 363 + }, + { + "epoch": 1.3957732949087416, + 
"grad_norm": 146.0, + "learning_rate": 3.382978723404255e-07, + "logits/chosen": -3.708984375, + "logits/rejected": -3.72265625, + "logps/chosen": -426.125, + "logps/rejected": -247.0, + "loss": 1.4961, + "nll_loss": 0.96923828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.18359375, + "rewards/margins": 2.9052734375, + "rewards/rejected": 0.276580810546875, + "step": 364 + }, + { + "epoch": 1.3996157540826129, + "grad_norm": 117.5, + "learning_rate": 3.3617021276595744e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.78515625, + "logps/chosen": -398.0, + "logps/rejected": -222.5, + "loss": 1.4443, + "nll_loss": 0.8564453125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.376953125, + "rewards/margins": 2.5869140625, + "rewards/rejected": 0.78985595703125, + "step": 365 + }, + { + "epoch": 1.4034582132564841, + "grad_norm": 168.0, + "learning_rate": 3.340425531914893e-07, + "logits/chosen": -3.810546875, + "logits/rejected": -3.849609375, + "logps/chosen": -377.0, + "logps/rejected": -217.0, + "loss": 1.6006, + "nll_loss": 0.962890625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.234375, + "rewards/margins": 2.22802734375, + "rewards/rejected": 1.00927734375, + "step": 366 + }, + { + "epoch": 1.4073006724303554, + "grad_norm": 163.0, + "learning_rate": 3.3191489361702123e-07, + "logits/chosen": -3.828125, + "logits/rejected": -3.87890625, + "logps/chosen": -439.125, + "logps/rejected": -221.0, + "loss": 1.4766, + "nll_loss": 0.92626953125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.61328125, + "rewards/margins": 2.892578125, + "rewards/rejected": 0.722320556640625, + "step": 367 + }, + { + "epoch": 1.4111431316042267, + "grad_norm": 140.0, + "learning_rate": 3.2978723404255315e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.837890625, + "logps/chosen": -416.5, + "logps/rejected": -222.0, + "loss": 1.3984, + "nll_loss": 0.890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.490234375, + "rewards/margins": 3.509765625, + "rewards/rejected": 0.98284912109375, + "step": 368 + }, + { + "epoch": 1.414985590778098, + "grad_norm": 212.0, + "learning_rate": 3.2765957446808507e-07, + "logits/chosen": -3.8203125, + "logits/rejected": -3.8671875, + "logps/chosen": -520.5, + "logps/rejected": -225.5, + "loss": 1.3779, + "nll_loss": 0.9453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.25390625, + "rewards/margins": 3.818359375, + "rewards/rejected": 0.4375, + "step": 369 + }, + { + "epoch": 1.4188280499519692, + "grad_norm": 150.0, + "learning_rate": 3.2553191489361704e-07, + "logits/chosen": -3.814453125, + "logits/rejected": -3.84375, + "logps/chosen": -382.875, + "logps/rejected": -210.625, + "loss": 1.3022, + "nll_loss": 0.78564453125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205078125, + "rewards/margins": 2.7626953125, + "rewards/rejected": 0.44091796875, + "step": 370 + }, + { + "epoch": 1.4226705091258405, + "grad_norm": 108.0, + "learning_rate": 3.2340425531914897e-07, + "logits/chosen": -3.798828125, + "logits/rejected": -3.8125, + "logps/chosen": -347.375, + "logps/rejected": -177.4375, + "loss": 1.3652, + "nll_loss": 0.85693359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9873046875, + "rewards/margins": 2.8896484375, + "rewards/rejected": 0.1068115234375, + "step": 371 + }, + { + "epoch": 1.4265129682997117, + "grad_norm": 136.0, + "learning_rate": 3.2127659574468083e-07, + "logits/chosen": -3.87109375, + "logits/rejected": -3.875, + "logps/chosen": -476.75, + "logps/rejected": 
-257.375, + "loss": 1.3311, + "nll_loss": 0.8154296875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.337890625, + "rewards/margins": 3.515625, + "rewards/rejected": 0.81591796875, + "step": 372 + }, + { + "epoch": 1.430355427473583, + "grad_norm": 209.0, + "learning_rate": 3.1914893617021275e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.83203125, + "logps/chosen": -459.0, + "logps/rejected": -273.25, + "loss": 1.5186, + "nll_loss": 0.870361328125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 4.2255859375, + "rewards/margins": 2.943359375, + "rewards/rejected": 1.2890625, + "step": 373 + }, + { + "epoch": 1.4341978866474543, + "grad_norm": 117.0, + "learning_rate": 3.170212765957447e-07, + "logits/chosen": -3.783203125, + "logits/rejected": -3.8515625, + "logps/chosen": -410.75, + "logps/rejected": -221.25, + "loss": 1.5859, + "nll_loss": 1.03955078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.107421875, + "rewards/margins": 2.560546875, + "rewards/rejected": 0.5440673828125, + "step": 374 + }, + { + "epoch": 1.4380403458213258, + "grad_norm": 110.5, + "learning_rate": 3.148936170212766e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.76953125, + "logps/chosen": -471.375, + "logps/rejected": -225.125, + "loss": 1.3662, + "nll_loss": 0.8955078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7314453125, + "rewards/margins": 3.283203125, + "rewards/rejected": 0.4498291015625, + "step": 375 + }, + { + "epoch": 1.4418828049951968, + "grad_norm": 161.0, + "learning_rate": 3.127659574468085e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.7265625, + "logps/chosen": -422.875, + "logps/rejected": -218.0, + "loss": 1.3506, + "nll_loss": 0.8134765625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.15625, + "rewards/margins": 2.7646484375, + "rewards/rejected": 0.390472412109375, + "step": 376 + }, + { + "epoch": 1.4457252641690683, + "grad_norm": 362.0, + "learning_rate": 3.1063829787234044e-07, + "logits/chosen": -3.693359375, + "logits/rejected": -3.728515625, + "logps/chosen": -411.25, + "logps/rejected": -279.125, + "loss": 1.457, + "nll_loss": 0.78955078125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.998046875, + "rewards/margins": 1.990234375, + "rewards/rejected": 1.0076904296875, + "step": 377 + }, + { + "epoch": 1.4495677233429394, + "grad_norm": 139.0, + "learning_rate": 3.085106382978723e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.798828125, + "logps/chosen": -391.5, + "logps/rejected": -253.25, + "loss": 1.4443, + "nll_loss": 0.93017578125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3154296875, + "rewards/margins": 2.7861328125, + "rewards/rejected": 0.5325927734375, + "step": 378 + }, + { + "epoch": 1.4534101825168109, + "grad_norm": 139.0, + "learning_rate": 3.063829787234042e-07, + "logits/chosen": -3.833984375, + "logits/rejected": -3.849609375, + "logps/chosen": -511.0, + "logps/rejected": -270.875, + "loss": 1.4028, + "nll_loss": 0.830078125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.9521484375, + "rewards/margins": 2.808349609375, + "rewards/rejected": 1.154541015625, + "step": 379 + }, + { + "epoch": 1.457252641690682, + "grad_norm": 144.0, + "learning_rate": 3.0425531914893615e-07, + "logits/chosen": -3.708984375, + "logits/rejected": -3.724609375, + "logps/chosen": -330.125, + "logps/rejected": -178.5, + "loss": 1.5957, + "nll_loss": 0.94384765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.669921875, + "rewards/margins": 
2.0262451171875, + "rewards/rejected": 0.6431884765625, + "step": 380 + }, + { + "epoch": 1.4610951008645534, + "grad_norm": 132.0, + "learning_rate": 3.0212765957446807e-07, + "logits/chosen": -3.66015625, + "logits/rejected": -3.693359375, + "logps/chosen": -444.25, + "logps/rejected": -227.0625, + "loss": 1.501, + "nll_loss": 0.9794921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.435546875, + "rewards/margins": 2.986328125, + "rewards/rejected": 0.45013427734375, + "step": 381 + }, + { + "epoch": 1.4649375600384245, + "grad_norm": 350.0, + "learning_rate": 3e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.716796875, + "logps/chosen": -413.125, + "logps/rejected": -245.625, + "loss": 1.5527, + "nll_loss": 0.8955078125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.140625, + "rewards/margins": 2.13623046875, + "rewards/rejected": 1.0045166015625, + "step": 382 + }, + { + "epoch": 1.468780019212296, + "grad_norm": 122.5, + "learning_rate": 2.978723404255319e-07, + "logits/chosen": -3.73046875, + "logits/rejected": -3.7421875, + "logps/chosen": -371.5, + "logps/rejected": -210.5, + "loss": 1.4717, + "nll_loss": 0.8662109375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.955078125, + "rewards/margins": 2.3955078125, + "rewards/rejected": 0.56134033203125, + "step": 383 + }, + { + "epoch": 1.4726224783861672, + "grad_norm": 332.0, + "learning_rate": 2.957446808510638e-07, + "logits/chosen": -3.78125, + "logits/rejected": -3.8046875, + "logps/chosen": -358.125, + "logps/rejected": -257.875, + "loss": 1.4961, + "nll_loss": 0.828125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.185546875, + "rewards/margins": 1.892578125, + "rewards/rejected": 1.294189453125, + "step": 384 + }, + { + "epoch": 1.4764649375600385, + "grad_norm": 144.0, + "learning_rate": 2.936170212765957e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.830078125, + "logps/chosen": -359.25, + "logps/rejected": -192.875, + "loss": 1.3262, + "nll_loss": 0.7802734375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.283203125, + "rewards/margins": 2.6064453125, + "rewards/rejected": 0.679779052734375, + "step": 385 + }, + { + "epoch": 1.4803073967339098, + "grad_norm": 131.0, + "learning_rate": 2.914893617021276e-07, + "logits/chosen": -3.8671875, + "logits/rejected": -3.953125, + "logps/chosen": -456.125, + "logps/rejected": -230.25, + "loss": 1.417, + "nll_loss": 0.91162109375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.0986328125, + "rewards/margins": 3.38134765625, + "rewards/rejected": 0.721435546875, + "step": 386 + }, + { + "epoch": 1.484149855907781, + "grad_norm": 204.0, + "learning_rate": 2.8936170212765954e-07, + "logits/chosen": -3.73828125, + "logits/rejected": -3.818359375, + "logps/chosen": -488.125, + "logps/rejected": -247.75, + "loss": 1.4922, + "nll_loss": 0.93603515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.921875, + "rewards/margins": 3.092041015625, + "rewards/rejected": 0.8319091796875, + "step": 387 + }, + { + "epoch": 1.4879923150816523, + "grad_norm": 270.0, + "learning_rate": 2.872340425531915e-07, + "logits/chosen": -3.826171875, + "logits/rejected": -3.8515625, + "logps/chosen": -502.0, + "logps/rejected": -229.625, + "loss": 1.4746, + "nll_loss": 0.986328125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.97265625, + "rewards/margins": 3.47265625, + "rewards/rejected": 0.50445556640625, + "step": 388 + }, + { + "epoch": 1.4918347742555236, + "grad_norm": 132.0, + "learning_rate": 2.8510638297872343e-07, 
+ "logits/chosen": -3.751953125, + "logits/rejected": -3.88671875, + "logps/chosen": -349.75, + "logps/rejected": -197.875, + "loss": 1.6855, + "nll_loss": 0.98388671875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8115234375, + "rewards/margins": 1.86767578125, + "rewards/rejected": 0.94500732421875, + "step": 389 + }, + { + "epoch": 1.4956772334293948, + "grad_norm": 109.0, + "learning_rate": 2.8297872340425535e-07, + "logits/chosen": -3.783203125, + "logits/rejected": -3.828125, + "logps/chosen": -486.25, + "logps/rejected": -247.75, + "loss": 1.5098, + "nll_loss": 0.96826171875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.1826171875, + "rewards/margins": 3.294921875, + "rewards/rejected": 0.888427734375, + "step": 390 + }, + { + "epoch": 1.4995196926032661, + "grad_norm": 572.0, + "learning_rate": 2.808510638297872e-07, + "logits/chosen": -3.859375, + "logits/rejected": -3.8515625, + "logps/chosen": -421.25, + "logps/rejected": -251.375, + "loss": 1.6006, + "nll_loss": 0.93798828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6220703125, + "rewards/margins": 2.410400390625, + "rewards/rejected": 1.2132568359375, + "step": 391 + }, + { + "epoch": 1.5033621517771374, + "grad_norm": 122.5, + "learning_rate": 2.7872340425531914e-07, + "logits/chosen": -3.7578125, + "logits/rejected": -3.783203125, + "logps/chosen": -467.25, + "logps/rejected": -236.0, + "loss": 1.3867, + "nll_loss": 0.8779296875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.9296875, + "rewards/margins": 3.3173828125, + "rewards/rejected": 0.610107421875, + "step": 392 + }, + { + "epoch": 1.5072046109510087, + "grad_norm": 167.0, + "learning_rate": 2.7659574468085106e-07, + "logits/chosen": -3.77734375, + "logits/rejected": -3.765625, + "logps/chosen": -426.5, + "logps/rejected": -215.125, + "loss": 1.3301, + "nll_loss": 0.8291015625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.544921875, + "rewards/margins": 3.0537109375, + "rewards/rejected": 0.49139404296875, + "step": 393 + }, + { + "epoch": 1.51104707012488, + "grad_norm": 126.0, + "learning_rate": 2.74468085106383e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.759765625, + "logps/chosen": -493.75, + "logps/rejected": -209.15625, + "loss": 1.3164, + "nll_loss": 0.85693359375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.9072265625, + "rewards/margins": 3.3427734375, + "rewards/rejected": 0.5655517578125, + "step": 394 + }, + { + "epoch": 1.5148895292987512, + "grad_norm": 192.0, + "learning_rate": 2.723404255319149e-07, + "logits/chosen": -3.88671875, + "logits/rejected": -3.94140625, + "logps/chosen": -427.625, + "logps/rejected": -220.4375, + "loss": 1.583, + "nll_loss": 1.02294921875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.814453125, + "rewards/margins": 2.814453125, + "rewards/rejected": 1.00390625, + "step": 395 + }, + { + "epoch": 1.5187319884726225, + "grad_norm": 254.0, + "learning_rate": 2.702127659574468e-07, + "logits/chosen": -3.80078125, + "logits/rejected": -3.8671875, + "logps/chosen": -444.75, + "logps/rejected": -249.25, + "loss": 1.396, + "nll_loss": 0.92919921875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.712890625, + "rewards/margins": 3.16015625, + "rewards/rejected": 0.54913330078125, + "step": 396 + }, + { + "epoch": 1.5225744476464937, + "grad_norm": 161.0, + "learning_rate": 2.680851063829787e-07, + "logits/chosen": -3.80078125, + "logits/rejected": -3.8359375, + "logps/chosen": -389.875, + "logps/rejected": -191.25, + "loss": 1.4697, + 
"nll_loss": 0.9091796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9658203125, + "rewards/margins": 2.46484375, + "rewards/rejected": 0.49749755859375, + "step": 397 + }, + { + "epoch": 1.526416906820365, + "grad_norm": 104.5, + "learning_rate": 2.659574468085106e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.802734375, + "logps/chosen": -506.0, + "logps/rejected": -252.3125, + "loss": 1.4785, + "nll_loss": 0.94140625, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.009765625, + "rewards/margins": 3.05517578125, + "rewards/rejected": 0.9501953125, + "step": 398 + }, + { + "epoch": 1.5302593659942363, + "grad_norm": 118.0, + "learning_rate": 2.6382978723404253e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.841796875, + "logps/chosen": -473.875, + "logps/rejected": -284.0, + "loss": 1.3027, + "nll_loss": 0.7841796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.810546875, + "rewards/margins": 3.0439453125, + "rewards/rejected": 0.768798828125, + "step": 399 + }, + { + "epoch": 1.5341018251681076, + "grad_norm": 105.5, + "learning_rate": 2.6170212765957445e-07, + "logits/chosen": -3.75, + "logits/rejected": -3.765625, + "logps/chosen": -444.375, + "logps/rejected": -214.0, + "loss": 1.3525, + "nll_loss": 0.84814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3603515625, + "rewards/margins": 2.880859375, + "rewards/rejected": 0.48095703125, + "step": 400 + }, + { + "epoch": 1.5379442843419788, + "grad_norm": 91.0, + "learning_rate": 2.5957446808510637e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.7890625, + "logps/chosen": -502.75, + "logps/rejected": -252.375, + "loss": 1.3247, + "nll_loss": 0.83984375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.89453125, + "rewards/margins": 3.4228515625, + "rewards/rejected": 0.4725341796875, + "step": 401 + }, + { + "epoch": 1.54178674351585, + "grad_norm": 114.5, + "learning_rate": 2.574468085106383e-07, + "logits/chosen": -3.83203125, + "logits/rejected": -3.830078125, + "logps/chosen": -409.875, + "logps/rejected": -215.75, + "loss": 1.4785, + "nll_loss": 0.958984375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.716796875, + "rewards/margins": 3.0224609375, + "rewards/rejected": 0.7022705078125, + "step": 402 + }, + { + "epoch": 1.5456292026897214, + "grad_norm": 176.0, + "learning_rate": 2.5531914893617016e-07, + "logits/chosen": -3.8359375, + "logits/rejected": -3.833984375, + "logps/chosen": -387.0, + "logps/rejected": -284.75, + "loss": 1.4336, + "nll_loss": 0.8095703125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2197265625, + "rewards/margins": 2.4893798828125, + "rewards/rejected": 0.728759765625, + "step": 403 + }, + { + "epoch": 1.5494716618635929, + "grad_norm": 116.0, + "learning_rate": 2.531914893617021e-07, + "logits/chosen": -3.6953125, + "logits/rejected": -3.720703125, + "logps/chosen": -375.5, + "logps/rejected": -208.125, + "loss": 1.4541, + "nll_loss": 0.8828125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6044921875, + "rewards/margins": 2.2109375, + "rewards/rejected": 0.39691162109375, + "step": 404 + }, + { + "epoch": 1.553314121037464, + "grad_norm": 130.0, + "learning_rate": 2.51063829787234e-07, + "logits/chosen": -3.744140625, + "logits/rejected": -3.79296875, + "logps/chosen": -507.25, + "logps/rejected": -275.375, + "loss": 1.3979, + "nll_loss": 0.90673828125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.98046875, + "rewards/margins": 3.3427734375, + "rewards/rejected": 0.6416015625, + 
"step": 405 + }, + { + "epoch": 1.5571565802113354, + "grad_norm": 139.0, + "learning_rate": 2.489361702127659e-07, + "logits/chosen": -3.783203125, + "logits/rejected": -3.85546875, + "logps/chosen": -443.0, + "logps/rejected": -254.25, + "loss": 1.3877, + "nll_loss": 0.87646484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.060546875, + "rewards/margins": 3.326171875, + "rewards/rejected": 0.732757568359375, + "step": 406 + }, + { + "epoch": 1.5609990393852065, + "grad_norm": 113.5, + "learning_rate": 2.4680851063829784e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.814453125, + "logps/chosen": -426.0, + "logps/rejected": -203.1875, + "loss": 1.4102, + "nll_loss": 0.87890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3876953125, + "rewards/margins": 2.7890625, + "rewards/rejected": 0.596923828125, + "step": 407 + }, + { + "epoch": 1.564841498559078, + "grad_norm": 105.5, + "learning_rate": 2.4468085106382976e-07, + "logits/chosen": -3.669921875, + "logits/rejected": -3.6953125, + "logps/chosen": -444.25, + "logps/rejected": -219.375, + "loss": 1.4209, + "nll_loss": 0.87841796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.486328125, + "rewards/margins": 2.830078125, + "rewards/rejected": 0.65350341796875, + "step": 408 + }, + { + "epoch": 1.568683957732949, + "grad_norm": 166.0, + "learning_rate": 2.425531914893617e-07, + "logits/chosen": -3.791015625, + "logits/rejected": -3.85546875, + "logps/chosen": -502.75, + "logps/rejected": -230.125, + "loss": 1.4014, + "nll_loss": 0.892578125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.05078125, + "rewards/margins": 3.19140625, + "rewards/rejected": 0.860107421875, + "step": 409 + }, + { + "epoch": 1.5725264169068205, + "grad_norm": 126.0, + "learning_rate": 2.404255319148936e-07, + "logits/chosen": -3.810546875, + "logits/rejected": -3.890625, + "logps/chosen": -519.0, + "logps/rejected": -268.25, + "loss": 1.4619, + "nll_loss": 0.92529296875, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.51953125, + "rewards/margins": 3.6162109375, + "rewards/rejected": 0.906494140625, + "step": 410 + }, + { + "epoch": 1.5763688760806915, + "grad_norm": 126.5, + "learning_rate": 2.3829787234042553e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.77734375, + "logps/chosen": -493.0, + "logps/rejected": -262.625, + "loss": 1.3809, + "nll_loss": 0.89208984375, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.509765625, + "rewards/margins": 3.623046875, + "rewards/rejected": 0.885009765625, + "step": 411 + }, + { + "epoch": 1.580211335254563, + "grad_norm": 123.5, + "learning_rate": 2.3617021276595745e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.77734375, + "logps/chosen": -434.875, + "logps/rejected": -198.8125, + "loss": 1.374, + "nll_loss": 0.8359375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.552734375, + "rewards/margins": 3.3173828125, + "rewards/rejected": 0.236572265625, + "step": 412 + }, + { + "epoch": 1.584053794428434, + "grad_norm": 264.0, + "learning_rate": 2.3404255319148937e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.798828125, + "logps/chosen": -519.5, + "logps/rejected": -236.625, + "loss": 1.3545, + "nll_loss": 0.88330078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.28125, + "rewards/margins": 3.822265625, + "rewards/rejected": 0.458251953125, + "step": 413 + }, + { + "epoch": 1.5878962536023056, + "grad_norm": 113.5, + "learning_rate": 2.3191489361702126e-07, + "logits/chosen": -3.7890625, + 
"logits/rejected": -3.861328125, + "logps/chosen": -427.25, + "logps/rejected": -206.75, + "loss": 1.418, + "nll_loss": 0.89794921875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3095703125, + "rewards/margins": 2.966796875, + "rewards/rejected": 0.3388671875, + "step": 414 + }, + { + "epoch": 1.5917387127761766, + "grad_norm": 132.0, + "learning_rate": 2.2978723404255318e-07, + "logits/chosen": -3.779296875, + "logits/rejected": -3.8203125, + "logps/chosen": -387.375, + "logps/rejected": -233.25, + "loss": 1.4678, + "nll_loss": 0.8544921875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.294921875, + "rewards/margins": 2.33642578125, + "rewards/rejected": 0.9560546875, + "step": 415 + }, + { + "epoch": 1.5955811719500481, + "grad_norm": 182.0, + "learning_rate": 2.276595744680851e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.78515625, + "logps/chosen": -468.0, + "logps/rejected": -240.125, + "loss": 1.4258, + "nll_loss": 0.8779296875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.7041015625, + "rewards/margins": 3.115234375, + "rewards/rejected": 0.5924072265625, + "step": 416 + }, + { + "epoch": 1.5994236311239192, + "grad_norm": 125.5, + "learning_rate": 2.25531914893617e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.802734375, + "logps/chosen": -442.75, + "logps/rejected": -203.625, + "loss": 1.4106, + "nll_loss": 0.87109375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4873046875, + "rewards/margins": 2.9384765625, + "rewards/rejected": 0.547119140625, + "step": 417 + }, + { + "epoch": 1.6032660902977907, + "grad_norm": 182.0, + "learning_rate": 2.2340425531914892e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.830078125, + "logps/chosen": -511.125, + "logps/rejected": -250.125, + "loss": 1.4727, + "nll_loss": 0.9560546875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.84765625, + "rewards/margins": 3.044921875, + "rewards/rejected": 0.7982177734375, + "step": 418 + }, + { + "epoch": 1.6071085494716617, + "grad_norm": 108.0, + "learning_rate": 2.2127659574468084e-07, + "logits/chosen": -3.767578125, + "logits/rejected": -3.759765625, + "logps/chosen": -366.5, + "logps/rejected": -200.4375, + "loss": 1.4609, + "nll_loss": 0.9072265625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.322265625, + "rewards/margins": 2.6875, + "rewards/rejected": 0.63739013671875, + "step": 419 + }, + { + "epoch": 1.6109510086455332, + "grad_norm": 704.0, + "learning_rate": 2.1914893617021276e-07, + "logits/chosen": -3.748046875, + "logits/rejected": -3.890625, + "logps/chosen": -446.75, + "logps/rejected": -241.125, + "loss": 1.417, + "nll_loss": 0.8505859375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.6181640625, + "rewards/margins": 2.95849609375, + "rewards/rejected": 0.6580810546875, + "step": 420 + }, + { + "epoch": 1.6147934678194045, + "grad_norm": 348.0, + "learning_rate": 2.1702127659574468e-07, + "logits/chosen": -3.73828125, + "logits/rejected": -3.7734375, + "logps/chosen": -422.875, + "logps/rejected": -272.125, + "loss": 1.5107, + "nll_loss": 0.86865234375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.462890625, + "rewards/margins": 2.4482421875, + "rewards/rejected": 1.01708984375, + "step": 421 + }, + { + "epoch": 1.6186359269932757, + "grad_norm": 147.0, + "learning_rate": 2.148936170212766e-07, + "logits/chosen": -3.81640625, + "logits/rejected": -3.861328125, + "logps/chosen": -464.5, + "logps/rejected": -246.875, + "loss": 1.4219, + "nll_loss": 0.8720703125, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 4.146484375, + "rewards/margins": 3.17578125, + "rewards/rejected": 0.97412109375, + "step": 422 + }, + { + "epoch": 1.622478386167147, + "grad_norm": 255.0, + "learning_rate": 2.127659574468085e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.859375, + "logps/chosen": -459.75, + "logps/rejected": -260.25, + "loss": 1.5791, + "nll_loss": 0.9384765625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.189453125, + "rewards/margins": 2.90283203125, + "rewards/rejected": 1.284423828125, + "step": 423 + }, + { + "epoch": 1.6263208453410183, + "grad_norm": 98.5, + "learning_rate": 2.1063829787234041e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.7890625, + "logps/chosen": -460.375, + "logps/rejected": -275.625, + "loss": 1.3672, + "nll_loss": 0.85009765625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.361328125, + "rewards/margins": 3.3310546875, + "rewards/rejected": 1.035400390625, + "step": 424 + }, + { + "epoch": 1.6301633045148896, + "grad_norm": 195.0, + "learning_rate": 2.0851063829787233e-07, + "logits/chosen": -3.736328125, + "logits/rejected": -3.86328125, + "logps/chosen": -345.25, + "logps/rejected": -220.75, + "loss": 1.5742, + "nll_loss": 0.96875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.91796875, + "rewards/margins": 2.271484375, + "rewards/rejected": 0.649169921875, + "step": 425 + }, + { + "epoch": 1.6340057636887608, + "grad_norm": 118.0, + "learning_rate": 2.0638297872340423e-07, + "logits/chosen": -3.736328125, + "logits/rejected": -3.814453125, + "logps/chosen": -524.5, + "logps/rejected": -233.5625, + "loss": 1.4639, + "nll_loss": 0.97705078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1357421875, + "rewards/margins": 3.30859375, + "rewards/rejected": 0.828369140625, + "step": 426 + }, + { + "epoch": 1.637848222862632, + "grad_norm": 183.0, + "learning_rate": 2.0425531914893615e-07, + "logits/chosen": -3.66015625, + "logits/rejected": -3.634765625, + "logps/chosen": -435.75, + "logps/rejected": -208.75, + "loss": 1.4189, + "nll_loss": 0.8916015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3037109375, + "rewards/margins": 2.921875, + "rewards/rejected": 0.383575439453125, + "step": 427 + }, + { + "epoch": 1.6416906820365034, + "grad_norm": 92.5, + "learning_rate": 2.0212765957446807e-07, + "logits/chosen": -3.740234375, + "logits/rejected": -3.818359375, + "logps/chosen": -566.0, + "logps/rejected": -280.875, + "loss": 1.3535, + "nll_loss": 0.900390625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.9228515625, + "rewards/margins": 4.2615966796875, + "rewards/rejected": 0.66845703125, + "step": 428 + }, + { + "epoch": 1.6455331412103746, + "grad_norm": 117.0, + "learning_rate": 2e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.80859375, + "logps/chosen": -435.875, + "logps/rejected": -245.0, + "loss": 1.4727, + "nll_loss": 0.91259765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6484375, + "rewards/margins": 2.8193359375, + "rewards/rejected": 0.8323974609375, + "step": 429 + }, + { + "epoch": 1.649375600384246, + "grad_norm": 106.5, + "learning_rate": 1.978723404255319e-07, + "logits/chosen": -3.861328125, + "logits/rejected": -3.8046875, + "logps/chosen": -390.5, + "logps/rejected": -205.125, + "loss": 1.3311, + "nll_loss": 0.7919921875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.353515625, + "rewards/margins": 2.81640625, + "rewards/rejected": 0.538330078125, + "step": 430 + }, + { + "epoch": 
1.6532180595581172, + "grad_norm": 266.0, + "learning_rate": 1.9574468085106383e-07, + "logits/chosen": -3.71484375, + "logits/rejected": -3.75390625, + "logps/chosen": -454.25, + "logps/rejected": -245.625, + "loss": 1.3535, + "nll_loss": 0.78759765625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.447265625, + "rewards/margins": 2.88671875, + "rewards/rejected": 0.5560302734375, + "step": 431 + }, + { + "epoch": 1.6570605187319885, + "grad_norm": 231.0, + "learning_rate": 1.9361702127659573e-07, + "logits/chosen": -3.734375, + "logits/rejected": -3.833984375, + "logps/chosen": -448.5, + "logps/rejected": -250.25, + "loss": 1.3496, + "nll_loss": 0.82373046875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.68359375, + "rewards/margins": 2.9345703125, + "rewards/rejected": 0.756591796875, + "step": 432 + }, + { + "epoch": 1.6609029779058597, + "grad_norm": 255.0, + "learning_rate": 1.9148936170212765e-07, + "logits/chosen": -3.775390625, + "logits/rejected": -3.85546875, + "logps/chosen": -449.0, + "logps/rejected": -261.75, + "loss": 1.6406, + "nll_loss": 0.9970703125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.998046875, + "rewards/margins": 2.28662109375, + "rewards/rejected": 0.714599609375, + "step": 433 + }, + { + "epoch": 1.664745437079731, + "grad_norm": 104.0, + "learning_rate": 1.8936170212765957e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.8046875, + "logps/chosen": -403.375, + "logps/rejected": -184.4375, + "loss": 1.4404, + "nll_loss": 0.9072265625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.490234375, + "rewards/margins": 3.00634765625, + "rewards/rejected": 0.489044189453125, + "step": 434 + }, + { + "epoch": 1.6685878962536023, + "grad_norm": 632.0, + "learning_rate": 1.8723404255319146e-07, + "logits/chosen": -3.91015625, + "logits/rejected": -3.8984375, + "logps/chosen": -469.25, + "logps/rejected": -237.625, + "loss": 1.4795, + "nll_loss": 0.86474609375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.806640625, + "rewards/margins": 2.94580078125, + "rewards/rejected": 0.86248779296875, + "step": 435 + }, + { + "epoch": 1.6724303554274735, + "grad_norm": 404.0, + "learning_rate": 1.8510638297872338e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.728515625, + "logps/chosen": -412.75, + "logps/rejected": -227.75, + "loss": 1.4463, + "nll_loss": 0.8984375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.6640625, + "rewards/margins": 3.0322265625, + "rewards/rejected": 0.635986328125, + "step": 436 + }, + { + "epoch": 1.6762728146013448, + "grad_norm": 130.0, + "learning_rate": 1.829787234042553e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.84375, + "logps/chosen": -395.875, + "logps/rejected": -232.9375, + "loss": 1.3989, + "nll_loss": 0.8173828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7410888671875, + "rewards/margins": 2.42333984375, + "rewards/rejected": 0.31817626953125, + "step": 437 + }, + { + "epoch": 1.680115273775216, + "grad_norm": 136.0, + "learning_rate": 1.8085106382978725e-07, + "logits/chosen": -3.853515625, + "logits/rejected": -3.8828125, + "logps/chosen": -562.25, + "logps/rejected": -276.5, + "loss": 1.3408, + "nll_loss": 0.8935546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 5.400390625, + "rewards/margins": 4.265625, + "rewards/rejected": 1.1341552734375, + "step": 438 + }, + { + "epoch": 1.6839577329490876, + "grad_norm": 145.0, + "learning_rate": 1.7872340425531914e-07, + "logits/chosen": -3.81640625, + "logits/rejected": 
-3.8203125, + "logps/chosen": -390.625, + "logps/rejected": -229.25, + "loss": 1.4639, + "nll_loss": 0.9052734375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.6982421875, + "rewards/margins": 2.9183349609375, + "rewards/rejected": 0.778564453125, + "step": 439 + }, + { + "epoch": 1.6878001921229586, + "grad_norm": 434.0, + "learning_rate": 1.7659574468085106e-07, + "logits/chosen": -3.6953125, + "logits/rejected": -3.77734375, + "logps/chosen": -410.25, + "logps/rejected": -219.25, + "loss": 1.3936, + "nll_loss": 0.82861328125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3662109375, + "rewards/margins": 2.666015625, + "rewards/rejected": 0.7016754150390625, + "step": 440 + }, + { + "epoch": 1.6916426512968301, + "grad_norm": 96.0, + "learning_rate": 1.7446808510638299e-07, + "logits/chosen": -3.775390625, + "logits/rejected": -3.80078125, + "logps/chosen": -532.25, + "logps/rejected": -249.875, + "loss": 1.3936, + "nll_loss": 0.892578125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.458984375, + "rewards/margins": 3.53515625, + "rewards/rejected": 0.92401123046875, + "step": 441 + }, + { + "epoch": 1.6954851104707012, + "grad_norm": 288.0, + "learning_rate": 1.7234042553191488e-07, + "logits/chosen": -3.787109375, + "logits/rejected": -3.736328125, + "logps/chosen": -457.0, + "logps/rejected": -248.0, + "loss": 1.3965, + "nll_loss": 0.83056640625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.712890625, + "rewards/margins": 2.9560546875, + "rewards/rejected": 0.7581787109375, + "step": 442 + }, + { + "epoch": 1.6993275696445727, + "grad_norm": 290.0, + "learning_rate": 1.702127659574468e-07, + "logits/chosen": -3.751953125, + "logits/rejected": -3.8359375, + "logps/chosen": -462.25, + "logps/rejected": -236.125, + "loss": 1.4922, + "nll_loss": 0.873046875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.8671875, + "rewards/margins": 2.802734375, + "rewards/rejected": 1.0596923828125, + "step": 443 + }, + { + "epoch": 1.7031700288184437, + "grad_norm": 163.0, + "learning_rate": 1.6808510638297872e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.84375, + "logps/chosen": -464.5, + "logps/rejected": -263.125, + "loss": 1.3379, + "nll_loss": 0.83837890625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6171875, + "rewards/margins": 3.0029296875, + "rewards/rejected": 0.61822509765625, + "step": 444 + }, + { + "epoch": 1.7070124879923152, + "grad_norm": 152.0, + "learning_rate": 1.6595744680851062e-07, + "logits/chosen": -3.759765625, + "logits/rejected": -3.7734375, + "logps/chosen": -417.75, + "logps/rejected": -248.875, + "loss": 1.4473, + "nll_loss": 0.83740234375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.736328125, + "rewards/margins": 2.5439453125, + "rewards/rejected": 1.193359375, + "step": 445 + }, + { + "epoch": 1.7108549471661862, + "grad_norm": 160.0, + "learning_rate": 1.6382978723404254e-07, + "logits/chosen": -3.76953125, + "logits/rejected": -3.83203125, + "logps/chosen": -650.5, + "logps/rejected": -293.75, + "loss": 1.2124, + "nll_loss": 0.798095703125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.978515625, + "rewards/margins": 4.69140625, + "rewards/rejected": 0.29046630859375, + "step": 446 + }, + { + "epoch": 1.7146974063400577, + "grad_norm": 272.0, + "learning_rate": 1.6170212765957448e-07, + "logits/chosen": -3.796875, + "logits/rejected": -3.939453125, + "logps/chosen": -354.375, + "logps/rejected": -213.4375, + "loss": 1.4648, + "nll_loss": 0.82568359375, + "rewards/accuracies": 
0.9375, + "rewards/chosen": 3.193359375, + "rewards/margins": 2.33349609375, + "rewards/rejected": 0.85638427734375, + "step": 447 + }, + { + "epoch": 1.7185398655139288, + "grad_norm": 175.0, + "learning_rate": 1.5957446808510638e-07, + "logits/chosen": -3.68359375, + "logits/rejected": -3.76171875, + "logps/chosen": -371.5, + "logps/rejected": -201.125, + "loss": 1.5176, + "nll_loss": 0.93505859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.92578125, + "rewards/margins": 2.5030517578125, + "rewards/rejected": 0.420318603515625, + "step": 448 + }, + { + "epoch": 1.7223823246878003, + "grad_norm": 152.0, + "learning_rate": 1.574468085106383e-07, + "logits/chosen": -3.75, + "logits/rejected": -3.7421875, + "logps/chosen": -489.75, + "logps/rejected": -257.3125, + "loss": 1.2979, + "nll_loss": 0.841796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.248046875, + "rewards/margins": 3.7919921875, + "rewards/rejected": 0.45709228515625, + "step": 449 + }, + { + "epoch": 1.7262247838616713, + "grad_norm": 135.0, + "learning_rate": 1.5531914893617022e-07, + "logits/chosen": -3.69921875, + "logits/rejected": -3.763671875, + "logps/chosen": -430.5, + "logps/rejected": -193.5, + "loss": 1.5215, + "nll_loss": 0.9658203125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.150390625, + "rewards/margins": 2.7919921875, + "rewards/rejected": 0.361328125, + "step": 450 + }, + { + "epoch": 1.7300672430355428, + "grad_norm": 324.0, + "learning_rate": 1.531914893617021e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.74609375, + "logps/chosen": -422.5, + "logps/rejected": -272.625, + "loss": 1.4307, + "nll_loss": 0.8271484375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3984375, + "rewards/margins": 2.58251953125, + "rewards/rejected": 0.81787109375, + "step": 451 + }, + { + "epoch": 1.7339097022094139, + "grad_norm": 140.0, + "learning_rate": 1.5106382978723403e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.833984375, + "logps/chosen": -509.75, + "logps/rejected": -284.125, + "loss": 1.46, + "nll_loss": 0.978515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.427734375, + "rewards/margins": 3.361328125, + "rewards/rejected": 1.060791015625, + "step": 452 + }, + { + "epoch": 1.7377521613832854, + "grad_norm": 140.0, + "learning_rate": 1.4893617021276595e-07, + "logits/chosen": -3.779296875, + "logits/rejected": -3.77734375, + "logps/chosen": -505.75, + "logps/rejected": -230.5, + "loss": 1.4531, + "nll_loss": 0.97119140625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.05859375, + "rewards/margins": 3.515625, + "rewards/rejected": 0.545867919921875, + "step": 453 + }, + { + "epoch": 1.7415946205571564, + "grad_norm": 892.0, + "learning_rate": 1.4680851063829785e-07, + "logits/chosen": -3.7578125, + "logits/rejected": -3.845703125, + "logps/chosen": -372.5, + "logps/rejected": -209.3125, + "loss": 1.4697, + "nll_loss": 0.84423828125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.947265625, + "rewards/margins": 2.34814453125, + "rewards/rejected": 0.596588134765625, + "step": 454 + }, + { + "epoch": 1.745437079731028, + "grad_norm": 245.0, + "learning_rate": 1.4468085106382977e-07, + "logits/chosen": -3.78515625, + "logits/rejected": -3.8203125, + "logps/chosen": -448.75, + "logps/rejected": -226.25, + "loss": 1.3838, + "nll_loss": 0.8740234375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.39453125, + "rewards/margins": 2.931640625, + "rewards/rejected": 0.4647216796875, + "step": 455 + }, + { + "epoch": 
1.7492795389048992, + "grad_norm": 260.0, + "learning_rate": 1.4255319148936172e-07, + "logits/chosen": -3.84375, + "logits/rejected": -3.822265625, + "logps/chosen": -502.25, + "logps/rejected": -262.875, + "loss": 1.417, + "nll_loss": 0.94287109375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 4.548828125, + "rewards/margins": 3.626953125, + "rewards/rejected": 0.9281005859375, + "step": 456 + }, + { + "epoch": 1.7531219980787704, + "grad_norm": 113.5, + "learning_rate": 1.404255319148936e-07, + "logits/chosen": -3.8515625, + "logits/rejected": -3.890625, + "logps/chosen": -519.5, + "logps/rejected": -276.25, + "loss": 1.2754, + "nll_loss": 0.8447265625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.46484375, + "rewards/margins": 3.984375, + "rewards/rejected": 0.489501953125, + "step": 457 + }, + { + "epoch": 1.7569644572526417, + "grad_norm": 146.0, + "learning_rate": 1.3829787234042553e-07, + "logits/chosen": -3.85546875, + "logits/rejected": -3.86328125, + "logps/chosen": -446.625, + "logps/rejected": -261.375, + "loss": 1.4126, + "nll_loss": 0.896484375, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.3984375, + "rewards/margins": 3.4736328125, + "rewards/rejected": 0.9244384765625, + "step": 458 + }, + { + "epoch": 1.760806916426513, + "grad_norm": 87.0, + "learning_rate": 1.3617021276595745e-07, + "logits/chosen": -3.802734375, + "logits/rejected": -3.828125, + "logps/chosen": -530.0, + "logps/rejected": -234.8125, + "loss": 1.2871, + "nll_loss": 0.869140625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.158203125, + "rewards/margins": 3.876953125, + "rewards/rejected": 0.2840576171875, + "step": 459 + }, + { + "epoch": 1.7646493756003843, + "grad_norm": 112.5, + "learning_rate": 1.3404255319148934e-07, + "logits/chosen": -3.7265625, + "logits/rejected": -3.732421875, + "logps/chosen": -423.875, + "logps/rejected": -199.625, + "loss": 1.2959, + "nll_loss": 0.78173828125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.28515625, + "rewards/margins": 2.9287109375, + "rewards/rejected": 0.3538818359375, + "step": 460 + }, + { + "epoch": 1.7684918347742555, + "grad_norm": 227.0, + "learning_rate": 1.3191489361702127e-07, + "logits/chosen": -3.880859375, + "logits/rejected": -3.85546875, + "logps/chosen": -495.5, + "logps/rejected": -284.625, + "loss": 1.4492, + "nll_loss": 0.890625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.224609375, + "rewards/margins": 3.359375, + "rewards/rejected": 0.869873046875, + "step": 461 + }, + { + "epoch": 1.7723342939481268, + "grad_norm": 107.0, + "learning_rate": 1.2978723404255319e-07, + "logits/chosen": -3.732421875, + "logits/rejected": -3.75390625, + "logps/chosen": -339.375, + "logps/rejected": -181.75, + "loss": 1.457, + "nll_loss": 0.89404296875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.748046875, + "rewards/margins": 2.4453125, + "rewards/rejected": 0.30487060546875, + "step": 462 + }, + { + "epoch": 1.776176753121998, + "grad_norm": 129.0, + "learning_rate": 1.2765957446808508e-07, + "logits/chosen": -3.724609375, + "logits/rejected": -3.75390625, + "logps/chosen": -460.0, + "logps/rejected": -212.1875, + "loss": 1.3037, + "nll_loss": 0.84130859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.96484375, + "rewards/margins": 3.5810546875, + "rewards/rejected": 0.381103515625, + "step": 463 + }, + { + "epoch": 1.7800192122958693, + "grad_norm": 168.0, + "learning_rate": 1.25531914893617e-07, + "logits/chosen": -3.74609375, + "logits/rejected": -3.85546875, + "logps/chosen": 
-432.75, + "logps/rejected": -242.375, + "loss": 1.4512, + "nll_loss": 0.90283203125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.46875, + "rewards/margins": 2.7900390625, + "rewards/rejected": 0.67926025390625, + "step": 464 + }, + { + "epoch": 1.7838616714697406, + "grad_norm": 77.0, + "learning_rate": 1.2340425531914892e-07, + "logits/chosen": -3.7421875, + "logits/rejected": -3.78125, + "logps/chosen": -430.75, + "logps/rejected": -235.0, + "loss": 1.2861, + "nll_loss": 0.78955078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5029296875, + "rewards/margins": 3.24609375, + "rewards/rejected": 0.2626953125, + "step": 465 + }, + { + "epoch": 1.7877041306436119, + "grad_norm": 249.0, + "learning_rate": 1.2127659574468084e-07, + "logits/chosen": -3.79296875, + "logits/rejected": -3.80859375, + "logps/chosen": -519.125, + "logps/rejected": -268.375, + "loss": 1.4189, + "nll_loss": 0.923828125, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.498046875, + "rewards/margins": 3.6123046875, + "rewards/rejected": 0.8900146484375, + "step": 466 + }, + { + "epoch": 1.7915465898174832, + "grad_norm": 208.0, + "learning_rate": 1.1914893617021276e-07, + "logits/chosen": -3.685546875, + "logits/rejected": -3.765625, + "logps/chosen": -342.875, + "logps/rejected": -181.375, + "loss": 1.4404, + "nll_loss": 0.853515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9697265625, + "rewards/margins": 2.3056640625, + "rewards/rejected": 0.6676025390625, + "step": 467 + }, + { + "epoch": 1.7953890489913544, + "grad_norm": 128.0, + "learning_rate": 1.1702127659574468e-07, + "logits/chosen": -3.71875, + "logits/rejected": -3.80078125, + "logps/chosen": -390.25, + "logps/rejected": -225.125, + "loss": 1.457, + "nll_loss": 0.89990234375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.28515625, + "rewards/margins": 2.5126953125, + "rewards/rejected": 0.7784423828125, + "step": 468 + }, + { + "epoch": 1.7992315081652257, + "grad_norm": 162.0, + "learning_rate": 1.1489361702127659e-07, + "logits/chosen": -3.728515625, + "logits/rejected": -3.8515625, + "logps/chosen": -440.25, + "logps/rejected": -191.125, + "loss": 1.5098, + "nll_loss": 0.95751953125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.453125, + "rewards/margins": 2.94384765625, + "rewards/rejected": 0.51129150390625, + "step": 469 + }, + { + "epoch": 1.803073967339097, + "grad_norm": 272.0, + "learning_rate": 1.127659574468085e-07, + "logits/chosen": -3.740234375, + "logits/rejected": -3.837890625, + "logps/chosen": -432.0, + "logps/rejected": -231.25, + "loss": 1.5537, + "nll_loss": 0.95263671875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.44140625, + "rewards/margins": 2.662109375, + "rewards/rejected": 0.77899169921875, + "step": 470 + }, + { + "epoch": 1.8069164265129682, + "grad_norm": 121.0, + "learning_rate": 1.1063829787234042e-07, + "logits/chosen": -3.720703125, + "logits/rejected": -3.765625, + "logps/chosen": -406.25, + "logps/rejected": -240.25, + "loss": 1.3994, + "nll_loss": 0.78662109375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.345703125, + "rewards/margins": 2.4853515625, + "rewards/rejected": 0.85931396484375, + "step": 471 + }, + { + "epoch": 1.8107588856868397, + "grad_norm": 187.0, + "learning_rate": 1.0851063829787234e-07, + "logits/chosen": -3.736328125, + "logits/rejected": -3.75390625, + "logps/chosen": -390.875, + "logps/rejected": -219.875, + "loss": 1.4443, + "nll_loss": 0.8671875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.37109375, + 
"rewards/margins": 2.4462890625, + "rewards/rejected": 0.92083740234375, + "step": 472 + }, + { + "epoch": 1.8146013448607108, + "grad_norm": 101.0, + "learning_rate": 1.0638297872340425e-07, + "logits/chosen": -3.69921875, + "logits/rejected": -3.78125, + "logps/chosen": -501.25, + "logps/rejected": -236.125, + "loss": 1.3604, + "nll_loss": 0.880859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.71484375, + "rewards/margins": 3.1484375, + "rewards/rejected": 0.564971923828125, + "step": 473 + }, + { + "epoch": 1.8184438040345823, + "grad_norm": 167.0, + "learning_rate": 1.0425531914893617e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.8828125, + "logps/chosen": -413.75, + "logps/rejected": -231.0, + "loss": 1.5225, + "nll_loss": 1.00146484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.38671875, + "rewards/margins": 2.8740234375, + "rewards/rejected": 0.513763427734375, + "step": 474 + }, + { + "epoch": 1.8222862632084533, + "grad_norm": 107.0, + "learning_rate": 1.0212765957446807e-07, + "logits/chosen": -3.763671875, + "logits/rejected": -3.796875, + "logps/chosen": -471.0, + "logps/rejected": -206.0, + "loss": 1.5391, + "nll_loss": 0.92529296875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.119140625, + "rewards/margins": 2.5977783203125, + "rewards/rejected": 0.522705078125, + "step": 475 + }, + { + "epoch": 1.8261287223823248, + "grad_norm": 116.0, + "learning_rate": 1e-07, + "logits/chosen": -3.80859375, + "logits/rejected": -3.8984375, + "logps/chosen": -426.5, + "logps/rejected": -230.625, + "loss": 1.4375, + "nll_loss": 0.91796875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.9765625, + "rewards/margins": 3.22265625, + "rewards/rejected": 0.7489013671875, + "step": 476 + }, + { + "epoch": 1.8299711815561959, + "grad_norm": 165.0, + "learning_rate": 9.787234042553192e-08, + "logits/chosen": -3.767578125, + "logits/rejected": -3.798828125, + "logps/chosen": -464.25, + "logps/rejected": -220.875, + "loss": 1.4404, + "nll_loss": 0.9189453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.859375, + "rewards/margins": 3.111328125, + "rewards/rejected": 0.740814208984375, + "step": 477 + }, + { + "epoch": 1.8338136407300674, + "grad_norm": 152.0, + "learning_rate": 9.574468085106382e-08, + "logits/chosen": -3.7578125, + "logits/rejected": -3.775390625, + "logps/chosen": -381.75, + "logps/rejected": -247.5, + "loss": 1.4512, + "nll_loss": 0.84912109375, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6162109375, + "rewards/margins": 2.6474609375, + "rewards/rejected": 0.97265625, + "step": 478 + }, + { + "epoch": 1.8376560999039384, + "grad_norm": 179.0, + "learning_rate": 9.361702127659573e-08, + "logits/chosen": -3.8046875, + "logits/rejected": -3.8359375, + "logps/chosen": -439.5, + "logps/rejected": -258.625, + "loss": 1.5684, + "nll_loss": 0.99267578125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.7119140625, + "rewards/margins": 2.69140625, + "rewards/rejected": 1.019287109375, + "step": 479 + }, + { + "epoch": 1.84149855907781, + "grad_norm": 112.5, + "learning_rate": 9.148936170212765e-08, + "logits/chosen": -3.70703125, + "logits/rejected": -3.763671875, + "logps/chosen": -427.875, + "logps/rejected": -241.875, + "loss": 1.3809, + "nll_loss": 0.865234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.013671875, + "rewards/margins": 2.7744140625, + "rewards/rejected": 0.24261474609375, + "step": 480 + }, + { + "epoch": 1.845341018251681, + "grad_norm": 153.0, + "learning_rate": 
8.936170212765957e-08, + "logits/chosen": -3.79296875, + "logits/rejected": -3.8359375, + "logps/chosen": -555.25, + "logps/rejected": -283.0, + "loss": 1.4697, + "nll_loss": 0.96728515625, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.291015625, + "rewards/margins": 3.6708984375, + "rewards/rejected": 0.61669921875, + "step": 481 + }, + { + "epoch": 1.8491834774255524, + "grad_norm": 276.0, + "learning_rate": 8.723404255319149e-08, + "logits/chosen": -3.771484375, + "logits/rejected": -3.822265625, + "logps/chosen": -403.25, + "logps/rejected": -208.75, + "loss": 1.3125, + "nll_loss": 0.80517578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.693359375, + "rewards/margins": 3.2197265625, + "rewards/rejected": 0.4735107421875, + "step": 482 + }, + { + "epoch": 1.8530259365994235, + "grad_norm": 133.0, + "learning_rate": 8.51063829787234e-08, + "logits/chosen": -3.767578125, + "logits/rejected": -3.80078125, + "logps/chosen": -450.75, + "logps/rejected": -239.5, + "loss": 1.5518, + "nll_loss": 0.93603515625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.365234375, + "rewards/margins": 2.6416015625, + "rewards/rejected": 0.72613525390625, + "step": 483 + }, + { + "epoch": 1.856868395773295, + "grad_norm": 127.5, + "learning_rate": 8.297872340425531e-08, + "logits/chosen": -3.81640625, + "logits/rejected": -3.86328125, + "logps/chosen": -418.0, + "logps/rejected": -202.75, + "loss": 1.3896, + "nll_loss": 0.84228515625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.205078125, + "rewards/margins": 2.86572265625, + "rewards/rejected": 0.341094970703125, + "step": 484 + }, + { + "epoch": 1.860710854947166, + "grad_norm": 116.0, + "learning_rate": 8.085106382978724e-08, + "logits/chosen": -3.765625, + "logits/rejected": -3.888671875, + "logps/chosen": -384.375, + "logps/rejected": -160.8125, + "loss": 1.4707, + "nll_loss": 0.9296875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.486328125, + "rewards/margins": 2.9892578125, + "rewards/rejected": 0.4937744140625, + "step": 485 + }, + { + "epoch": 1.8645533141210375, + "grad_norm": 151.0, + "learning_rate": 7.872340425531915e-08, + "logits/chosen": -3.87109375, + "logits/rejected": -3.83203125, + "logps/chosen": -479.75, + "logps/rejected": -285.25, + "loss": 1.4648, + "nll_loss": 0.9111328125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.103515625, + "rewards/margins": 3.06982421875, + "rewards/rejected": 1.033447265625, + "step": 486 + }, + { + "epoch": 1.8683957732949086, + "grad_norm": 110.0, + "learning_rate": 7.659574468085106e-08, + "logits/chosen": -3.77734375, + "logits/rejected": -3.80078125, + "logps/chosen": -472.5, + "logps/rejected": -253.125, + "loss": 1.3262, + "nll_loss": 0.83544921875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 4.3857421875, + "rewards/margins": 3.591796875, + "rewards/rejected": 0.7962646484375, + "step": 487 + }, + { + "epoch": 1.87223823246878, + "grad_norm": 572.0, + "learning_rate": 7.446808510638298e-08, + "logits/chosen": -3.775390625, + "logits/rejected": -3.775390625, + "logps/chosen": -498.75, + "logps/rejected": -256.875, + "loss": 1.5732, + "nll_loss": 1.0615234375, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.078125, + "rewards/margins": 3.443359375, + "rewards/rejected": 0.63232421875, + "step": 488 + }, + { + "epoch": 1.8760806916426513, + "grad_norm": 121.5, + "learning_rate": 7.234042553191488e-08, + "logits/chosen": -3.759765625, + "logits/rejected": -3.810546875, + "logps/chosen": -494.5, + "logps/rejected": -272.25, + "loss": 
1.3965, + "nll_loss": 0.8525390625, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6328125, + "rewards/margins": 3.0595703125, + "rewards/rejected": 0.57086181640625, + "step": 489 + }, + { + "epoch": 1.8799231508165226, + "grad_norm": 144.0, + "learning_rate": 7.02127659574468e-08, + "logits/chosen": -3.71484375, + "logits/rejected": -3.755859375, + "logps/chosen": -434.25, + "logps/rejected": -219.875, + "loss": 1.3643, + "nll_loss": 0.89892578125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.408203125, + "rewards/margins": 3.1494140625, + "rewards/rejected": 0.25634765625, + "step": 490 + }, + { + "epoch": 1.8837656099903939, + "grad_norm": 153.0, + "learning_rate": 6.808510638297873e-08, + "logits/chosen": -3.736328125, + "logits/rejected": -3.7265625, + "logps/chosen": -401.0, + "logps/rejected": -235.875, + "loss": 1.5518, + "nll_loss": 0.97900390625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5810546875, + "rewards/margins": 2.81103515625, + "rewards/rejected": 0.772674560546875, + "step": 491 + }, + { + "epoch": 1.8876080691642652, + "grad_norm": 156.0, + "learning_rate": 6.595744680851063e-08, + "logits/chosen": -3.71875, + "logits/rejected": -3.712890625, + "logps/chosen": -372.625, + "logps/rejected": -220.1875, + "loss": 1.4111, + "nll_loss": 0.8154296875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.021484375, + "rewards/margins": 2.31884765625, + "rewards/rejected": 0.7039794921875, + "step": 492 + }, + { + "epoch": 1.8914505283381364, + "grad_norm": 129.0, + "learning_rate": 6.382978723404254e-08, + "logits/chosen": -3.775390625, + "logits/rejected": -3.8046875, + "logps/chosen": -414.25, + "logps/rejected": -208.125, + "loss": 1.4883, + "nll_loss": 0.91552734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.505859375, + "rewards/margins": 2.7958984375, + "rewards/rejected": 0.71142578125, + "step": 493 + }, + { + "epoch": 1.8952929875120077, + "grad_norm": 276.0, + "learning_rate": 6.170212765957446e-08, + "logits/chosen": -3.765625, + "logits/rejected": -3.736328125, + "logps/chosen": -425.875, + "logps/rejected": -236.3125, + "loss": 1.5273, + "nll_loss": 0.978515625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.591796875, + "rewards/margins": 2.935546875, + "rewards/rejected": 0.6568603515625, + "step": 494 + }, + { + "epoch": 1.899135446685879, + "grad_norm": 153.0, + "learning_rate": 5.957446808510638e-08, + "logits/chosen": -3.7578125, + "logits/rejected": -3.78515625, + "logps/chosen": -411.75, + "logps/rejected": -224.375, + "loss": 1.5166, + "nll_loss": 0.966796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.357421875, + "rewards/margins": 2.6630859375, + "rewards/rejected": 0.69488525390625, + "step": 495 + }, + { + "epoch": 1.9029779058597502, + "grad_norm": 235.0, + "learning_rate": 5.7446808510638295e-08, + "logits/chosen": -3.75, + "logits/rejected": -3.810546875, + "logps/chosen": -496.5, + "logps/rejected": -243.25, + "loss": 1.5244, + "nll_loss": 0.861328125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.9189453125, + "rewards/margins": 2.9736328125, + "rewards/rejected": 0.94476318359375, + "step": 496 + }, + { + "epoch": 1.9068203650336215, + "grad_norm": 160.0, + "learning_rate": 5.531914893617021e-08, + "logits/chosen": -3.828125, + "logits/rejected": -3.86328125, + "logps/chosen": -495.0, + "logps/rejected": -250.0, + "loss": 1.5557, + "nll_loss": 0.96728515625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.5859375, + "rewards/margins": 2.80078125, + "rewards/rejected": 
0.785888671875, + "step": 497 + }, + { + "epoch": 1.9106628242074928, + "grad_norm": 122.0, + "learning_rate": 5.3191489361702123e-08, + "logits/chosen": -3.82421875, + "logits/rejected": -3.802734375, + "logps/chosen": -473.5, + "logps/rejected": -208.375, + "loss": 1.375, + "nll_loss": 0.93212890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.396484375, + "rewards/margins": 3.728515625, + "rewards/rejected": 0.66510009765625, + "step": 498 + }, + { + "epoch": 1.914505283381364, + "grad_norm": 251.0, + "learning_rate": 5.106382978723404e-08, + "logits/chosen": -3.849609375, + "logits/rejected": -3.806640625, + "logps/chosen": -409.125, + "logps/rejected": -269.75, + "loss": 1.5361, + "nll_loss": 0.93505859375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.517578125, + "rewards/margins": 2.5595703125, + "rewards/rejected": 0.9552001953125, + "step": 499 + }, + { + "epoch": 1.9183477425552353, + "grad_norm": 132.0, + "learning_rate": 4.893617021276596e-08, + "logits/chosen": -3.810546875, + "logits/rejected": -3.873046875, + "logps/chosen": -470.75, + "logps/rejected": -261.375, + "loss": 1.4512, + "nll_loss": 0.94775390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.91796875, + "rewards/margins": 3.1650390625, + "rewards/rejected": 0.7562255859375, + "step": 500 + }, + { + "epoch": 1.9221902017291066, + "grad_norm": 156.0, + "learning_rate": 4.6808510638297865e-08, + "logits/chosen": -3.720703125, + "logits/rejected": -3.74609375, + "logps/chosen": -512.75, + "logps/rejected": -279.5, + "loss": 1.4082, + "nll_loss": 0.8359375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.15625, + "rewards/margins": 2.8798828125, + "rewards/rejected": 1.271240234375, + "step": 501 + }, + { + "epoch": 1.9260326609029779, + "grad_norm": 186.0, + "learning_rate": 4.4680851063829786e-08, + "logits/chosen": -3.6796875, + "logits/rejected": -3.775390625, + "logps/chosen": -445.75, + "logps/rejected": -249.375, + "loss": 1.4902, + "nll_loss": 0.88330078125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.6357421875, + "rewards/margins": 2.69189453125, + "rewards/rejected": 0.951171875, + "step": 502 + }, + { + "epoch": 1.9298751200768491, + "grad_norm": 153.0, + "learning_rate": 4.25531914893617e-08, + "logits/chosen": -3.734375, + "logits/rejected": -3.736328125, + "logps/chosen": -462.0, + "logps/rejected": -271.5, + "loss": 1.5371, + "nll_loss": 0.96630859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.30859375, + "rewards/margins": 2.5009765625, + "rewards/rejected": 0.80517578125, + "step": 503 + }, + { + "epoch": 1.9337175792507204, + "grad_norm": 130.0, + "learning_rate": 4.042553191489362e-08, + "logits/chosen": -3.85546875, + "logits/rejected": -3.900390625, + "logps/chosen": -439.25, + "logps/rejected": -240.875, + "loss": 1.4482, + "nll_loss": 0.95166015625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.130859375, + "rewards/margins": 3.28515625, + "rewards/rejected": 0.84967041015625, + "step": 504 + }, + { + "epoch": 1.937560038424592, + "grad_norm": 158.0, + "learning_rate": 3.829787234042553e-08, + "logits/chosen": -3.7578125, + "logits/rejected": -3.796875, + "logps/chosen": -439.5, + "logps/rejected": -217.875, + "loss": 1.377, + "nll_loss": 0.873046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.59375, + "rewards/margins": 3.232421875, + "rewards/rejected": 0.361328125, + "step": 505 + }, + { + "epoch": 1.941402497598463, + "grad_norm": 476.0, + "learning_rate": 3.617021276595744e-08, + "logits/chosen": -3.7265625, + 
"logits/rejected": -3.755859375, + "logps/chosen": -435.375, + "logps/rejected": -211.875, + "loss": 1.3193, + "nll_loss": 0.84814453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.798828125, + "rewards/margins": 3.4111328125, + "rewards/rejected": 0.382598876953125, + "step": 506 + }, + { + "epoch": 1.9452449567723344, + "grad_norm": 123.5, + "learning_rate": 3.404255319148936e-08, + "logits/chosen": -3.763671875, + "logits/rejected": -3.748046875, + "logps/chosen": -521.5, + "logps/rejected": -264.9375, + "loss": 1.3398, + "nll_loss": 0.85009765625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 4.505859375, + "rewards/margins": 3.365234375, + "rewards/rejected": 1.144287109375, + "step": 507 + }, + { + "epoch": 1.9490874159462055, + "grad_norm": 124.5, + "learning_rate": 3.191489361702127e-08, + "logits/chosen": -3.791015625, + "logits/rejected": -3.875, + "logps/chosen": -390.125, + "logps/rejected": -251.25, + "loss": 1.4521, + "nll_loss": 0.8115234375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.06640625, + "rewards/margins": 2.1075439453125, + "rewards/rejected": 0.957763671875, + "step": 508 + }, + { + "epoch": 1.952929875120077, + "grad_norm": 114.5, + "learning_rate": 2.978723404255319e-08, + "logits/chosen": -3.74609375, + "logits/rejected": -3.765625, + "logps/chosen": -433.875, + "logps/rejected": -222.75, + "loss": 1.5059, + "nll_loss": 0.95361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.55859375, + "rewards/margins": 2.755859375, + "rewards/rejected": 0.8037109375, + "step": 509 + }, + { + "epoch": 1.956772334293948, + "grad_norm": 104.0, + "learning_rate": 2.7659574468085105e-08, + "logits/chosen": -3.712890625, + "logits/rejected": -3.732421875, + "logps/chosen": -438.375, + "logps/rejected": -220.6875, + "loss": 1.2871, + "nll_loss": 0.85888671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.62890625, + "rewards/margins": 3.4130859375, + "rewards/rejected": 0.2108154296875, + "step": 510 + }, + { + "epoch": 1.9606147934678195, + "grad_norm": 177.0, + "learning_rate": 2.553191489361702e-08, + "logits/chosen": -3.79296875, + "logits/rejected": -3.806640625, + "logps/chosen": -467.75, + "logps/rejected": -241.875, + "loss": 1.4146, + "nll_loss": 0.83935546875, + "rewards/accuracies": 0.875, + "rewards/chosen": 4.15625, + "rewards/margins": 3.06103515625, + "rewards/rejected": 1.0931396484375, + "step": 511 + }, + { + "epoch": 1.9644572526416906, + "grad_norm": 204.0, + "learning_rate": 2.3404255319148933e-08, + "logits/chosen": -3.875, + "logits/rejected": -3.876953125, + "logps/chosen": -413.25, + "logps/rejected": -264.75, + "loss": 1.5928, + "nll_loss": 0.9541015625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.19140625, + "rewards/margins": 2.431640625, + "rewards/rejected": 0.75982666015625, + "step": 512 + }, + { + "epoch": 1.968299711815562, + "grad_norm": 128.0, + "learning_rate": 2.127659574468085e-08, + "logits/chosen": -3.7734375, + "logits/rejected": -3.759765625, + "logps/chosen": -503.875, + "logps/rejected": -236.25, + "loss": 1.3477, + "nll_loss": 0.888671875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.8955078125, + "rewards/margins": 3.55712890625, + "rewards/rejected": 0.3360595703125, + "step": 513 + }, + { + "epoch": 1.9721421709894331, + "grad_norm": 115.5, + "learning_rate": 1.9148936170212764e-08, + "logits/chosen": -3.80078125, + "logits/rejected": -3.90234375, + "logps/chosen": -488.75, + "logps/rejected": -244.5, + "loss": 1.3525, + "nll_loss": 0.8916015625, + 
"rewards/accuracies": 0.90625, + "rewards/chosen": 4.470703125, + "rewards/margins": 3.541015625, + "rewards/rejected": 0.92620849609375, + "step": 514 + }, + { + "epoch": 1.9759846301633046, + "grad_norm": 119.0, + "learning_rate": 1.702127659574468e-08, + "logits/chosen": -3.80078125, + "logits/rejected": -3.779296875, + "logps/chosen": -426.5, + "logps/rejected": -195.0, + "loss": 1.3643, + "nll_loss": 0.865234375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.828125, + "rewards/margins": 3.23828125, + "rewards/rejected": 0.5927734375, + "step": 515 + }, + { + "epoch": 1.9798270893371757, + "grad_norm": 153.0, + "learning_rate": 1.4893617021276595e-08, + "logits/chosen": -3.720703125, + "logits/rejected": -3.76171875, + "logps/chosen": -404.5, + "logps/rejected": -220.375, + "loss": 1.4365, + "nll_loss": 0.8349609375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.24609375, + "rewards/margins": 2.525390625, + "rewards/rejected": 0.72021484375, + "step": 516 + }, + { + "epoch": 1.9836695485110472, + "grad_norm": 232.0, + "learning_rate": 1.276595744680851e-08, + "logits/chosen": -3.728515625, + "logits/rejected": -3.80078125, + "logps/chosen": -491.0, + "logps/rejected": -248.5, + "loss": 1.2461, + "nll_loss": 0.7548828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.2080078125, + "rewards/margins": 3.5234375, + "rewards/rejected": 0.68084716796875, + "step": 517 + }, + { + "epoch": 1.9875120076849182, + "grad_norm": 127.5, + "learning_rate": 1.0638297872340425e-08, + "logits/chosen": -3.736328125, + "logits/rejected": -3.75390625, + "logps/chosen": -444.75, + "logps/rejected": -236.25, + "loss": 1.3828, + "nll_loss": 0.85986328125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.755859375, + "rewards/margins": 2.978515625, + "rewards/rejected": 0.780731201171875, + "step": 518 + }, + { + "epoch": 1.9913544668587897, + "grad_norm": 166.0, + "learning_rate": 8.51063829787234e-09, + "logits/chosen": -3.693359375, + "logits/rejected": -3.78125, + "logps/chosen": -397.5, + "logps/rejected": -233.25, + "loss": 1.6104, + "nll_loss": 0.9541015625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8798828125, + "rewards/margins": 2.2728271484375, + "rewards/rejected": 0.59698486328125, + "step": 519 + }, + { + "epoch": 1.9913544668587897, + "eval_logits/chosen": -3.7707386016845703, + "eval_logits/rejected": -3.8210227489471436, + "eval_logps/chosen": -449.6363525390625, + "eval_logps/rejected": -244.23635864257812, + "eval_loss": 1.4559190273284912, + "eval_nll_loss": 0.8917436003684998, + "eval_rewards/accuracies": 0.8901515007019043, + "eval_rewards/chosen": 3.636150598526001, + "eval_rewards/margins": 2.8020241260528564, + "eval_rewards/rejected": 0.8355379700660706, + "eval_runtime": 105.5363, + "eval_samples_per_second": 4.16, + "eval_steps_per_second": 1.042, + "step": 519 + } + ], + "logging_steps": 1, + "max_steps": 522, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 173, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}