{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1911, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 18.43390616447888, "learning_rate": 2.6041666666666664e-09, "logits/chosen": -2.963651180267334, "logits/rejected": -2.840693473815918, "logps/chosen": -359.36724853515625, "logps/rejected": -423.6724853515625, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 17.131126053231263, "learning_rate": 2.6041666666666667e-08, "logits/chosen": -2.6158318519592285, "logits/rejected": -2.5636255741119385, "logps/chosen": -315.0495300292969, "logps/rejected": -260.0464172363281, "loss": 0.9999, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 3.34601936629042e-05, "rewards/margins": 0.000495927466545254, "rewards/rejected": -0.00046246720012277365, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 15.433289180818496, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.6066763401031494, "logits/rejected": -2.592315673828125, "logps/chosen": -218.59963989257812, "logps/rejected": -218.1477508544922, "loss": 1.0002, "rewards/accuracies": 0.5, "rewards/chosen": 4.2955857679771725e-06, "rewards/margins": 0.0003672370803542435, "rewards/rejected": -0.00036294152960181236, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 17.767123014569844, "learning_rate": 7.812499999999999e-08, "logits/chosen": -2.565275192260742, "logits/rejected": -2.5802865028381348, "logps/chosen": -281.9302978515625, "logps/rejected": -287.4366455078125, "loss": 0.9998, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0007054550223983824, "rewards/margins": 0.0005853560287505388, "rewards/rejected": 0.0001200990955112502, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 17.41312868086795, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.6604909896850586, "logits/rejected": -2.570391893386841, "logps/chosen": -284.16229248046875, "logps/rejected": -277.1500549316406, "loss": 0.9993, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0006504150805994868, "rewards/margins": 0.00035138815292157233, "rewards/rejected": 0.00029902689857408404, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 17.203608103791886, "learning_rate": 1.3020833333333334e-07, "logits/chosen": -2.728846311569214, "logits/rejected": -2.6940040588378906, "logps/chosen": -257.30377197265625, "logps/rejected": -267.99725341796875, "loss": 0.9989, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 9.348688763566315e-05, "rewards/margins": 0.0014541767304763198, "rewards/rejected": -0.0013606901047751307, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 16.691658929475604, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.6505489349365234, "logits/rejected": -2.6367762088775635, "logps/chosen": -282.7791442871094, "logps/rejected": -285.486083984375, "loss": 0.9963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004240754060447216, "rewards/margins": 0.004483290947973728, "rewards/rejected": -0.0002425375860184431, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 18.323689190568434, "learning_rate": 1.8229166666666666e-07, "logits/chosen": -2.701293468475342, "logits/rejected": -2.6295533180236816, "logps/chosen": -304.28106689453125, "logps/rejected": -285.37933349609375, "loss": 0.9945, "rewards/accuracies": 0.625, "rewards/chosen": 0.010745614767074585, "rewards/margins": 0.010555150918662548, "rewards/rejected": 0.00019046352826990187, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 17.893201646793354, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6148569583892822, "logits/rejected": -2.535352945327759, "logps/chosen": -274.98089599609375, "logps/rejected": -278.56536865234375, "loss": 0.9924, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016005638986825943, "rewards/margins": 0.013915233314037323, "rewards/rejected": 0.0020904061384499073, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 16.75717250030696, "learning_rate": 2.3437499999999998e-07, "logits/chosen": -2.617600917816162, "logits/rejected": -2.5719563961029053, "logps/chosen": -241.8197021484375, "logps/rejected": -236.5170440673828, "loss": 0.9907, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.020918551832437515, "rewards/margins": 0.013779711909592152, "rewards/rejected": 0.007138841785490513, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 16.36100822715445, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.6756792068481445, "logits/rejected": -2.549262523651123, "logps/chosen": -287.8036804199219, "logps/rejected": -237.2367401123047, "loss": 0.9799, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03541067987680435, "rewards/margins": 0.04892192780971527, "rewards/rejected": -0.013511243276298046, "step": 100 }, { "epoch": 0.052328623757195186, "eval_logits/chosen": -2.605264902114868, "eval_logits/rejected": -2.5498783588409424, "eval_logps/chosen": -279.58929443359375, "eval_logps/rejected": -262.43585205078125, "eval_loss": 0.9798349738121033, "eval_rewards/accuracies": 0.7003968358039856, "eval_rewards/chosen": 0.023877285420894623, "eval_rewards/margins": 0.046207960695028305, "eval_rewards/rejected": -0.022330671548843384, "eval_runtime": 408.2828, "eval_samples_per_second": 4.899, "eval_steps_per_second": 0.154, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 16.24006322757547, "learning_rate": 2.864583333333333e-07, "logits/chosen": -2.533465623855591, "logits/rejected": -2.4855306148529053, "logps/chosen": -255.5850830078125, "logps/rejected": -241.38766479492188, "loss": 0.972, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.02970467507839203, "rewards/margins": 0.08004742860794067, "rewards/rejected": -0.050342757254838943, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 16.407771077926927, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.643631935119629, "logits/rejected": -2.6150763034820557, "logps/chosen": -347.9541931152344, "logps/rejected": -319.07122802734375, "loss": 0.9706, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.015048133209347725, "rewards/margins": 0.05341935157775879, "rewards/rejected": -0.03837122023105621, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 57.87178703392371, "learning_rate": 3.3854166666666667e-07, "logits/chosen": -2.479541301727295, "logits/rejected": -2.4731030464172363, "logps/chosen": -223.9632110595703, "logps/rejected": -259.69085693359375, "loss": 0.972, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.058804191648960114, "rewards/margins": 0.10448155552148819, "rewards/rejected": -0.1632857620716095, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 22.53408225517127, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.491089105606079, "logits/rejected": -2.472991466522217, "logps/chosen": -304.58258056640625, "logps/rejected": -292.1494140625, "loss": 0.9449, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03405456990003586, "rewards/margins": 0.15607304871082306, "rewards/rejected": -0.19012761116027832, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 23.793712815777106, "learning_rate": 3.9062499999999997e-07, "logits/chosen": -2.453864336013794, "logits/rejected": -2.3710696697235107, "logps/chosen": -271.48321533203125, "logps/rejected": -273.72845458984375, "loss": 0.9258, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.037565313279628754, "rewards/margins": 0.1909410059452057, "rewards/rejected": -0.22850628197193146, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 24.936273155118595, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.371394395828247, "logits/rejected": -2.2828476428985596, "logps/chosen": -287.2839050292969, "logps/rejected": -279.890625, "loss": 0.9283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.028330225497484207, "rewards/margins": 0.2533509135246277, "rewards/rejected": -0.2816811203956604, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 23.68013848737139, "learning_rate": 4.427083333333333e-07, "logits/chosen": -2.3668949604034424, "logits/rejected": -2.360105276107788, "logps/chosen": -269.80035400390625, "logps/rejected": -293.9357604980469, "loss": 0.9181, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0644833967089653, "rewards/margins": 0.1013181209564209, "rewards/rejected": -0.165801540017128, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 30.716843045435407, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.2320008277893066, "logits/rejected": -2.1564977169036865, "logps/chosen": -260.1246337890625, "logps/rejected": -281.7524719238281, "loss": 0.923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12353141605854034, "rewards/margins": 0.1733051985502243, "rewards/rejected": -0.29683661460876465, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 29.911411340600235, "learning_rate": 4.947916666666667e-07, "logits/chosen": -2.3174493312835693, "logits/rejected": -2.1575634479522705, "logps/chosen": -376.51751708984375, "logps/rejected": -358.4214782714844, "loss": 0.9265, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5403885841369629, "rewards/margins": 0.3255039155483246, "rewards/rejected": -0.8658924102783203, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 37.2503320116184, "learning_rate": 4.999732803821339e-07, "logits/chosen": -2.002506732940674, "logits/rejected": -1.8850481510162354, "logps/chosen": -343.7098083496094, "logps/rejected": -379.3221435546875, "loss": 0.9062, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6649556756019592, "rewards/margins": 0.24632687866687775, "rewards/rejected": -0.9112824201583862, "step": 200 }, { "epoch": 0.10465724751439037, "eval_logits/chosen": -2.039356231689453, "eval_logits/rejected": -1.8738951683044434, "eval_logps/chosen": -369.8689880371094, "eval_logps/rejected": -384.9631042480469, "eval_loss": 0.8992272019386292, "eval_rewards/accuracies": 0.704365074634552, "eval_rewards/chosen": -0.878919243812561, "eval_rewards/margins": 0.3686836361885071, "eval_rewards/rejected": -1.247602939605713, "eval_runtime": 409.7114, "eval_samples_per_second": 4.881, "eval_steps_per_second": 0.154, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 53.75412082856009, "learning_rate": 4.998647417232375e-07, "logits/chosen": -2.1129627227783203, "logits/rejected": -1.880600929260254, "logps/chosen": -294.85504150390625, "logps/rejected": -313.2070007324219, "loss": 0.9118, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6581923961639404, "rewards/margins": 0.2888200283050537, "rewards/rejected": -0.9470125436782837, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 37.721202737167374, "learning_rate": 4.996727502703357e-07, "logits/chosen": -1.7387809753417969, "logits/rejected": -1.5530433654785156, "logps/chosen": -320.2716369628906, "logps/rejected": -372.41168212890625, "loss": 0.8624, "rewards/accuracies": 0.75, "rewards/chosen": -0.6635367274284363, "rewards/margins": 0.4996022582054138, "rewards/rejected": -1.16313898563385, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 121.33392784727086, "learning_rate": 4.993973701470142e-07, "logits/chosen": -1.203376054763794, "logits/rejected": -1.1193268299102783, "logps/chosen": -392.47088623046875, "logps/rejected": -464.223876953125, "loss": 0.9228, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0930960178375244, "rewards/margins": 0.3919003903865814, "rewards/rejected": -1.4849965572357178, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 33.63096181249512, "learning_rate": 4.990386933279972e-07, "logits/chosen": -1.5336560010910034, "logits/rejected": -1.2797176837921143, "logps/chosen": -339.15191650390625, "logps/rejected": -397.858642578125, "loss": 0.8589, "rewards/accuracies": 0.75, "rewards/chosen": -0.9715251922607422, "rewards/margins": 0.39502108097076416, "rewards/rejected": -1.3665462732315063, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 34.328083408688954, "learning_rate": 4.985968396084284e-07, "logits/chosen": -1.3410297632217407, "logits/rejected": -0.9227784872055054, "logps/chosen": -368.9054260253906, "logps/rejected": -398.6717529296875, "loss": 0.9457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7665129899978638, "rewards/margins": 0.5669997930526733, "rewards/rejected": -1.3335126638412476, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 48.60943616878464, "learning_rate": 4.98071956563861e-07, "logits/chosen": -0.8289437294006348, "logits/rejected": -0.4671274721622467, "logps/chosen": -344.4454040527344, "logps/rejected": -383.0525817871094, "loss": 0.892, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7416936159133911, "rewards/margins": 0.6542305946350098, "rewards/rejected": -1.3959239721298218, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 33.250532200218636, "learning_rate": 4.97464219500968e-07, "logits/chosen": -0.3106756806373596, "logits/rejected": 0.10684038698673248, "logps/chosen": -363.1103515625, "logps/rejected": -440.0777893066406, "loss": 0.8149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8890632390975952, "rewards/margins": 0.5507485270500183, "rewards/rejected": -1.4398118257522583, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 38.94408939571324, "learning_rate": 4.967738313989918e-07, "logits/chosen": -0.37238869071006775, "logits/rejected": 0.008820340037345886, "logps/chosen": -402.12322998046875, "logps/rejected": -457.7510681152344, "loss": 0.848, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9409120678901672, "rewards/margins": 0.8176702260971069, "rewards/rejected": -1.758582353591919, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 35.56851659836682, "learning_rate": 4.960010228419499e-07, "logits/chosen": -1.0504642724990845, "logits/rejected": -0.817493736743927, "logps/chosen": -381.19488525390625, "logps/rejected": -392.2685546875, "loss": 0.8568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8723945617675781, "rewards/margins": 0.5610424280166626, "rewards/rejected": -1.4334371089935303, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 34.58717588936045, "learning_rate": 4.951460519416227e-07, "logits/chosen": -0.9317871928215027, "logits/rejected": -0.7162570953369141, "logps/chosen": -382.6187744140625, "logps/rejected": -390.21826171875, "loss": 0.8625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0087454319000244, "rewards/margins": 0.4423387050628662, "rewards/rejected": -1.4510842561721802, "step": 300 }, { "epoch": 0.15698587127158556, "eval_logits/chosen": -0.7556310892105103, "eval_logits/rejected": -0.4321170151233673, "eval_logps/chosen": -362.779296875, "eval_logps/rejected": -402.3089294433594, "eval_loss": 0.9493342041969299, "eval_rewards/accuracies": 0.7400793433189392, "eval_rewards/chosen": -0.8080223798751831, "eval_rewards/margins": 0.613038957118988, "eval_rewards/rejected": -1.421061396598816, "eval_runtime": 409.3664, "eval_samples_per_second": 4.886, "eval_steps_per_second": 0.154, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 64.89068361442095, "learning_rate": 4.942092042513458e-07, "logits/chosen": -0.047006282955408096, "logits/rejected": 0.4156367778778076, "logps/chosen": -373.67205810546875, "logps/rejected": -423.0428771972656, "loss": 0.8883, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8939437866210938, "rewards/margins": 0.7893495559692383, "rewards/rejected": -1.683293342590332, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 46.62691878451126, "learning_rate": 4.931907926706373e-07, "logits/chosen": 0.36694270372390747, "logits/rejected": 1.1297041177749634, "logps/chosen": -423.45941162109375, "logps/rejected": -438.6988830566406, "loss": 0.8295, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1606793403625488, "rewards/margins": 0.6528772711753845, "rewards/rejected": -1.8135566711425781, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 44.38959758027336, "learning_rate": 4.920911573406924e-07, "logits/chosen": 0.4341273307800293, "logits/rejected": 1.2812960147857666, "logps/chosen": -396.37841796875, "logps/rejected": -423.52166748046875, "loss": 0.8646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.153520941734314, "rewards/margins": 0.766487717628479, "rewards/rejected": -1.920008897781372, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 41.36989462539144, "learning_rate": 4.909106655307787e-07, "logits/chosen": 0.15560273826122284, "logits/rejected": 0.5286809802055359, "logps/chosen": -394.1658935546875, "logps/rejected": -445.2894592285156, "loss": 0.7968, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1813338994979858, "rewards/margins": 0.7696493864059448, "rewards/rejected": -1.9509834051132202, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 56.42858902844415, "learning_rate": 4.896497115155709e-07, "logits/chosen": 0.41545963287353516, "logits/rejected": 0.7852723002433777, "logps/chosen": -315.1676330566406, "logps/rejected": -413.4566345214844, "loss": 0.7707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1187893152236938, "rewards/margins": 0.8319570422172546, "rewards/rejected": -1.9507462978363037, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 68.82266360081938, "learning_rate": 4.883087164434672e-07, "logits/chosen": 0.5310618281364441, "logits/rejected": 1.2865915298461914, "logps/chosen": -391.3490905761719, "logps/rejected": -477.39697265625, "loss": 0.8291, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1477725505828857, "rewards/margins": 0.8737491369247437, "rewards/rejected": -2.02152156829834, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 72.85685673883998, "learning_rate": 4.868881281959282e-07, "logits/chosen": 0.11976215988397598, "logits/rejected": 0.6760571599006653, "logps/chosen": -343.851806640625, "logps/rejected": -392.71783447265625, "loss": 0.8291, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0080305337905884, "rewards/margins": 0.6320805549621582, "rewards/rejected": -1.6401112079620361, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 36.55575025371968, "learning_rate": 4.853884212378889e-07, "logits/chosen": 0.7114084362983704, "logits/rejected": 1.0887553691864014, "logps/chosen": -304.578857421875, "logps/rejected": -421.1670837402344, "loss": 0.7927, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7573381662368774, "rewards/margins": 0.7430611848831177, "rewards/rejected": -1.5003993511199951, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 49.98725547562052, "learning_rate": 4.838100964592904e-07, "logits/chosen": 1.1876529455184937, "logits/rejected": 1.9734843969345093, "logps/chosen": -424.49005126953125, "logps/rejected": -445.054443359375, "loss": 0.8246, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.137253999710083, "rewards/margins": 0.6889451146125793, "rewards/rejected": -1.826198935508728, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 42.093081928168466, "learning_rate": 4.821536810077878e-07, "logits/chosen": 0.8505879640579224, "logits/rejected": 1.98516047000885, "logps/chosen": -433.35186767578125, "logps/rejected": -440.00628662109375, "loss": 0.8393, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.228239893913269, "rewards/margins": 0.6536797881126404, "rewards/rejected": -1.8819196224212646, "step": 400 }, { "epoch": 0.20931449502878074, "eval_logits/chosen": 0.49491578340530396, "eval_logits/rejected": 1.2280761003494263, "eval_logps/chosen": -403.68408203125, "eval_logps/rejected": -443.1323547363281, "eval_loss": 0.8450161218643188, "eval_rewards/accuracies": 0.738095223903656, "eval_rewards/chosen": -1.2170705795288086, "eval_rewards/margins": 0.6122247576713562, "eval_rewards/rejected": -1.82929527759552, "eval_runtime": 407.5274, "eval_samples_per_second": 4.908, "eval_steps_per_second": 0.155, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 49.918696539707824, "learning_rate": 4.804197281126862e-07, "logits/chosen": -0.059110742062330246, "logits/rejected": 0.8801722526550293, "logps/chosen": -428.4386291503906, "logps/rejected": -472.10797119140625, "loss": 0.8962, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2714943885803223, "rewards/margins": 0.4991036355495453, "rewards/rejected": -1.7705981731414795, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 53.98715925527677, "learning_rate": 4.786088169001671e-07, "logits/chosen": -0.5467379689216614, "logits/rejected": 0.04367492347955704, "logps/chosen": -371.7878112792969, "logps/rejected": -428.397216796875, "loss": 0.8556, "rewards/accuracies": 0.625, "rewards/chosen": -1.074549913406372, "rewards/margins": 0.7141280174255371, "rewards/rejected": -1.7886779308319092, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 49.20019154623459, "learning_rate": 4.767215521998648e-07, "logits/chosen": -1.084761381149292, "logits/rejected": -0.19863851368427277, "logps/chosen": -371.8482360839844, "logps/rejected": -400.63031005859375, "loss": 0.8319, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9295535087585449, "rewards/margins": 0.7482463121414185, "rewards/rejected": -1.6777995824813843, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 81.70925119379834, "learning_rate": 4.7475856434285853e-07, "logits/chosen": -1.0488715171813965, "logits/rejected": -0.11229286342859268, "logps/chosen": -395.6819152832031, "logps/rejected": -411.98095703125, "loss": 0.8264, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0028269290924072, "rewards/margins": 0.7223213315010071, "rewards/rejected": -1.7251479625701904, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 45.406870747479154, "learning_rate": 4.727205089511466e-07, "logits/chosen": -0.7608393430709839, "logits/rejected": -0.39146721363067627, "logps/chosen": -339.39312744140625, "logps/rejected": -398.0794677734375, "loss": 0.8445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9024568796157837, "rewards/margins": 0.5179198980331421, "rewards/rejected": -1.4203767776489258, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 48.66797167861632, "learning_rate": 4.706080667186738e-07, "logits/chosen": -1.2728325128555298, "logits/rejected": -0.8300641179084778, "logps/chosen": -403.8634033203125, "logps/rejected": -419.13775634765625, "loss": 0.8366, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7553515434265137, "rewards/margins": 0.6253620386123657, "rewards/rejected": -1.3807135820388794, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 62.96548280856701, "learning_rate": 4.68421943183986e-07, "logits/chosen": -0.8926060795783997, "logits/rejected": -0.3875535726547241, "logps/chosen": -378.51263427734375, "logps/rejected": -445.01263427734375, "loss": 0.8124, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9251837730407715, "rewards/margins": 0.7882810831069946, "rewards/rejected": -1.7134649753570557, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 65.54665658174166, "learning_rate": 4.661628684945851e-07, "logits/chosen": -0.6543710231781006, "logits/rejected": 0.08349782228469849, "logps/chosen": -326.32427978515625, "logps/rejected": -393.5327453613281, "loss": 0.82, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7713912129402161, "rewards/margins": 0.8168617486953735, "rewards/rejected": -1.5882530212402344, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 44.24759129435125, "learning_rate": 4.638315971630662e-07, "logits/chosen": -0.8573375940322876, "logits/rejected": -0.35136765241622925, "logps/chosen": -379.94488525390625, "logps/rejected": -421.4915466308594, "loss": 0.7853, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8803781270980835, "rewards/margins": 0.7282763719558716, "rewards/rejected": -1.6086546182632446, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 43.63899131852928, "learning_rate": 4.6142890781511635e-07, "logits/chosen": 0.0875982865691185, "logits/rejected": 0.5346558690071106, "logps/chosen": -395.8002624511719, "logps/rejected": -459.2823791503906, "loss": 0.7936, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.321560025215149, "rewards/margins": 0.7252427339553833, "rewards/rejected": -2.0468027591705322, "step": 500 }, { "epoch": 0.2616431187859759, "eval_logits/chosen": 0.42498406767845154, "eval_logits/rejected": 1.2275949716567993, "eval_logps/chosen": -439.120849609375, "eval_logps/rejected": -499.2641906738281, "eval_loss": 0.8415341377258301, "eval_rewards/accuracies": 0.738095223903656, "eval_rewards/chosen": -1.5714380741119385, "eval_rewards/margins": 0.8191761374473572, "eval_rewards/rejected": -2.3906142711639404, "eval_runtime": 414.5011, "eval_samples_per_second": 4.825, "eval_steps_per_second": 0.152, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 72.56582565131232, "learning_rate": 4.5895560292945996e-07, "logits/chosen": 0.3352723717689514, "logits/rejected": 1.3601490259170532, "logps/chosen": -416.24237060546875, "logps/rejected": -471.38134765625, "loss": 0.8336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.471160650253296, "rewards/margins": 0.6955917477607727, "rewards/rejected": -2.166752338409424, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 39.44398625642088, "learning_rate": 4.5641250856983743e-07, "logits/chosen": -1.3128844499588013, "logits/rejected": -0.6595572829246521, "logps/chosen": -391.11260986328125, "logps/rejected": -444.6856994628906, "loss": 0.8189, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8228667974472046, "rewards/margins": 0.5250405073165894, "rewards/rejected": -1.347907304763794, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 73.40296140192903, "learning_rate": 4.5380047410910655e-07, "logits/chosen": -0.4393788278102875, "logits/rejected": 0.1780240833759308, "logps/chosen": -370.919677734375, "logps/rejected": -383.4080810546875, "loss": 0.8199, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7066813111305237, "rewards/margins": 0.6764801740646362, "rewards/rejected": -1.3831616640090942, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 64.06520632888025, "learning_rate": 4.5112037194555876e-07, "logits/chosen": 0.07873444259166718, "logits/rejected": 0.8679628372192383, "logps/chosen": -394.37139892578125, "logps/rejected": -449.7489318847656, "loss": 0.7698, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9526742696762085, "rewards/margins": 0.9246940612792969, "rewards/rejected": -1.8773682117462158, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 39.60964337401594, "learning_rate": 4.4837309721154536e-07, "logits/chosen": 0.6299174427986145, "logits/rejected": 1.4356385469436646, "logps/chosen": -467.62261962890625, "logps/rejected": -533.4133911132812, "loss": 0.8566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6188760995864868, "rewards/margins": 0.8416110873222351, "rewards/rejected": -2.460486888885498, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 47.437799939372404, "learning_rate": 4.4555956747451065e-07, "logits/chosen": 1.0129201412200928, "logits/rejected": 1.431279182434082, "logps/chosen": -441.26055908203125, "logps/rejected": -502.3465270996094, "loss": 0.8331, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5448591709136963, "rewards/margins": 0.6874719262123108, "rewards/rejected": -2.2323310375213623, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 86.83218082537917, "learning_rate": 4.426807224305315e-07, "logits/chosen": 0.23010826110839844, "logits/rejected": 1.1926991939544678, "logps/chosen": -419.4606018066406, "logps/rejected": -471.22821044921875, "loss": 0.8379, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2937848567962646, "rewards/margins": 0.7681288123130798, "rewards/rejected": -2.06191349029541, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 48.64607365932245, "learning_rate": 4.397375235904669e-07, "logits/chosen": 0.08270619064569473, "logits/rejected": 0.8474105596542358, "logps/chosen": -358.3929748535156, "logps/rejected": -372.9140625, "loss": 0.8132, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.029772400856018, "rewards/margins": 0.6086955070495605, "rewards/rejected": -1.638467788696289, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 36.02981125001583, "learning_rate": 4.3673095395882074e-07, "logits/chosen": -0.075676828622818, "logits/rejected": 0.6597450971603394, "logps/chosen": -368.6099853515625, "logps/rejected": -416.2569274902344, "loss": 0.8141, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0809032917022705, "rewards/margins": 0.6270149946212769, "rewards/rejected": -1.7079181671142578, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 47.45205898115052, "learning_rate": 4.3366201770542687e-07, "logits/chosen": 0.019076282158493996, "logits/rejected": 0.4663005471229553, "logps/chosen": -378.0471496582031, "logps/rejected": -458.962646484375, "loss": 0.8417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0055654048919678, "rewards/margins": 0.6836211085319519, "rewards/rejected": -1.689186453819275, "step": 600 }, { "epoch": 0.3139717425431711, "eval_logits/chosen": 0.5051451921463013, "eval_logits/rejected": 1.2799084186553955, "eval_logps/chosen": -390.00830078125, "eval_logps/rejected": -451.793212890625, "eval_loss": 0.810525119304657, "eval_rewards/accuracies": 0.7599206566810608, "eval_rewards/chosen": -1.0803130865097046, "eval_rewards/margins": 0.8355910778045654, "eval_rewards/rejected": -1.915903925895691, "eval_runtime": 408.0486, "eval_samples_per_second": 4.901, "eval_steps_per_second": 0.154, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 46.34613632964824, "learning_rate": 4.3053173983006395e-07, "logits/chosen": 0.4634198546409607, "logits/rejected": 1.4166696071624756, "logps/chosen": -351.01513671875, "logps/rejected": -415.1665954589844, "loss": 0.7792, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1427425146102905, "rewards/margins": 0.7582564949989319, "rewards/rejected": -1.9009990692138672, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 57.98615157781653, "learning_rate": 4.2734116582011403e-07, "logits/chosen": 0.2684568762779236, "logits/rejected": 1.96038818359375, "logps/chosen": -461.70001220703125, "logps/rejected": -465.15887451171875, "loss": 0.8167, "rewards/accuracies": 0.75, "rewards/chosen": -1.3099931478500366, "rewards/margins": 0.9379714727401733, "rewards/rejected": -2.247964859008789, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 69.43179032769773, "learning_rate": 4.2409136130137845e-07, "logits/chosen": 1.0865520238876343, "logits/rejected": 1.68658447265625, "logps/chosen": -343.22052001953125, "logps/rejected": -445.29754638671875, "loss": 0.8456, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2416740655899048, "rewards/margins": 0.7472217082977295, "rewards/rejected": -1.9888956546783447, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 48.9250805478614, "learning_rate": 4.207834116821672e-07, "logits/chosen": 1.078563928604126, "logits/rejected": 1.9194217920303345, "logps/chosen": -427.1504821777344, "logps/rejected": -477.97430419921875, "loss": 0.8182, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4011329412460327, "rewards/margins": 0.8307523727416992, "rewards/rejected": -2.2318854331970215, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 57.74424131409625, "learning_rate": 4.174184217907818e-07, "logits/chosen": 0.8527647256851196, "logits/rejected": 1.566917061805725, "logps/chosen": -439.05712890625, "logps/rejected": -472.4815368652344, "loss": 0.8095, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5909265279769897, "rewards/margins": 0.6773198843002319, "rewards/rejected": -2.2682464122772217, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 48.48445100627217, "learning_rate": 4.1399751550651084e-07, "logits/chosen": 1.2492133378982544, "logits/rejected": 1.939026117324829, "logps/chosen": -355.75885009765625, "logps/rejected": -402.2335205078125, "loss": 0.8152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2290223836898804, "rewards/margins": 0.6078513860702515, "rewards/rejected": -1.836874008178711, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 50.38517267296823, "learning_rate": 4.1052183538426426e-07, "logits/chosen": 0.5203877091407776, "logits/rejected": 1.35017991065979, "logps/chosen": -366.6665344238281, "logps/rejected": -411.147216796875, "loss": 0.8584, "rewards/accuracies": 0.75, "rewards/chosen": -0.9456546902656555, "rewards/margins": 0.7343701124191284, "rewards/rejected": -1.6800247430801392, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 85.6481969519573, "learning_rate": 4.0699254227296884e-07, "logits/chosen": 0.23912549018859863, "logits/rejected": 0.8521639704704285, "logps/chosen": -366.981201171875, "logps/rejected": -395.6986389160156, "loss": 0.8104, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.000410795211792, "rewards/margins": 0.6674467921257019, "rewards/rejected": -1.6678575277328491, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 36.74169209014277, "learning_rate": 4.034108149278543e-07, "logits/chosen": 0.6640294790267944, "logits/rejected": 1.402207612991333, "logps/chosen": -398.92315673828125, "logps/rejected": -453.0848693847656, "loss": 0.7751, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1213617324829102, "rewards/margins": 0.7801300883293152, "rewards/rejected": -1.9014918804168701, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 42.36249787285924, "learning_rate": 3.9977784961675833e-07, "logits/chosen": 0.9835656881332397, "logits/rejected": 1.715651273727417, "logps/chosen": -443.865966796875, "logps/rejected": -465.02471923828125, "loss": 0.7909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4827320575714111, "rewards/margins": 0.6749345064163208, "rewards/rejected": -2.1576666831970215, "step": 700 }, { "epoch": 0.3663003663003663, "eval_logits/chosen": 0.4494816064834595, "eval_logits/rejected": 1.4414782524108887, "eval_logps/chosen": -390.69805908203125, "eval_logps/rejected": -460.0457763671875, "eval_loss": 0.8042942881584167, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.0872102975845337, "eval_rewards/margins": 0.9112196564674377, "eval_rewards/rejected": -1.998429775238037, "eval_runtime": 414.5577, "eval_samples_per_second": 4.824, "eval_steps_per_second": 0.152, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 69.66328483999943, "learning_rate": 3.96094859720583e-07, "logits/chosen": 0.2663443386554718, "logits/rejected": 1.5525261163711548, "logps/chosen": -432.343505859375, "logps/rejected": -497.6241149902344, "loss": 0.7396, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0979771614074707, "rewards/margins": 1.0610734224319458, "rewards/rejected": -2.159050703048706, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 60.682322992428375, "learning_rate": 3.923630753280357e-07, "logits/chosen": 0.608077883720398, "logits/rejected": 1.1105670928955078, "logps/chosen": -365.9642028808594, "logps/rejected": -426.92681884765625, "loss": 0.7678, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1676547527313232, "rewards/margins": 0.6641029119491577, "rewards/rejected": -1.8317575454711914, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 66.54122939015758, "learning_rate": 3.8858374282478893e-07, "logits/chosen": 0.016317982226610184, "logits/rejected": 0.7667428255081177, "logps/chosen": -371.57305908203125, "logps/rejected": -450.36468505859375, "loss": 0.8097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9967383146286011, "rewards/margins": 0.9009350538253784, "rewards/rejected": -1.8976733684539795, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 44.84518591843858, "learning_rate": 3.8475812447719823e-07, "logits/chosen": -0.04057773947715759, "logits/rejected": 0.7534581422805786, "logps/chosen": -363.1037902832031, "logps/rejected": -390.85003662109375, "loss": 0.7773, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0838040113449097, "rewards/margins": 0.7179003953933716, "rewards/rejected": -1.8017044067382812, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 65.82227398243593, "learning_rate": 3.8088749801071496e-07, "logits/chosen": -0.04224572330713272, "logits/rejected": 0.6889729499816895, "logps/chosen": -378.70208740234375, "logps/rejected": -475.9454040527344, "loss": 0.7914, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1617001295089722, "rewards/margins": 0.9271462559700012, "rewards/rejected": -2.088846206665039, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 51.76206681652384, "learning_rate": 3.7697315618313644e-07, "logits/chosen": 0.2801060378551483, "logits/rejected": 1.2058014869689941, "logps/chosen": -369.66851806640625, "logps/rejected": -422.9747009277344, "loss": 0.8309, "rewards/accuracies": 0.75, "rewards/chosen": -1.2287108898162842, "rewards/margins": 0.7445263862609863, "rewards/rejected": -1.9732372760772705, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 62.0612113628045, "learning_rate": 3.7301640635283584e-07, "logits/chosen": 0.4703196585178375, "logits/rejected": 0.8989171981811523, "logps/chosen": -436.1560974121094, "logps/rejected": -496.692626953125, "loss": 0.8333, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4676082134246826, "rewards/margins": 0.5586889982223511, "rewards/rejected": -2.026297092437744, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 57.41721553248708, "learning_rate": 3.6901857004211443e-07, "logits/chosen": 0.9081114530563354, "logits/rejected": 1.7587671279907227, "logps/chosen": -428.9352111816406, "logps/rejected": -489.14117431640625, "loss": 0.7833, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5515849590301514, "rewards/margins": 0.9139375686645508, "rewards/rejected": -2.465522289276123, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 48.598689383894325, "learning_rate": 3.6498098249582444e-07, "logits/chosen": 1.1841216087341309, "logits/rejected": 1.674604058265686, "logps/chosen": -414.0514221191406, "logps/rejected": -506.62115478515625, "loss": 0.8168, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8437888622283936, "rewards/margins": 0.7532427906990051, "rewards/rejected": -2.597031593322754, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 101.02805283479535, "learning_rate": 3.6090499223540757e-07, "logits/chosen": 1.3700300455093384, "logits/rejected": 1.9168527126312256, "logps/chosen": -518.4417724609375, "logps/rejected": -559.4696044921875, "loss": 0.8545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0815773010253906, "rewards/margins": 0.6288520693778992, "rewards/rejected": -2.7104296684265137, "step": 800 }, { "epoch": 0.4186289900575615, "eval_logits/chosen": 0.9682077765464783, "eval_logits/rejected": 1.7931019067764282, "eval_logps/chosen": -463.3611145019531, "eval_logps/rejected": -521.2472534179688, "eval_loss": 0.8064725399017334, "eval_rewards/accuracies": 0.7559523582458496, "eval_rewards/chosen": -1.8138411045074463, "eval_rewards/margins": 0.7966035604476929, "eval_rewards/rejected": -2.6104445457458496, "eval_runtime": 412.1114, "eval_samples_per_second": 4.853, "eval_steps_per_second": 0.153, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 54.180926505957565, "learning_rate": 3.5679196060850034e-07, "logits/chosen": 0.9972521662712097, "logits/rejected": 1.6269267797470093, "logps/chosen": -433.6529235839844, "logps/rejected": -483.6210021972656, "loss": 0.8555, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5342273712158203, "rewards/margins": 0.676469624042511, "rewards/rejected": -2.2106969356536865, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 35.87010193584681, "learning_rate": 3.5264326133425464e-07, "logits/chosen": 0.29808443784713745, "logits/rejected": 0.9751693606376648, "logps/chosen": -388.32952880859375, "logps/rejected": -425.7991638183594, "loss": 0.8143, "rewards/accuracies": 0.75, "rewards/chosen": -1.2507455348968506, "rewards/margins": 0.6961800456047058, "rewards/rejected": -1.9469257593154907, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 52.049103604382175, "learning_rate": 3.4846028004452693e-07, "logits/chosen": -0.38219153881073, "logits/rejected": 0.2841838598251343, "logps/chosen": -344.09307861328125, "logps/rejected": -391.5385437011719, "loss": 0.876, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9970030784606934, "rewards/margins": 0.5505674481391907, "rewards/rejected": -1.5475704669952393, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 56.22696532074088, "learning_rate": 3.4424441382108826e-07, "logits/chosen": -0.6143403053283691, "logits/rejected": 0.4311816096305847, "logps/chosen": -400.6844787597656, "logps/rejected": -431.4810485839844, "loss": 0.7825, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8071960210800171, "rewards/margins": 0.8356133699417114, "rewards/rejected": -1.642809510231018, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 58.48724014340389, "learning_rate": 3.399970707290105e-07, "logits/chosen": 0.0011257259175181389, "logits/rejected": 0.8019648790359497, "logps/chosen": -358.78546142578125, "logps/rejected": -397.22698974609375, "loss": 0.8097, "rewards/accuracies": 0.75, "rewards/chosen": -1.0374311208724976, "rewards/margins": 0.7070726156234741, "rewards/rejected": -1.7445037364959717, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 69.53882656236874, "learning_rate": 3.3571966934638376e-07, "logits/chosen": 0.44737330079078674, "logits/rejected": 0.7749794125556946, "logps/chosen": -356.664306640625, "logps/rejected": -470.05230712890625, "loss": 0.769, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0418094396591187, "rewards/margins": 0.882452666759491, "rewards/rejected": -1.924262285232544, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 53.877647411408944, "learning_rate": 3.314136382905234e-07, "logits/chosen": 0.3656928539276123, "logits/rejected": 0.7745137810707092, "logps/chosen": -402.55523681640625, "logps/rejected": -458.47637939453125, "loss": 0.8074, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1821459531784058, "rewards/margins": 0.715279757976532, "rewards/rejected": -1.897425651550293, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 65.77438111350074, "learning_rate": 3.270804157408225e-07, "logits/chosen": 0.17270120978355408, "logits/rejected": 1.0893535614013672, "logps/chosen": -448.43560791015625, "logps/rejected": -449.55230712890625, "loss": 0.8026, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4231189489364624, "rewards/margins": 0.6511213183403015, "rewards/rejected": -2.074240207672119, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 44.39519225481988, "learning_rate": 3.227214489584128e-07, "logits/chosen": -0.4358143210411072, "logits/rejected": 0.053479015827178955, "logps/chosen": -426.91778564453125, "logps/rejected": -460.1244201660156, "loss": 0.829, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2934101819992065, "rewards/margins": 0.6437898874282837, "rewards/rejected": -1.9372001886367798, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 63.00866806866458, "learning_rate": 3.1833819380279023e-07, "logits/chosen": -0.7756184935569763, "logits/rejected": -0.2751461863517761, "logps/chosen": -357.54150390625, "logps/rejected": -505.11712646484375, "loss": 0.7903, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9035024642944336, "rewards/margins": 1.2450222969055176, "rewards/rejected": -2.1485252380371094, "step": 900 }, { "epoch": 0.47095761381475665, "eval_logits/chosen": -1.1342899799346924, "eval_logits/rejected": -0.48295506834983826, "eval_logps/chosen": -371.87213134765625, "eval_logps/rejected": -433.2554016113281, "eval_loss": 0.8039537072181702, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -0.8989505171775818, "eval_rewards/margins": 0.8315751552581787, "eval_rewards/rejected": -1.7305257320404053, "eval_runtime": 414.7482, "eval_samples_per_second": 4.822, "eval_steps_per_second": 0.152, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 64.44714812473441, "learning_rate": 3.139321142455703e-07, "logits/chosen": -1.1097314357757568, "logits/rejected": -0.2634844183921814, "logps/chosen": -319.09796142578125, "logps/rejected": -401.9964599609375, "loss": 0.8329, "rewards/accuracies": 0.75, "rewards/chosen": -0.8718813061714172, "rewards/margins": 0.8985443115234375, "rewards/rejected": -1.77042555809021, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 45.0051150055344, "learning_rate": 3.095046818815331e-07, "logits/chosen": -1.04958975315094, "logits/rejected": -0.40570583939552307, "logps/chosen": -383.8529968261719, "logps/rejected": -435.478759765625, "loss": 0.7896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0067728757858276, "rewards/margins": 0.6971748471260071, "rewards/rejected": -1.7039477825164795, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 53.667295615114405, "learning_rate": 3.0505737543712275e-07, "logits/chosen": -0.6021451354026794, "logits/rejected": -0.11517045646905899, "logps/chosen": -384.52581787109375, "logps/rejected": -419.0810546875, "loss": 0.7798, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9730221033096313, "rewards/margins": 0.6482226848602295, "rewards/rejected": -1.62124502658844, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 81.05705838335177, "learning_rate": 3.0059168027656475e-07, "logits/chosen": -0.3525870740413666, "logits/rejected": 0.6466337442398071, "logps/chosen": -390.93280029296875, "logps/rejected": -439.4302673339844, "loss": 0.7723, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1053266525268555, "rewards/margins": 0.8592435121536255, "rewards/rejected": -1.9645700454711914, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 55.17334155861909, "learning_rate": 2.9610908790576663e-07, "logits/chosen": -0.33681556582450867, "logits/rejected": 0.7191028594970703, "logps/chosen": -391.11737060546875, "logps/rejected": -463.0302734375, "loss": 0.793, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1846669912338257, "rewards/margins": 0.9837862253189087, "rewards/rejected": -2.1684532165527344, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 48.9582899063176, "learning_rate": 2.9161109547416667e-07, "logits/chosen": -0.3603673279285431, "logits/rejected": 0.849152684211731, "logps/chosen": -411.8035583496094, "logps/rejected": -507.90887451171875, "loss": 0.8013, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5003211498260498, "rewards/margins": 1.0145832300186157, "rewards/rejected": -2.514904499053955, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 57.29854674109045, "learning_rate": 2.8709920527469834e-07, "logits/chosen": -0.7571192383766174, "logits/rejected": 0.572393000125885, "logps/chosen": -433.143798828125, "logps/rejected": -467.4877014160156, "loss": 0.7955, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1437658071517944, "rewards/margins": 1.0298449993133545, "rewards/rejected": -2.1736106872558594, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 37.24912096181186, "learning_rate": 2.8257492424203685e-07, "logits/chosen": -1.0983556509017944, "logits/rejected": 0.3809484839439392, "logps/chosen": -407.6033020019531, "logps/rejected": -474.23516845703125, "loss": 0.7669, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0250732898712158, "rewards/margins": 1.089968204498291, "rewards/rejected": -2.115041494369507, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 56.21664517730591, "learning_rate": 2.780397634492949e-07, "logits/chosen": -0.49527350068092346, "logits/rejected": 0.6699007153511047, "logps/chosen": -395.61102294921875, "logps/rejected": -454.3805236816406, "loss": 0.8112, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.305254340171814, "rewards/margins": 0.9790977239608765, "rewards/rejected": -2.2843518257141113, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 101.07546570412681, "learning_rate": 2.7349523760333674e-07, "logits/chosen": 0.031771112233400345, "logits/rejected": 0.39206039905548096, "logps/chosen": -387.6393127441406, "logps/rejected": -444.1368713378906, "loss": 0.7805, "rewards/accuracies": 0.75, "rewards/chosen": -1.3543338775634766, "rewards/margins": 0.7986514568328857, "rewards/rejected": -2.152985095977783, "step": 1000 }, { "epoch": 0.5232862375719518, "eval_logits/chosen": 0.11115988343954086, "eval_logits/rejected": 1.2055213451385498, "eval_logps/chosen": -440.97509765625, "eval_logps/rejected": -517.9630737304688, "eval_loss": 0.7873586416244507, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -1.5899808406829834, "eval_rewards/margins": 0.9876214861869812, "eval_rewards/rejected": -2.5776023864746094, "eval_runtime": 408.5958, "eval_samples_per_second": 4.895, "eval_steps_per_second": 0.154, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 52.40512625800263, "learning_rate": 2.6894286453887827e-07, "logits/chosen": -0.13681717216968536, "logits/rejected": 0.8107517957687378, "logps/chosen": -426.64251708984375, "logps/rejected": -537.076416015625, "loss": 0.7774, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3897221088409424, "rewards/margins": 1.117490291595459, "rewards/rejected": -2.5072124004364014, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 59.75424454695307, "learning_rate": 2.6438416471154273e-07, "logits/chosen": -0.12451888620853424, "logits/rejected": 1.1669331789016724, "logps/chosen": -407.44879150390625, "logps/rejected": -507.1592712402344, "loss": 0.735, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2722725868225098, "rewards/margins": 1.3867954015731812, "rewards/rejected": -2.6590683460235596, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 53.599811498760715, "learning_rate": 2.598206606900406e-07, "logits/chosen": 0.016549933701753616, "logits/rejected": 1.402275800704956, "logps/chosen": -442.63177490234375, "logps/rejected": -470.9737243652344, "loss": 0.7927, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.564623475074768, "rewards/margins": 0.9005531072616577, "rewards/rejected": -2.4651763439178467, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 32.65370170558801, "learning_rate": 2.552538766476443e-07, "logits/chosen": -0.6142684817314148, "logits/rejected": 0.251781702041626, "logps/chosen": -390.28082275390625, "logps/rejected": -496.885498046875, "loss": 0.8254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3231480121612549, "rewards/margins": 0.8904861211776733, "rewards/rejected": -2.2136340141296387, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 75.3162366889796, "learning_rate": 2.5068533785312666e-07, "logits/chosen": -1.4156506061553955, "logits/rejected": -0.5838024020195007, "logps/chosen": -358.865478515625, "logps/rejected": -445.2547912597656, "loss": 0.7938, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9552318453788757, "rewards/margins": 0.8637508153915405, "rewards/rejected": -1.818982720375061, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 43.62050381714458, "learning_rate": 2.461165701613333e-07, "logits/chosen": -1.292654275894165, "logits/rejected": -0.5297520160675049, "logps/chosen": -386.48492431640625, "logps/rejected": -437.10675048828125, "loss": 0.7879, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1228294372558594, "rewards/margins": 0.7809044122695923, "rewards/rejected": -1.9037336111068726, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 45.52131263841483, "learning_rate": 2.415490995035596e-07, "logits/chosen": -1.1806949377059937, "logits/rejected": -0.24476492404937744, "logps/chosen": -408.827880859375, "logps/rejected": -472.0335998535156, "loss": 0.8107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.034440279006958, "rewards/margins": 0.8914132118225098, "rewards/rejected": -1.9258534908294678, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 59.76598969958421, "learning_rate": 2.3698445137790258e-07, "logits/chosen": -1.3123562335968018, "logits/rejected": -0.6329250335693359, "logps/chosen": -378.4320983886719, "logps/rejected": -419.2596740722656, "loss": 0.8029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0317833423614502, "rewards/margins": 0.7433665990829468, "rewards/rejected": -1.775149941444397, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 69.6973872101716, "learning_rate": 2.3242415033975575e-07, "logits/chosen": -0.9085898399353027, "logits/rejected": 0.49086079001426697, "logps/chosen": -418.4552307128906, "logps/rejected": -471.2793884277344, "loss": 0.846, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2729108333587646, "rewards/margins": 1.2000945806503296, "rewards/rejected": -2.473005533218384, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 37.21209144292686, "learning_rate": 2.2786971949262134e-07, "logits/chosen": -0.8077551126480103, "logits/rejected": 0.13687923550605774, "logps/chosen": -386.1564025878906, "logps/rejected": -452.08514404296875, "loss": 0.7927, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.346660852432251, "rewards/margins": 0.8283847570419312, "rewards/rejected": -2.1750454902648926, "step": 1100 }, { "epoch": 0.5756148613291471, "eval_logits/chosen": -0.7289183139801025, "eval_logits/rejected": 0.31206902861595154, "eval_logps/chosen": -426.6308288574219, "eval_logps/rejected": -501.3155212402344, "eval_loss": 0.785283625125885, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -1.4465374946594238, "eval_rewards/margins": 0.9645892977714539, "eval_rewards/rejected": -2.4111268520355225, "eval_runtime": 405.2266, "eval_samples_per_second": 4.936, "eval_steps_per_second": 0.155, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 44.747850264101764, "learning_rate": 2.2332267997940513e-07, "logits/chosen": -0.40378910303115845, "logits/rejected": 0.6866604089736938, "logps/chosen": -385.66259765625, "logps/rejected": -460.9515075683594, "loss": 0.8221, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.359961748123169, "rewards/margins": 0.8599497079849243, "rewards/rejected": -2.2199113368988037, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 73.28752107724188, "learning_rate": 2.1878455047436753e-07, "logits/chosen": -0.4600033760070801, "logits/rejected": 0.9429658055305481, "logps/chosen": -484.031005859375, "logps/rejected": -545.8448486328125, "loss": 0.7808, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6805721521377563, "rewards/margins": 1.0684734582901, "rewards/rejected": -2.7490456104278564, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 65.12299170442947, "learning_rate": 2.1425684667589852e-07, "logits/chosen": -0.3600446581840515, "logits/rejected": 0.7780245542526245, "logps/chosen": -443.42999267578125, "logps/rejected": -547.5982666015625, "loss": 0.772, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7521789073944092, "rewards/margins": 0.9194129109382629, "rewards/rejected": -2.6715919971466064, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 78.09081204895311, "learning_rate": 2.0974108080028692e-07, "logits/chosen": -0.860657811164856, "logits/rejected": 0.7113239169120789, "logps/chosen": -449.82952880859375, "logps/rejected": -499.965576171875, "loss": 0.7424, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5903230905532837, "rewards/margins": 0.8902558088302612, "rewards/rejected": -2.480578899383545, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 50.70575598637482, "learning_rate": 2.0523876107665194e-07, "logits/chosen": -0.7675200700759888, "logits/rejected": 0.4552821218967438, "logps/chosen": -391.4772033691406, "logps/rejected": -466.75555419921875, "loss": 0.7646, "rewards/accuracies": 0.75, "rewards/chosen": -1.3859961032867432, "rewards/margins": 0.9621819257736206, "rewards/rejected": -2.3481781482696533, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 64.51277865024055, "learning_rate": 2.0075139124320787e-07, "logits/chosen": -0.7119883298873901, "logits/rejected": 0.5894179940223694, "logps/chosen": -394.200927734375, "logps/rejected": -432.2201232910156, "loss": 0.8242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4039642810821533, "rewards/margins": 0.9810295104980469, "rewards/rejected": -2.3849940299987793, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 52.94401392391387, "learning_rate": 1.962804700450265e-07, "logits/chosen": -0.8103808164596558, "logits/rejected": 0.13401614129543304, "logps/chosen": -421.75421142578125, "logps/rejected": -518.6265869140625, "loss": 0.8182, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4089207649230957, "rewards/margins": 0.8579117655754089, "rewards/rejected": -2.2668323516845703, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 48.12312215016781, "learning_rate": 1.9182749073346943e-07, "logits/chosen": -1.0783116817474365, "logits/rejected": 0.6697069406509399, "logps/chosen": -399.8518371582031, "logps/rejected": -465.48193359375, "loss": 0.7218, "rewards/accuracies": 0.75, "rewards/chosen": -1.2468689680099487, "rewards/margins": 0.9532965421676636, "rewards/rejected": -2.200165271759033, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 42.26533735058185, "learning_rate": 1.8739394056745372e-07, "logits/chosen": -0.4827755093574524, "logits/rejected": 0.7201014757156372, "logps/chosen": -397.86981201171875, "logps/rejected": -476.1461486816406, "loss": 0.724, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.246742606163025, "rewards/margins": 0.9675485491752625, "rewards/rejected": -2.2142913341522217, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 87.1106352692028, "learning_rate": 1.8298130031671972e-07, "logits/chosen": -0.7631278038024902, "logits/rejected": 0.46218782663345337, "logps/chosen": -414.7618713378906, "logps/rejected": -487.93603515625, "loss": 0.7714, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2280397415161133, "rewards/margins": 0.9974902272224426, "rewards/rejected": -2.225529909133911, "step": 1200 }, { "epoch": 0.6279434850863422, "eval_logits/chosen": -0.8372275829315186, "eval_logits/rejected": 0.3216133415699005, "eval_logps/chosen": -411.14093017578125, "eval_logps/rejected": -491.5005187988281, "eval_loss": 0.7813597321510315, "eval_rewards/accuracies": 0.7678571343421936, "eval_rewards/chosen": -1.291638970375061, "eval_rewards/margins": 1.0213383436203003, "eval_rewards/rejected": -2.3129770755767822, "eval_runtime": 404.3361, "eval_samples_per_second": 4.946, "eval_steps_per_second": 0.156, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 46.76828951656419, "learning_rate": 1.785910437672658e-07, "logits/chosen": -0.9225088357925415, "logits/rejected": 0.8012136220932007, "logps/chosen": -435.16143798828125, "logps/rejected": -493.6361389160156, "loss": 0.7767, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.3316601514816284, "rewards/margins": 1.245229959487915, "rewards/rejected": -2.576890230178833, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 135.40344806490603, "learning_rate": 1.7422463722911624e-07, "logits/chosen": -1.0115296840667725, "logits/rejected": 0.24621915817260742, "logps/chosen": -437.01202392578125, "logps/rejected": -481.07769775390625, "loss": 0.809, "rewards/accuracies": 0.75, "rewards/chosen": -1.2778805494308472, "rewards/margins": 0.9990509748458862, "rewards/rejected": -2.2769315242767334, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 50.891064165637296, "learning_rate": 1.6988353904658492e-07, "logits/chosen": -1.0459206104278564, "logits/rejected": 0.11323748528957367, "logps/chosen": -427.8426818847656, "logps/rejected": -473.751220703125, "loss": 0.7387, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.118016004562378, "rewards/margins": 1.0759532451629639, "rewards/rejected": -2.193969249725342, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 57.62804230003882, "learning_rate": 1.6556919911120081e-07, "logits/chosen": -0.7161341309547424, "logits/rejected": 0.28161749243736267, "logps/chosen": -358.07965087890625, "logps/rejected": -437.2621154785156, "loss": 0.7675, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3208184242248535, "rewards/margins": 0.8684528470039368, "rewards/rejected": -2.1892712116241455, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 52.522850326161084, "learning_rate": 1.6128305837745546e-07, "logits/chosen": -1.0030014514923096, "logits/rejected": 0.5807463526725769, "logps/chosen": -396.1776428222656, "logps/rejected": -484.74554443359375, "loss": 0.7612, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2601598501205444, "rewards/margins": 0.9652479290962219, "rewards/rejected": -2.225407600402832, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 43.357942531950776, "learning_rate": 1.570265483815364e-07, "logits/chosen": -1.2291408777236938, "logits/rejected": 0.32300296425819397, "logps/chosen": -381.19891357421875, "logps/rejected": -430.6766052246094, "loss": 0.7584, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0204633474349976, "rewards/margins": 1.1893185377120972, "rewards/rejected": -2.209782123565674, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 53.9036931404524, "learning_rate": 1.5280109076320506e-07, "logits/chosen": -0.20181569457054138, "logits/rejected": 1.424915075302124, "logps/chosen": -432.1175842285156, "logps/rejected": -491.6654357910156, "loss": 0.7347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6122735738754272, "rewards/margins": 0.9608899354934692, "rewards/rejected": -2.5731635093688965, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 59.33368732639341, "learning_rate": 1.4860809679098158e-07, "logits/chosen": 0.1443500518798828, "logits/rejected": 1.5357582569122314, "logps/chosen": -408.69775390625, "logps/rejected": -474.74688720703125, "loss": 0.7778, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4550931453704834, "rewards/margins": 1.0619676113128662, "rewards/rejected": -2.5170607566833496, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 46.05424535338694, "learning_rate": 1.444489668907914e-07, "logits/chosen": -0.767706573009491, "logits/rejected": 0.035219788551330566, "logps/chosen": -377.96600341796875, "logps/rejected": -463.80316162109375, "loss": 0.8026, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1998144388198853, "rewards/margins": 0.7478494644165039, "rewards/rejected": -1.9476639032363892, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 73.33100634934911, "learning_rate": 1.403250901782354e-07, "logits/chosen": -0.7081127166748047, "logits/rejected": 0.5111311674118042, "logps/chosen": -397.9610290527344, "logps/rejected": -490.15313720703125, "loss": 0.7514, "rewards/accuracies": 0.75, "rewards/chosen": -1.2067577838897705, "rewards/margins": 0.9489742517471313, "rewards/rejected": -2.1557321548461914, "step": 1300 }, { "epoch": 0.6802721088435374, "eval_logits/chosen": -0.7223237752914429, "eval_logits/rejected": 0.5838782787322998, "eval_logps/chosen": -404.3044128417969, "eval_logps/rejected": -484.7901611328125, "eval_loss": 0.7837539315223694, "eval_rewards/accuracies": 0.77182537317276, "eval_rewards/chosen": -1.223273754119873, "eval_rewards/margins": 1.0225993394851685, "eval_rewards/rejected": -2.245873212814331, "eval_runtime": 411.0788, "eval_samples_per_second": 4.865, "eval_steps_per_second": 0.153, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 110.62138865967401, "learning_rate": 1.3623784399463584e-07, "logits/chosen": -0.9859231114387512, "logits/rejected": 0.5916813611984253, "logps/chosen": -399.94586181640625, "logps/rejected": -475.5068359375, "loss": 0.7889, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0855576992034912, "rewards/margins": 1.092066764831543, "rewards/rejected": -2.177624225616455, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 44.71517613491715, "learning_rate": 1.3218859344701632e-07, "logits/chosen": -0.35537010431289673, "logits/rejected": 1.0424387454986572, "logps/chosen": -418.4120178222656, "logps/rejected": -501.056640625, "loss": 0.7556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4536817073822021, "rewards/margins": 0.9854963421821594, "rewards/rejected": -2.439177989959717, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 85.57837022135045, "learning_rate": 1.2817869095216624e-07, "logits/chosen": -0.5951244235038757, "logits/rejected": 0.6276283264160156, "logps/chosen": -411.0013732910156, "logps/rejected": -552.7861938476562, "loss": 0.7364, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.205119013786316, "rewards/margins": 1.203356385231018, "rewards/rejected": -2.408475160598755, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 51.06588010311495, "learning_rate": 1.2420947578494522e-07, "logits/chosen": -0.18927031755447388, "logits/rejected": 0.9235250353813171, "logps/chosen": -433.1712341308594, "logps/rejected": -480.4144592285156, "loss": 0.7701, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4656416177749634, "rewards/margins": 0.7956992387771606, "rewards/rejected": -2.261340618133545, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 67.30883485565569, "learning_rate": 1.202822736309758e-07, "logits/chosen": -0.4543929994106293, "logits/rejected": 1.075971007347107, "logps/chosen": -431.24493408203125, "logps/rejected": -527.0070190429688, "loss": 0.7557, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.244934320449829, "rewards/margins": 1.1924127340316772, "rewards/rejected": -2.437346935272217, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 56.271665810067795, "learning_rate": 1.1639839614387572e-07, "logits/chosen": -0.5630747079849243, "logits/rejected": 0.7446565628051758, "logps/chosen": -435.5628967285156, "logps/rejected": -518.8600463867188, "loss": 0.8152, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.189913034439087, "rewards/margins": 1.073265790939331, "rewards/rejected": -2.263178586959839, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 56.327753340524126, "learning_rate": 1.1255914050717552e-07, "logits/chosen": -0.38961178064346313, "logits/rejected": 0.9478599429130554, "logps/chosen": -420.0284729003906, "logps/rejected": -472.03106689453125, "loss": 0.7142, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3269022703170776, "rewards/margins": 0.9800853729248047, "rewards/rejected": -2.306987762451172, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 53.12467021296384, "learning_rate": 1.0876578900107053e-07, "logits/chosen": -0.6571879386901855, "logits/rejected": 1.0737478733062744, "logps/chosen": -379.66265869140625, "logps/rejected": -458.2452087402344, "loss": 0.759, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.257787823677063, "rewards/margins": 1.0836238861083984, "rewards/rejected": -2.341411590576172, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 58.33202649287008, "learning_rate": 1.050196085741491e-07, "logits/chosen": -0.060309648513793945, "logits/rejected": 0.7888890504837036, "logps/chosen": -385.73162841796875, "logps/rejected": -460.37298583984375, "loss": 0.749, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3211241960525513, "rewards/margins": 0.8563600778579712, "rewards/rejected": -2.1774842739105225, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 58.42668102545408, "learning_rate": 1.0132185042024246e-07, "logits/chosen": 0.19692227244377136, "logits/rejected": 1.4302622079849243, "logps/chosen": -416.8544006347656, "logps/rejected": -491.1285095214844, "loss": 0.7356, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6231311559677124, "rewards/margins": 0.9427364468574524, "rewards/rejected": -2.5658679008483887, "step": 1400 }, { "epoch": 0.7326007326007326, "eval_logits/chosen": -0.11784098297357559, "eval_logits/rejected": 1.2245301008224487, "eval_logps/chosen": -431.15155029296875, "eval_logps/rejected": -514.0866088867188, "eval_loss": 0.7766879796981812, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -1.4917447566986084, "eval_rewards/margins": 1.047093391418457, "eval_rewards/rejected": -2.5388379096984863, "eval_runtime": 404.9348, "eval_samples_per_second": 4.939, "eval_steps_per_second": 0.156, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 66.76056371509945, "learning_rate": 9.767374956053584e-08, "logits/chosen": 0.0736219733953476, "logits/rejected": 1.6470451354980469, "logps/chosen": -408.95489501953125, "logps/rejected": -498.83734130859375, "loss": 0.8638, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5531549453735352, "rewards/margins": 0.975112795829773, "rewards/rejected": -2.5282678604125977, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 44.29932463683008, "learning_rate": 9.407652443108192e-08, "logits/chosen": -0.04026483744382858, "logits/rejected": 1.169201135635376, "logps/chosen": -423.7627868652344, "logps/rejected": -484.053466796875, "loss": 0.8113, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6051456928253174, "rewards/margins": 0.8360587358474731, "rewards/rejected": -2.44120454788208, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 59.43819999358329, "learning_rate": 9.053137647585229e-08, "logits/chosen": 0.0075632184743881226, "logits/rejected": 0.9761813282966614, "logps/chosen": -400.4501953125, "logps/rejected": -461.3731994628906, "loss": 0.7358, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4282331466674805, "rewards/margins": 0.8752400279045105, "rewards/rejected": -2.3034732341766357, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 64.45381303244972, "learning_rate": 8.70394897454659e-08, "logits/chosen": -0.6237810850143433, "logits/rejected": 0.8538782000541687, "logps/chosen": -413.45465087890625, "logps/rejected": -487.5758361816406, "loss": 0.7702, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0352585315704346, "rewards/margins": 1.2015461921691895, "rewards/rejected": -2.236804485321045, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 39.92473052936565, "learning_rate": 8.360203050172488e-08, "logits/chosen": -0.381369024515152, "logits/rejected": 0.5927223563194275, "logps/chosen": -406.38775634765625, "logps/rejected": -461.39324951171875, "loss": 0.7642, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3674395084381104, "rewards/margins": 0.7517277002334595, "rewards/rejected": -2.1191673278808594, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 49.387776834643546, "learning_rate": 8.022014682809305e-08, "logits/chosen": -0.053152896463871, "logits/rejected": 1.0388424396514893, "logps/chosen": -409.360595703125, "logps/rejected": -455.10662841796875, "loss": 0.8063, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4831829071044922, "rewards/margins": 0.8227430582046509, "rewards/rejected": -2.3059258460998535, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 51.7234518248411, "learning_rate": 7.689496824624525e-08, "logits/chosen": -0.36129945516586304, "logits/rejected": 1.0099319219589233, "logps/chosen": -383.045654296875, "logps/rejected": -451.1670837402344, "loss": 0.7746, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2285678386688232, "rewards/margins": 0.9865066409111023, "rewards/rejected": -2.2150747776031494, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 57.77055304826112, "learning_rate": 7.362760533881649e-08, "logits/chosen": 0.2819596529006958, "logits/rejected": 1.0644227266311646, "logps/chosen": -385.4560546875, "logps/rejected": -437.28466796875, "loss": 0.7994, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.398332118988037, "rewards/margins": 0.626512348651886, "rewards/rejected": -2.024844169616699, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 57.34379108958058, "learning_rate": 7.041914937847584e-08, "logits/chosen": 0.07771716266870499, "logits/rejected": 1.1199547052383423, "logps/chosen": -404.61810302734375, "logps/rejected": -468.77734375, "loss": 0.7516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.365378737449646, "rewards/margins": 0.8668498992919922, "rewards/rejected": -2.2322287559509277, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 58.06019157785553, "learning_rate": 6.727067196345099e-08, "logits/chosen": -0.18102507293224335, "logits/rejected": 0.7657878398895264, "logps/chosen": -360.10906982421875, "logps/rejected": -484.46173095703125, "loss": 0.7475, "rewards/accuracies": 0.75, "rewards/chosen": -1.3445265293121338, "rewards/margins": 1.082401156425476, "rewards/rejected": -2.4269278049468994, "step": 1500 }, { "epoch": 0.7849293563579278, "eval_logits/chosen": -0.212717205286026, "eval_logits/rejected": 1.0528589487075806, "eval_logps/chosen": -417.6551818847656, "eval_logps/rejected": -496.0364074707031, "eval_loss": 0.7755566239356995, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -1.3567817211151123, "eval_rewards/margins": 1.001554250717163, "eval_rewards/rejected": -2.3583357334136963, "eval_runtime": 409.0375, "eval_samples_per_second": 4.89, "eval_steps_per_second": 0.154, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 46.14399971238, "learning_rate": 6.418322465962233e-08, "logits/chosen": 0.3324340879917145, "logits/rejected": 1.1771190166473389, "logps/chosen": -418.973876953125, "logps/rejected": -522.1957397460938, "loss": 0.812, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.657254934310913, "rewards/margins": 0.9044488668441772, "rewards/rejected": -2.5617034435272217, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 37.18508348319721, "learning_rate": 6.115783864930905e-08, "logits/chosen": -0.19596168398857117, "logits/rejected": 1.1539347171783447, "logps/chosen": -353.31866455078125, "logps/rejected": -445.51495361328125, "loss": 0.7564, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2385740280151367, "rewards/margins": 1.0164680480957031, "rewards/rejected": -2.25504207611084, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 52.91354119052287, "learning_rate": 5.8195524386862374e-08, "logits/chosen": -0.3700222074985504, "logits/rejected": 0.2583986818790436, "logps/chosen": -421.437744140625, "logps/rejected": -567.0833740234375, "loss": 0.7151, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.259513258934021, "rewards/margins": 0.8530277013778687, "rewards/rejected": -2.1125407218933105, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 40.01896284065745, "learning_rate": 5.529727126118228e-08, "logits/chosen": -0.13671225309371948, "logits/rejected": 0.7043404579162598, "logps/chosen": -467.30694580078125, "logps/rejected": -535.9297485351562, "loss": 0.7686, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5604652166366577, "rewards/margins": 0.9186226725578308, "rewards/rejected": -2.4790878295898438, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 46.124018443801454, "learning_rate": 5.246404726526918e-08, "logits/chosen": -0.4600491523742676, "logits/rejected": 0.564723014831543, "logps/chosen": -449.27508544921875, "logps/rejected": -464.333251953125, "loss": 0.7744, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1245921850204468, "rewards/margins": 1.0609307289123535, "rewards/rejected": -2.1855227947235107, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 50.49063385312754, "learning_rate": 4.969679867292276e-08, "logits/chosen": -0.1635294407606125, "logits/rejected": 0.7217547297477722, "logps/chosen": -434.32562255859375, "logps/rejected": -510.69378662109375, "loss": 0.8104, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3322641849517822, "rewards/margins": 0.8803984522819519, "rewards/rejected": -2.212662935256958, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 75.03096730706677, "learning_rate": 4.6996449722693315e-08, "logits/chosen": -0.06163690239191055, "logits/rejected": 1.269026756286621, "logps/chosen": -383.6000061035156, "logps/rejected": -468.09637451171875, "loss": 0.7823, "rewards/accuracies": 0.75, "rewards/chosen": -1.356022834777832, "rewards/margins": 1.0311052799224854, "rewards/rejected": -2.3871281147003174, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 63.72060054169525, "learning_rate": 4.436390230919465e-08, "logits/chosen": -0.34788089990615845, "logits/rejected": 1.6296714544296265, "logps/chosen": -444.2071838378906, "logps/rejected": -509.6371154785156, "loss": 0.8362, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.350345492362976, "rewards/margins": 1.1345208883285522, "rewards/rejected": -2.4848666191101074, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 66.7327349321288, "learning_rate": 4.180003568187776e-08, "logits/chosen": -0.08060705661773682, "logits/rejected": 1.4282660484313965, "logps/chosen": -401.8912048339844, "logps/rejected": -481.07208251953125, "loss": 0.7394, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2525925636291504, "rewards/margins": 1.085805892944336, "rewards/rejected": -2.3383984565734863, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 51.38346556962735, "learning_rate": 3.930570615136919e-08, "logits/chosen": -0.021803859621286392, "logits/rejected": 0.8460650444030762, "logps/chosen": -399.91607666015625, "logps/rejected": -513.109619140625, "loss": 0.7625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4438254833221436, "rewards/margins": 0.8929357528686523, "rewards/rejected": -2.336761236190796, "step": 1600 }, { "epoch": 0.837257980115123, "eval_logits/chosen": -0.42059749364852905, "eval_logits/rejected": 0.7869580388069153, "eval_logps/chosen": -404.67962646484375, "eval_logps/rejected": -483.9888000488281, "eval_loss": 0.7750742435455322, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -1.227026343345642, "eval_rewards/margins": 1.0108333826065063, "eval_rewards/rejected": -2.2378594875335693, "eval_runtime": 441.2045, "eval_samples_per_second": 4.533, "eval_steps_per_second": 0.143, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 52.46864246860495, "learning_rate": 3.6881746803469756e-08, "logits/chosen": -0.7535071969032288, "logits/rejected": 0.33819881081581116, "logps/chosen": -422.0393981933594, "logps/rejected": -465.2088928222656, "loss": 0.7698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0208406448364258, "rewards/margins": 0.8941202163696289, "rewards/rejected": -1.9149608612060547, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 59.00985096818891, "learning_rate": 3.452896722091128e-08, "logits/chosen": -0.07647248357534409, "logits/rejected": 1.697693109512329, "logps/chosen": -402.09527587890625, "logps/rejected": -459.788818359375, "loss": 0.7798, "rewards/accuracies": 0.75, "rewards/chosen": -1.3960047960281372, "rewards/margins": 1.0105960369110107, "rewards/rejected": -2.4066009521484375, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 84.84338531611888, "learning_rate": 3.2248153212961677e-08, "logits/chosen": -0.11207184940576553, "logits/rejected": 0.8803804516792297, "logps/chosen": -412.326904296875, "logps/rejected": -515.648193359375, "loss": 0.7876, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3124299049377441, "rewards/margins": 1.0197770595550537, "rewards/rejected": -2.332207202911377, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 83.82267718892379, "learning_rate": 3.004006655297209e-08, "logits/chosen": -0.34982767701148987, "logits/rejected": 0.7822405099868774, "logps/chosen": -453.8860778808594, "logps/rejected": -502.5855407714844, "loss": 0.7795, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2538963556289673, "rewards/margins": 0.9325149655342102, "rewards/rejected": -2.1864113807678223, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 55.73523059497138, "learning_rate": 2.7905444723949762e-08, "logits/chosen": -0.08788832277059555, "logits/rejected": 1.0154967308044434, "logps/chosen": -422.19854736328125, "logps/rejected": -506.4143981933594, "loss": 0.7932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4875541925430298, "rewards/margins": 0.9180333018302917, "rewards/rejected": -2.4055874347686768, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 60.02309079239001, "learning_rate": 2.5845000672245572e-08, "logits/chosen": -0.22186072170734406, "logits/rejected": 0.9693048596382141, "logps/chosen": -362.5050964355469, "logps/rejected": -469.5879821777344, "loss": 0.7794, "rewards/accuracies": 0.75, "rewards/chosen": -1.1964887380599976, "rewards/margins": 1.1623802185058594, "rewards/rejected": -2.3588690757751465, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 61.49726125377505, "learning_rate": 2.385942256943499e-08, "logits/chosen": -0.22105903923511505, "logits/rejected": 1.207337498664856, "logps/chosen": -457.2879943847656, "logps/rejected": -508.7391052246094, "loss": 0.7471, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5363842248916626, "rewards/margins": 0.9326260685920715, "rewards/rejected": -2.4690101146698, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 71.74052715646674, "learning_rate": 2.194937358247506e-08, "logits/chosen": 0.1824917197227478, "logits/rejected": 1.4263757467269897, "logps/chosen": -393.04388427734375, "logps/rejected": -506.92315673828125, "loss": 0.7237, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3214830160140991, "rewards/margins": 1.0443296432495117, "rewards/rejected": -2.3658127784729004, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 86.67114662265121, "learning_rate": 2.011549165221127e-08, "logits/chosen": 0.1338118612766266, "logits/rejected": 1.4261093139648438, "logps/chosen": -404.98028564453125, "logps/rejected": -494.10400390625, "loss": 0.7183, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3930095434188843, "rewards/margins": 1.1570873260498047, "rewards/rejected": -2.5500969886779785, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 66.54431398831392, "learning_rate": 1.8358389280311303e-08, "logits/chosen": 0.024025822058320045, "logits/rejected": 1.8998467922210693, "logps/chosen": -424.21636962890625, "logps/rejected": -464.24432373046875, "loss": 0.7493, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2815468311309814, "rewards/margins": 1.0820454359054565, "rewards/rejected": -2.3635926246643066, "step": 1700 }, { "epoch": 0.8895866038723181, "eval_logits/chosen": -0.08914075046777725, "eval_logits/rejected": 1.2325109243392944, "eval_logps/chosen": -419.86297607421875, "eval_logps/rejected": -502.7920227050781, "eval_loss": 0.7748306393623352, "eval_rewards/accuracies": 0.77182537317276, "eval_rewards/chosen": -1.378859519958496, "eval_rewards/margins": 1.0470322370529175, "eval_rewards/rejected": -2.425891637802124, "eval_runtime": 410.6438, "eval_samples_per_second": 4.87, "eval_steps_per_second": 0.153, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 82.91107716704815, "learning_rate": 1.6678653324693787e-08, "logits/chosen": 0.09943423420190811, "logits/rejected": 1.5888570547103882, "logps/chosen": -432.12664794921875, "logps/rejected": -515.0614013671875, "loss": 0.8028, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.421820878982544, "rewards/margins": 1.0157796144485474, "rewards/rejected": -2.4376003742218018, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 45.16664485799351, "learning_rate": 1.507684480352292e-08, "logits/chosen": 0.10240556299686432, "logits/rejected": 1.514649748802185, "logps/chosen": -426.95184326171875, "logps/rejected": -487.20465087890625, "loss": 0.7514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4028398990631104, "rewards/margins": 0.9551857113838196, "rewards/rejected": -2.3580257892608643, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 66.0030113062902, "learning_rate": 1.3553498707832761e-08, "logits/chosen": -0.15374299883842468, "logits/rejected": 0.9044667482376099, "logps/chosen": -376.7266845703125, "logps/rejected": -478.06884765625, "loss": 0.7573, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2118138074874878, "rewards/margins": 1.0770864486694336, "rewards/rejected": -2.288900136947632, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 51.55163467051401, "learning_rate": 1.2109123822844653e-08, "logits/chosen": -0.31222978234291077, "logits/rejected": 1.1838620901107788, "logps/chosen": -431.54449462890625, "logps/rejected": -472.8711853027344, "loss": 0.8026, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4353928565979004, "rewards/margins": 0.8553255796432495, "rewards/rejected": -2.2907185554504395, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 64.20088111350992, "learning_rate": 1.0744202558037014e-08, "logits/chosen": -0.27103763818740845, "logits/rejected": 0.9759141802787781, "logps/chosen": -451.8807067871094, "logps/rejected": -521.4490966796875, "loss": 0.8208, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1822432279586792, "rewards/margins": 1.0939109325408936, "rewards/rejected": -2.2761542797088623, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 42.55359587813972, "learning_rate": 9.459190786024696e-09, "logits/chosen": -0.07091750204563141, "logits/rejected": 0.7565028071403503, "logps/chosen": -404.9520568847656, "logps/rejected": -519.3016967773438, "loss": 0.7198, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3031336069107056, "rewards/margins": 0.9350326657295227, "rewards/rejected": -2.238166332244873, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 98.92030583827494, "learning_rate": 8.254517690300944e-09, "logits/chosen": -0.25574153661727905, "logits/rejected": 0.7122960686683655, "logps/chosen": -427.6814880371094, "logps/rejected": -519.8585205078125, "loss": 0.7696, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.337904691696167, "rewards/margins": 0.9577649235725403, "rewards/rejected": -2.2956700325012207, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 33.628882825474335, "learning_rate": 7.130585621893809e-09, "logits/chosen": 0.29787325859069824, "logits/rejected": 1.4469048976898193, "logps/chosen": -426.8885192871094, "logps/rejected": -520.8005981445312, "loss": 0.7802, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4531055688858032, "rewards/margins": 1.0528675317764282, "rewards/rejected": -2.5059728622436523, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 37.7353977570388, "learning_rate": 6.0877699649840574e-09, "logits/chosen": -0.5353270769119263, "logits/rejected": 0.20474915206432343, "logps/chosen": -430.41058349609375, "logps/rejected": -490.887939453125, "loss": 0.7494, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2458174228668213, "rewards/margins": 0.7676967978477478, "rewards/rejected": -2.013514518737793, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 62.943056200154395, "learning_rate": 5.126419011529992e-09, "logits/chosen": -0.013582557439804077, "logits/rejected": 1.1790709495544434, "logps/chosen": -389.61468505859375, "logps/rejected": -512.7748413085938, "loss": 0.7604, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.347475290298462, "rewards/margins": 1.0473517179489136, "rewards/rejected": -2.394826889038086, "step": 1800 }, { "epoch": 0.9419152276295133, "eval_logits/chosen": -0.22020134329795837, "eval_logits/rejected": 1.0741875171661377, "eval_logps/chosen": -415.20343017578125, "eval_logps/rejected": -497.237548828125, "eval_loss": 0.7742918729782104, "eval_rewards/accuracies": 0.77182537317276, "eval_rewards/chosen": -1.3322640657424927, "eval_rewards/margins": 1.0380831956863403, "eval_rewards/rejected": -2.370347023010254, "eval_runtime": 428.9144, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.147, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 90.30930652350084, "learning_rate": 4.246853844940723e-09, "logits/chosen": 0.003252273891121149, "logits/rejected": 1.5239484310150146, "logps/chosen": -379.72174072265625, "logps/rejected": -447.86767578125, "loss": 0.7845, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.443015217781067, "rewards/margins": 0.8682249784469604, "rewards/rejected": -2.3112399578094482, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 54.501538855304695, "learning_rate": 3.449368232836869e-09, "logits/chosen": -0.10718011856079102, "logits/rejected": 0.7375177145004272, "logps/chosen": -327.4703369140625, "logps/rejected": -422.5323181152344, "loss": 0.7536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.159043312072754, "rewards/margins": 0.9762653112411499, "rewards/rejected": -2.1353087425231934, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 90.11800044253823, "learning_rate": 2.734228528934679e-09, "logits/chosen": -0.20851321518421173, "logits/rejected": 0.8201483488082886, "logps/chosen": -419.8130798339844, "logps/rejected": -480.1256408691406, "loss": 0.8026, "rewards/accuracies": 0.75, "rewards/chosen": -1.3179951906204224, "rewards/margins": 0.9454077482223511, "rewards/rejected": -2.2634031772613525, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 53.240324076115115, "learning_rate": 2.1016735840859447e-09, "logits/chosen": -0.23634688556194305, "logits/rejected": 1.3548707962036133, "logps/chosen": -421.292724609375, "logps/rejected": -501.21075439453125, "loss": 0.7467, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.359490990638733, "rewards/margins": 1.1585173606872559, "rewards/rejected": -2.518008232116699, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 55.77868101016801, "learning_rate": 1.551914666503812e-09, "logits/chosen": -0.5564178228378296, "logits/rejected": 1.2977861166000366, "logps/chosen": -473.85498046875, "logps/rejected": -522.0704345703125, "loss": 0.7551, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1998566389083862, "rewards/margins": 1.1426351070404053, "rewards/rejected": -2.342491865158081, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 55.07443979489719, "learning_rate": 1.0851353912008642e-09, "logits/chosen": -0.2731333374977112, "logits/rejected": 1.2076146602630615, "logps/chosen": -402.7923889160156, "logps/rejected": -454.7273864746094, "loss": 0.7814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2705854177474976, "rewards/margins": 0.870033860206604, "rewards/rejected": -2.1406192779541016, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 73.19792172491795, "learning_rate": 7.014916586632336e-10, "logits/chosen": -0.23980256915092468, "logits/rejected": 1.3596718311309814, "logps/chosen": -383.4898376464844, "logps/rejected": -445.1707458496094, "loss": 0.7587, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.203575849533081, "rewards/margins": 0.938707172870636, "rewards/rejected": -2.1422829627990723, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 48.665901256766034, "learning_rate": 4.011116027811956e-10, "logits/chosen": -0.41328221559524536, "logits/rejected": 0.7479709386825562, "logps/chosen": -386.04315185546875, "logps/rejected": -518.6088256835938, "loss": 0.7558, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.227036952972412, "rewards/margins": 1.0733864307403564, "rewards/rejected": -2.3004233837127686, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 71.59152487698616, "learning_rate": 1.840955480532924e-10, "logits/chosen": -0.103136345744133, "logits/rejected": 0.5122213363647461, "logps/chosen": -479.87548828125, "logps/rejected": -522.3965454101562, "loss": 0.7835, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2658259868621826, "rewards/margins": 0.7818335890769958, "rewards/rejected": -2.047659397125244, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 83.56566983759409, "learning_rate": 5.051597607894087e-11, "logits/chosen": -0.28623780608177185, "logits/rejected": 0.8169389963150024, "logps/chosen": -390.40338134765625, "logps/rejected": -486.4009704589844, "loss": 0.7654, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3803126811981201, "rewards/margins": 0.9602187871932983, "rewards/rejected": -2.340531587600708, "step": 1900 }, { "epoch": 0.9942438513867086, "eval_logits/chosen": -0.218413844704628, "eval_logits/rejected": 1.0746814012527466, "eval_logps/chosen": -415.4467468261719, "eval_logps/rejected": -497.4510498046875, "eval_loss": 0.7743334770202637, "eval_rewards/accuracies": 0.77182537317276, "eval_rewards/chosen": -1.3346970081329346, "eval_rewards/margins": 1.037785291671753, "eval_rewards/rejected": -2.3724822998046875, "eval_runtime": 429.1862, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.147, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 34.04731232198274, "learning_rate": 4.1750135001961117e-13, "logits/chosen": 0.16522832214832306, "logits/rejected": 1.3153358697891235, "logps/chosen": -456.4495544433594, "logps/rejected": -518.9078369140625, "loss": 0.7578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2935277223587036, "rewards/margins": 1.182199239730835, "rewards/rejected": -2.475727081298828, "step": 1910 }, { "epoch": 1.0, "step": 1911, "total_flos": 0.0, "train_loss": 0.45257438286507334, "train_runtime": 23936.1745, "train_samples_per_second": 2.554, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }