| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 225.0, | |
| "eval_steps": 500, | |
| "global_step": 450, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 276.0401916503906, | |
| "epoch": 1.0, | |
| "grad_norm": 1.3174133685527831, | |
| "kl": 0.0, | |
| "learning_rate": 7.142857142857142e-08, | |
| "loss": -0.0, | |
| "reward": 1.6562499403953552, | |
| "reward_std": 0.4393078237771988, | |
| "rewards/answer_reward_func": 0.7499999701976776, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 258.4851303100586, | |
| "epoch": 2.0, | |
| "grad_norm": 1.1213410501938252, | |
| "kl": 0.0002532005310058594, | |
| "learning_rate": 1.4285714285714285e-07, | |
| "loss": 0.0, | |
| "reward": 1.6145833730697632, | |
| "reward_std": 0.5048187077045441, | |
| "rewards/answer_reward_func": 0.7395834028720856, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 195.49553680419922, | |
| "epoch": 3.0, | |
| "grad_norm": 1.0016861118507379, | |
| "kl": 0.00025844573974609375, | |
| "learning_rate": 2.1428571428571426e-07, | |
| "loss": 0.0, | |
| "reward": 1.7062500715255737, | |
| "reward_std": 0.35797178000211716, | |
| "rewards/answer_reward_func": 0.768750011920929, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 229.18155670166016, | |
| "epoch": 4.0, | |
| "grad_norm": 1.2018085544480408, | |
| "kl": 0.0002665519714355469, | |
| "learning_rate": 2.857142857142857e-07, | |
| "loss": 0.0, | |
| "reward": 1.5499999523162842, | |
| "reward_std": 0.6482782661914825, | |
| "rewards/answer_reward_func": 0.706250011920929, | |
| "rewards/format_reward_func": 0.8437500298023224, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 229.14732360839844, | |
| "epoch": 5.0, | |
| "grad_norm": 1.0483380939796498, | |
| "kl": 0.00025463104248046875, | |
| "learning_rate": 3.5714285714285716e-07, | |
| "loss": 0.0, | |
| "reward": 1.5416666865348816, | |
| "reward_std": 0.5995936393737793, | |
| "rewards/answer_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 264.84227752685547, | |
| "epoch": 6.0, | |
| "grad_norm": 0.9353624844800403, | |
| "kl": 0.00026226043701171875, | |
| "learning_rate": 4.285714285714285e-07, | |
| "loss": 0.0, | |
| "reward": 1.6062499284744263, | |
| "reward_std": 0.5437474548816681, | |
| "rewards/answer_reward_func": 0.7104166746139526, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 250.52828216552734, | |
| "epoch": 7.0, | |
| "grad_norm": 0.9582379182354116, | |
| "kl": 0.00023317337036132812, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 1.4354166388511658, | |
| "reward_std": 0.6316270232200623, | |
| "rewards/answer_reward_func": 0.6229166984558105, | |
| "rewards/format_reward_func": 0.8125000298023224, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 251.17857360839844, | |
| "epoch": 8.0, | |
| "grad_norm": 0.9981178249237231, | |
| "kl": 0.00026416778564453125, | |
| "learning_rate": 4.999740409224932e-07, | |
| "loss": 0.0, | |
| "reward": 1.683333396911621, | |
| "reward_std": 0.4488546848297119, | |
| "rewards/answer_reward_func": 0.7562499940395355, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 193.23214721679688, | |
| "epoch": 9.0, | |
| "grad_norm": 0.9392773566245655, | |
| "kl": 0.0002613067626953125, | |
| "learning_rate": 4.998961690809627e-07, | |
| "loss": 0.0, | |
| "reward": 1.6416666507720947, | |
| "reward_std": 0.5428857207298279, | |
| "rewards/answer_reward_func": 0.7354166507720947, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 260.5014953613281, | |
| "epoch": 10.0, | |
| "grad_norm": 1.1286522636909715, | |
| "kl": 0.00028705596923828125, | |
| "learning_rate": 4.997664006472578e-07, | |
| "loss": 0.0, | |
| "reward": 1.4458333849906921, | |
| "reward_std": 0.6854044795036316, | |
| "rewards/answer_reward_func": 0.6541666686534882, | |
| "rewards/format_reward_func": 0.7916666865348816, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 227.18006896972656, | |
| "epoch": 11.0, | |
| "grad_norm": 1.207761352187691, | |
| "kl": 0.00022029876708984375, | |
| "learning_rate": 4.995847625707292e-07, | |
| "loss": 0.0, | |
| "reward": 1.7687500715255737, | |
| "reward_std": 0.3790818303823471, | |
| "rewards/answer_reward_func": 0.8208333849906921, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 286.2544708251953, | |
| "epoch": 12.0, | |
| "grad_norm": 0.8620527379171249, | |
| "kl": 0.00023794174194335938, | |
| "learning_rate": 4.993512925726318e-07, | |
| "loss": 0.0, | |
| "reward": 1.5958333611488342, | |
| "reward_std": 0.5891686081886292, | |
| "rewards/answer_reward_func": 0.7208333313465118, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 257.24107360839844, | |
| "epoch": 13.0, | |
| "grad_norm": 1.0091569196984636, | |
| "kl": 0.00029087066650390625, | |
| "learning_rate": 4.990660391382923e-07, | |
| "loss": 0.0, | |
| "reward": 1.645833432674408, | |
| "reward_std": 0.5208363234996796, | |
| "rewards/answer_reward_func": 0.7500000298023224, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 242.4806671142578, | |
| "epoch": 14.0, | |
| "grad_norm": 1.0541615732715437, | |
| "kl": 0.000263214111328125, | |
| "learning_rate": 4.987290615070384e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5553774684667587, | |
| "rewards/answer_reward_func": 0.6770833432674408, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 282.1651916503906, | |
| "epoch": 15.0, | |
| "grad_norm": 0.9687037221617821, | |
| "kl": 0.0002727508544921875, | |
| "learning_rate": 4.983404296598978e-07, | |
| "loss": 0.0, | |
| "reward": 1.618749976158142, | |
| "reward_std": 0.5825705528259277, | |
| "rewards/answer_reward_func": 0.7333332896232605, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 296.96429443359375, | |
| "epoch": 16.0, | |
| "grad_norm": 0.9052560635102206, | |
| "kl": 0.00027942657470703125, | |
| "learning_rate": 4.979002243050646e-07, | |
| "loss": 0.0, | |
| "reward": 1.6041666865348816, | |
| "reward_std": 0.5438288599252701, | |
| "rewards/answer_reward_func": 0.7187500596046448, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 252.49703216552734, | |
| "epoch": 17.0, | |
| "grad_norm": 1.2903003085293443, | |
| "kl": 0.000274658203125, | |
| "learning_rate": 4.974085368611381e-07, | |
| "loss": 0.0, | |
| "reward": 1.4916666746139526, | |
| "reward_std": 0.6277603805065155, | |
| "rewards/answer_reward_func": 0.6895833611488342, | |
| "rewards/format_reward_func": 0.8020833432674408, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 232.88690948486328, | |
| "epoch": 18.0, | |
| "grad_norm": 1.244565109610492, | |
| "kl": 0.0002675056457519531, | |
| "learning_rate": 4.968654694381379e-07, | |
| "loss": 0.0, | |
| "reward": 1.543749988079071, | |
| "reward_std": 0.5515827536582947, | |
| "rewards/answer_reward_func": 0.6687500178813934, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 243.01786041259766, | |
| "epoch": 19.0, | |
| "grad_norm": 1.1406446283382206, | |
| "kl": 0.000301361083984375, | |
| "learning_rate": 4.962711348162987e-07, | |
| "loss": 0.0, | |
| "reward": 1.5791666507720947, | |
| "reward_std": 0.5238045454025269, | |
| "rewards/answer_reward_func": 0.6937499940395355, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 274.58631896972656, | |
| "epoch": 20.0, | |
| "grad_norm": 0.9070852985537547, | |
| "kl": 0.00029468536376953125, | |
| "learning_rate": 4.956256564226487e-07, | |
| "loss": 0.0, | |
| "reward": 1.583333432674408, | |
| "reward_std": 0.5769022554159164, | |
| "rewards/answer_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 269.13245391845703, | |
| "epoch": 21.0, | |
| "grad_norm": 0.9374825554752583, | |
| "kl": 0.0003261566162109375, | |
| "learning_rate": 4.949291683053768e-07, | |
| "loss": 0.0, | |
| "reward": 1.574999988079071, | |
| "reward_std": 0.459792360663414, | |
| "rewards/answer_reward_func": 0.7000000178813934, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 219.09673309326172, | |
| "epoch": 22.0, | |
| "grad_norm": 1.0791631460921267, | |
| "kl": 0.00034618377685546875, | |
| "learning_rate": 4.941818151059955e-07, | |
| "loss": 0.0, | |
| "reward": 1.6729166507720947, | |
| "reward_std": 0.416482537984848, | |
| "rewards/answer_reward_func": 0.7666666507720947, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 295.5714416503906, | |
| "epoch": 23.0, | |
| "grad_norm": 0.9749875952370675, | |
| "kl": 0.000362396240234375, | |
| "learning_rate": 4.933837520293017e-07, | |
| "loss": 0.0, | |
| "reward": 1.5750000476837158, | |
| "reward_std": 0.46283578872680664, | |
| "rewards/answer_reward_func": 0.6687500476837158, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 252.36013793945312, | |
| "epoch": 24.0, | |
| "grad_norm": 1.0280635613370392, | |
| "kl": 0.00028324127197265625, | |
| "learning_rate": 4.925351448111454e-07, | |
| "loss": 0.0, | |
| "reward": 1.6520832777023315, | |
| "reward_std": 0.4525061100721359, | |
| "rewards/answer_reward_func": 0.7458333671092987, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 282.2053680419922, | |
| "epoch": 25.0, | |
| "grad_norm": 0.9539175555414532, | |
| "kl": 0.00036907196044921875, | |
| "learning_rate": 4.91636169684011e-07, | |
| "loss": 0.0, | |
| "reward": 1.5645832419395447, | |
| "reward_std": 0.5369424521923065, | |
| "rewards/answer_reward_func": 0.7000000476837158, | |
| "rewards/format_reward_func": 0.8645833432674408, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 251.30060577392578, | |
| "epoch": 26.0, | |
| "grad_norm": 1.044878374634429, | |
| "kl": 0.00030231475830078125, | |
| "learning_rate": 4.906870133404186e-07, | |
| "loss": 0.0, | |
| "reward": 1.5916666388511658, | |
| "reward_std": 0.4689074903726578, | |
| "rewards/answer_reward_func": 0.7375000417232513, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 299.84674072265625, | |
| "epoch": 27.0, | |
| "grad_norm": 1.2843916568865072, | |
| "kl": 0.00040435791015625, | |
| "learning_rate": 4.896878728941531e-07, | |
| "loss": 0.0, | |
| "reward": 1.508333444595337, | |
| "reward_std": 0.6762565672397614, | |
| "rewards/answer_reward_func": 0.6854166984558105, | |
| "rewards/format_reward_func": 0.8229166865348816, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 243.796142578125, | |
| "epoch": 28.0, | |
| "grad_norm": 0.900181145652731, | |
| "kl": 0.0004787445068359375, | |
| "learning_rate": 4.886389558393284e-07, | |
| "loss": 0.0, | |
| "reward": 1.7166666984558105, | |
| "reward_std": 0.39391638338565826, | |
| "rewards/answer_reward_func": 0.7895833253860474, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 231.99553680419922, | |
| "epoch": 29.0, | |
| "grad_norm": 1.0630743880615185, | |
| "kl": 0.000530242919921875, | |
| "learning_rate": 4.875404800072976e-07, | |
| "loss": 0.0, | |
| "reward": 1.6895833611488342, | |
| "reward_std": 0.3743758350610733, | |
| "rewards/answer_reward_func": 0.762499988079071, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 269.95982360839844, | |
| "epoch": 30.0, | |
| "grad_norm": 1.0709394734964757, | |
| "kl": 0.00045013427734375, | |
| "learning_rate": 4.86392673521415e-07, | |
| "loss": 0.0, | |
| "reward": 1.5354167222976685, | |
| "reward_std": 0.5249515026807785, | |
| "rewards/answer_reward_func": 0.6604166626930237, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 261.08631896972656, | |
| "epoch": 31.0, | |
| "grad_norm": 0.8641809174103421, | |
| "kl": 0.0004787445068359375, | |
| "learning_rate": 4.851957747496606e-07, | |
| "loss": 0.0, | |
| "reward": 1.5916667580604553, | |
| "reward_std": 0.46947768330574036, | |
| "rewards/answer_reward_func": 0.6958333253860474, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 269.31846618652344, | |
| "epoch": 32.0, | |
| "grad_norm": 0.8600216221217903, | |
| "kl": 0.000507354736328125, | |
| "learning_rate": 4.839500322551386e-07, | |
| "loss": 0.0, | |
| "reward": 1.6083332896232605, | |
| "reward_std": 0.43799301981925964, | |
| "rewards/answer_reward_func": 0.7229167222976685, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 267.78721618652344, | |
| "epoch": 33.0, | |
| "grad_norm": 1.0025448441816613, | |
| "kl": 0.000667572021484375, | |
| "learning_rate": 4.826557047444563e-07, | |
| "loss": 0.0, | |
| "reward": 1.4520833492279053, | |
| "reward_std": 0.6994097530841827, | |
| "rewards/answer_reward_func": 0.6187500357627869, | |
| "rewards/format_reward_func": 0.8333333730697632, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 188.45387268066406, | |
| "epoch": 34.0, | |
| "grad_norm": 1.126524575820377, | |
| "kl": 0.00066375732421875, | |
| "learning_rate": 4.813130610139993e-07, | |
| "loss": 0.0, | |
| "reward": 1.7291666269302368, | |
| "reward_std": 0.4704573303461075, | |
| "rewards/answer_reward_func": 0.8125, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 261.0133972167969, | |
| "epoch": 35.0, | |
| "grad_norm": 1.022490845344434, | |
| "kl": 0.000644683837890625, | |
| "learning_rate": 4.799223798941089e-07, | |
| "loss": 0.0, | |
| "reward": 1.6062500476837158, | |
| "reward_std": 0.41893763840198517, | |
| "rewards/answer_reward_func": 0.6791667342185974, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 256.7976379394531, | |
| "epoch": 36.0, | |
| "grad_norm": 1.1014000572624154, | |
| "kl": 0.0005702972412109375, | |
| "learning_rate": 4.78483950191177e-07, | |
| "loss": 0.0, | |
| "reward": 1.6583333611488342, | |
| "reward_std": 0.3685748428106308, | |
| "rewards/answer_reward_func": 0.7312500774860382, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 243.47917938232422, | |
| "epoch": 37.0, | |
| "grad_norm": 0.9707014299449478, | |
| "kl": 0.000640869140625, | |
| "learning_rate": 4.769980706276687e-07, | |
| "loss": 0.0, | |
| "reward": 1.6625000834465027, | |
| "reward_std": 0.4419756233692169, | |
| "rewards/answer_reward_func": 0.7458333373069763, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 263.03424072265625, | |
| "epoch": 38.0, | |
| "grad_norm": 1.1820665004678212, | |
| "kl": 0.0007991790771484375, | |
| "learning_rate": 4.7546504978008595e-07, | |
| "loss": 0.0, | |
| "reward": 1.6458333730697632, | |
| "reward_std": 0.4905931055545807, | |
| "rewards/answer_reward_func": 0.7395833730697632, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 261.6458511352539, | |
| "epoch": 39.0, | |
| "grad_norm": 1.1213274436179879, | |
| "kl": 0.0009899139404296875, | |
| "learning_rate": 4.738852060148848e-07, | |
| "loss": 0.0, | |
| "reward": 1.64166659116745, | |
| "reward_std": 0.48026007413864136, | |
| "rewards/answer_reward_func": 0.7354167103767395, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 207.6875, | |
| "epoch": 40.0, | |
| "grad_norm": 1.1448749496253416, | |
| "kl": 0.0008602142333984375, | |
| "learning_rate": 4.722588674223593e-07, | |
| "loss": 0.0, | |
| "reward": 1.5166666507720947, | |
| "reward_std": 0.6429217755794525, | |
| "rewards/answer_reward_func": 0.6729167103767395, | |
| "rewards/format_reward_func": 0.8437500298023224, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 266.53125, | |
| "epoch": 41.0, | |
| "grad_norm": 1.811926744689833, | |
| "kl": 0.00104522705078125, | |
| "learning_rate": 4.70586371748506e-07, | |
| "loss": 0.0, | |
| "reward": 1.7104166746139526, | |
| "reward_std": 0.4602654129266739, | |
| "rewards/answer_reward_func": 0.783333420753479, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 194.20238876342773, | |
| "epoch": 42.0, | |
| "grad_norm": 1.270434695255441, | |
| "kl": 0.0009632110595703125, | |
| "learning_rate": 4.6886806632488363e-07, | |
| "loss": 0.0, | |
| "reward": 1.5916666984558105, | |
| "reward_std": 0.5647197067737579, | |
| "rewards/answer_reward_func": 0.737500011920929, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 275.46429443359375, | |
| "epoch": 43.0, | |
| "grad_norm": 1.2073773531159788, | |
| "kl": 0.000888824462890625, | |
| "learning_rate": 4.6710430799648143e-07, | |
| "loss": 0.0, | |
| "reward": 1.477083444595337, | |
| "reward_std": 0.5555408447980881, | |
| "rewards/answer_reward_func": 0.612500011920929, | |
| "rewards/format_reward_func": 0.8645833432674408, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 282.5639953613281, | |
| "epoch": 44.0, | |
| "grad_norm": 1.064636286784791, | |
| "kl": 0.00095367431640625, | |
| "learning_rate": 4.652954630476127e-07, | |
| "loss": 0.0, | |
| "reward": 1.6437500715255737, | |
| "reward_std": 0.3799179792404175, | |
| "rewards/answer_reward_func": 0.7062500715255737, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 272.7544708251953, | |
| "epoch": 45.0, | |
| "grad_norm": 1.042110371163498, | |
| "kl": 0.00125885009765625, | |
| "learning_rate": 4.6344190712584713e-07, | |
| "loss": 0.0, | |
| "reward": 1.5416666865348816, | |
| "reward_std": 0.5731079876422882, | |
| "rewards/answer_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 256.8943634033203, | |
| "epoch": 46.0, | |
| "grad_norm": 0.9870127861284985, | |
| "kl": 0.001251220703125, | |
| "learning_rate": 4.615440251639995e-07, | |
| "loss": 0.0, | |
| "reward": 1.65625, | |
| "reward_std": 0.4869039058685303, | |
| "rewards/answer_reward_func": 0.7395833432674408, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 211.87054443359375, | |
| "epoch": 47.0, | |
| "grad_norm": 0.9671737851691804, | |
| "kl": 0.001361846923828125, | |
| "learning_rate": 4.596022113001894e-07, | |
| "loss": 0.0, | |
| "reward": 1.6625000834465027, | |
| "reward_std": 0.5325099229812622, | |
| "rewards/answer_reward_func": 0.7666666805744171, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 193.9732208251953, | |
| "epoch": 48.0, | |
| "grad_norm": 0.967143862670542, | |
| "kl": 0.001125335693359375, | |
| "learning_rate": 4.576168687959895e-07, | |
| "loss": 0.0, | |
| "reward": 1.5833333730697632, | |
| "reward_std": 0.5218211710453033, | |
| "rewards/answer_reward_func": 0.6874999701976776, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 235.55209350585938, | |
| "epoch": 49.0, | |
| "grad_norm": 0.8679119146176711, | |
| "kl": 0.00141143798828125, | |
| "learning_rate": 4.555884099526793e-07, | |
| "loss": 0.0, | |
| "reward": 1.6437500715255737, | |
| "reward_std": 0.4265412539243698, | |
| "rewards/answer_reward_func": 0.7479167282581329, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 258.24107360839844, | |
| "epoch": 50.0, | |
| "grad_norm": 0.9735600110189709, | |
| "kl": 0.00128936767578125, | |
| "learning_rate": 4.5351725602562174e-07, | |
| "loss": 0.0, | |
| "reward": 1.5458332896232605, | |
| "reward_std": 0.5863806307315826, | |
| "rewards/answer_reward_func": 0.6812499761581421, | |
| "rewards/format_reward_func": 0.8645833432674408, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 251.37798309326172, | |
| "epoch": 51.0, | |
| "grad_norm": 1.1010153912626721, | |
| "kl": 0.00257110595703125, | |
| "learning_rate": 4.514038371367791e-07, | |
| "loss": 0.0, | |
| "reward": 1.6875000596046448, | |
| "reward_std": 0.3308330178260803, | |
| "rewards/answer_reward_func": 0.7395833432674408, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 265.4866180419922, | |
| "epoch": 52.0, | |
| "grad_norm": 0.9819207836814513, | |
| "kl": 0.0019073486328125, | |
| "learning_rate": 4.4924859218538936e-07, | |
| "loss": 0.0, | |
| "reward": 1.631250023841858, | |
| "reward_std": 0.4357140362262726, | |
| "rewards/answer_reward_func": 0.7250000238418579, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 233.65030670166016, | |
| "epoch": 53.0, | |
| "grad_norm": 1.0300874744860617, | |
| "kl": 0.001468658447265625, | |
| "learning_rate": 4.470519687568185e-07, | |
| "loss": 0.0, | |
| "reward": 1.4812501072883606, | |
| "reward_std": 0.5527896136045456, | |
| "rewards/answer_reward_func": 0.6375000774860382, | |
| "rewards/format_reward_func": 0.84375, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 206.92857360839844, | |
| "epoch": 54.0, | |
| "grad_norm": 1.1141272522177064, | |
| "kl": 0.001773834228515625, | |
| "learning_rate": 4.4481442302960923e-07, | |
| "loss": 0.0, | |
| "reward": 1.7687500715255737, | |
| "reward_std": 0.3796774446964264, | |
| "rewards/answer_reward_func": 0.820833295583725, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 221.09673309326172, | |
| "epoch": 55.0, | |
| "grad_norm": 1.352829863229064, | |
| "kl": 0.002471923828125, | |
| "learning_rate": 4.4253641968074505e-07, | |
| "loss": 0.0, | |
| "reward": 1.5208333730697632, | |
| "reward_std": 0.5520410537719727, | |
| "rewards/answer_reward_func": 0.6562500298023224, | |
| "rewards/format_reward_func": 0.8645833432674408, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 257.77232360839844, | |
| "epoch": 56.0, | |
| "grad_norm": 1.1920858052182373, | |
| "kl": 0.00225067138671875, | |
| "learning_rate": 4.402184317891501e-07, | |
| "loss": 0.0, | |
| "reward": 1.5583334565162659, | |
| "reward_std": 0.5900199711322784, | |
| "rewards/answer_reward_func": 0.6937500238418579, | |
| "rewards/format_reward_func": 0.8645833730697632, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 235.68899536132812, | |
| "epoch": 57.0, | |
| "grad_norm": 1.3653230138170256, | |
| "kl": 0.00194549560546875, | |
| "learning_rate": 4.37860940737443e-07, | |
| "loss": 0.0, | |
| "reward": 1.6520833373069763, | |
| "reward_std": 0.46646520495414734, | |
| "rewards/answer_reward_func": 0.7458333969116211, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 258.796142578125, | |
| "epoch": 58.0, | |
| "grad_norm": 0.8403380498881522, | |
| "kl": 0.001804351806640625, | |
| "learning_rate": 4.354644361119671e-07, | |
| "loss": 0.0, | |
| "reward": 1.600000023841858, | |
| "reward_std": 0.505972146987915, | |
| "rewards/answer_reward_func": 0.7145833671092987, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 249.9494171142578, | |
| "epoch": 59.0, | |
| "grad_norm": 1.0657379909631177, | |
| "kl": 0.001987457275390625, | |
| "learning_rate": 4.3302941560111716e-07, | |
| "loss": 0.0, | |
| "reward": 1.6354166865348816, | |
| "reward_std": 0.4670645296573639, | |
| "rewards/answer_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 223.75149536132812, | |
| "epoch": 60.0, | |
| "grad_norm": 1.0364423920157473, | |
| "kl": 0.0026397705078125, | |
| "learning_rate": 4.3055638489198236e-07, | |
| "loss": 0.0, | |
| "reward": 1.6645833849906921, | |
| "reward_std": 0.36297454684972763, | |
| "rewards/answer_reward_func": 0.7270833849906921, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 231.37798309326172, | |
| "epoch": 61.0, | |
| "grad_norm": 0.689843165639308, | |
| "kl": 0.00292205810546875, | |
| "learning_rate": 4.280458575653296e-07, | |
| "loss": 0.0, | |
| "reward": 1.625, | |
| "reward_std": 0.4028606414794922, | |
| "rewards/answer_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 205.21726989746094, | |
| "epoch": 62.0, | |
| "grad_norm": 0.9343619997416344, | |
| "kl": 0.002593994140625, | |
| "learning_rate": 4.2549835498894665e-07, | |
| "loss": 0.0, | |
| "reward": 1.6812500357627869, | |
| "reward_std": 0.3913162499666214, | |
| "rewards/answer_reward_func": 0.7645833492279053, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 244.34524536132812, | |
| "epoch": 63.0, | |
| "grad_norm": 0.8737346807108217, | |
| "kl": 0.00287628173828125, | |
| "learning_rate": 4.229144062093679e-07, | |
| "loss": 0.0, | |
| "reward": 1.6604167222976685, | |
| "reward_std": 0.3475506007671356, | |
| "rewards/answer_reward_func": 0.7229166924953461, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 207.37053680419922, | |
| "epoch": 64.0, | |
| "grad_norm": 0.9790196814028996, | |
| "kl": 0.003173828125, | |
| "learning_rate": 4.2029454784200675e-07, | |
| "loss": 0.0, | |
| "reward": 1.5520833730697632, | |
| "reward_std": 0.5492734163999557, | |
| "rewards/answer_reward_func": 0.6770833432674408, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 243.30506896972656, | |
| "epoch": 65.0, | |
| "grad_norm": 0.8985131154749594, | |
| "kl": 0.00261688232421875, | |
| "learning_rate": 4.1763932395971433e-07, | |
| "loss": 0.0, | |
| "reward": 1.5479167103767395, | |
| "reward_std": 0.5215294808149338, | |
| "rewards/answer_reward_func": 0.6729166805744171, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 187.57887268066406, | |
| "epoch": 66.0, | |
| "grad_norm": 1.3006465432399807, | |
| "kl": 0.00403594970703125, | |
| "learning_rate": 4.1494928597979117e-07, | |
| "loss": 0.0, | |
| "reward": 1.7729166746139526, | |
| "reward_std": 0.29627224802970886, | |
| "rewards/answer_reward_func": 0.8250000774860382, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 234.69644165039062, | |
| "epoch": 67.0, | |
| "grad_norm": 1.3361777010265043, | |
| "kl": 0.003692626953125, | |
| "learning_rate": 4.122249925494726e-07, | |
| "loss": 0.0, | |
| "reward": 1.6229167580604553, | |
| "reward_std": 0.42341606318950653, | |
| "rewards/answer_reward_func": 0.7062499821186066, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 202.57589721679688, | |
| "epoch": 68.0, | |
| "grad_norm": 1.17339881745284, | |
| "kl": 0.004425048828125, | |
| "learning_rate": 4.094670094299131e-07, | |
| "loss": 0.0, | |
| "reward": 1.6874999403953552, | |
| "reward_std": 0.44907137751579285, | |
| "rewards/answer_reward_func": 0.7708333134651184, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 213.21875762939453, | |
| "epoch": 69.0, | |
| "grad_norm": 0.7429382258526259, | |
| "kl": 0.00403594970703125, | |
| "learning_rate": 4.066759093786931e-07, | |
| "loss": 0.0, | |
| "reward": 1.756250023841858, | |
| "reward_std": 0.4163903295993805, | |
| "rewards/answer_reward_func": 0.8291666805744171, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 270.60120391845703, | |
| "epoch": 70.0, | |
| "grad_norm": 0.9593428203594789, | |
| "kl": 0.00347137451171875, | |
| "learning_rate": 4.038522720308732e-07, | |
| "loss": 0.0, | |
| "reward": 1.7083333134651184, | |
| "reward_std": 0.5051226913928986, | |
| "rewards/answer_reward_func": 0.8125000298023224, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 278.47471618652344, | |
| "epoch": 71.0, | |
| "grad_norm": 0.9836950616198032, | |
| "kl": 0.00399017333984375, | |
| "learning_rate": 4.009966837786194e-07, | |
| "loss": 0.0, | |
| "reward": 1.5312500596046448, | |
| "reward_std": 0.5006265342235565, | |
| "rewards/answer_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 262.28125, | |
| "epoch": 72.0, | |
| "grad_norm": 1.007097659012535, | |
| "kl": 0.00365447998046875, | |
| "learning_rate": 3.981097376494259e-07, | |
| "loss": 0.0, | |
| "reward": 1.5854166746139526, | |
| "reward_std": 0.4385734647512436, | |
| "rewards/answer_reward_func": 0.679166704416275, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 201.98065948486328, | |
| "epoch": 73.0, | |
| "grad_norm": 0.9873674045003755, | |
| "kl": 0.00390625, | |
| "learning_rate": 3.951920331829592e-07, | |
| "loss": 0.0, | |
| "reward": 1.7104167342185974, | |
| "reward_std": 0.41417770087718964, | |
| "rewards/answer_reward_func": 0.7833333313465118, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 233.78274536132812, | |
| "epoch": 74.0, | |
| "grad_norm": 0.8019175311994922, | |
| "kl": 0.0042877197265625, | |
| "learning_rate": 3.922441763065506e-07, | |
| "loss": 0.0, | |
| "reward": 1.6625000834465027, | |
| "reward_std": 0.5313624888658524, | |
| "rewards/answer_reward_func": 0.7666666507720947, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 215.09673309326172, | |
| "epoch": 75.0, | |
| "grad_norm": 1.1807131764400134, | |
| "kl": 0.004913330078125, | |
| "learning_rate": 3.8926677920936093e-07, | |
| "loss": 0.0, | |
| "reward": 1.7541666626930237, | |
| "reward_std": 0.37399402260780334, | |
| "rewards/answer_reward_func": 0.8270833790302277, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 236.639892578125, | |
| "epoch": 76.0, | |
| "grad_norm": 0.8847773717166705, | |
| "kl": 0.00482177734375, | |
| "learning_rate": 3.862604602152464e-07, | |
| "loss": 0.0, | |
| "reward": 1.600000023841858, | |
| "reward_std": 0.4837402403354645, | |
| "rewards/answer_reward_func": 0.6937500536441803, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 256.1979217529297, | |
| "epoch": 77.0, | |
| "grad_norm": 1.1407877139333158, | |
| "kl": 0.00395965576171875, | |
| "learning_rate": 3.8322584365434934e-07, | |
| "loss": 0.0, | |
| "reward": 1.6541666984558105, | |
| "reward_std": 0.4243648201227188, | |
| "rewards/answer_reward_func": 0.7479166686534882, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 273.0744094848633, | |
| "epoch": 78.0, | |
| "grad_norm": 0.9978906361186654, | |
| "kl": 0.004241943359375, | |
| "learning_rate": 3.8016355973344173e-07, | |
| "loss": 0.0, | |
| "reward": 1.5770833492279053, | |
| "reward_std": 0.43396155536174774, | |
| "rewards/answer_reward_func": 0.6812500357627869, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 184.15774536132812, | |
| "epoch": 79.0, | |
| "grad_norm": 1.6097549828202833, | |
| "kl": 0.006256103515625, | |
| "learning_rate": 3.7707424440504863e-07, | |
| "loss": 0.0, | |
| "reward": 1.5937500596046448, | |
| "reward_std": 0.47065815329551697, | |
| "rewards/answer_reward_func": 0.6770833432674408, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 258.27679443359375, | |
| "epoch": 80.0, | |
| "grad_norm": 1.0827800443504645, | |
| "kl": 0.0040435791015625, | |
| "learning_rate": 3.739585392353787e-07, | |
| "loss": 0.0, | |
| "reward": 1.5458333492279053, | |
| "reward_std": 0.48180142045021057, | |
| "rewards/answer_reward_func": 0.6500000357627869, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 225.63839721679688, | |
| "epoch": 81.0, | |
| "grad_norm": 0.9501823254901924, | |
| "kl": 0.0042724609375, | |
| "learning_rate": 3.7081709127108767e-07, | |
| "loss": 0.0, | |
| "reward": 1.7666666507720947, | |
| "reward_std": 0.300030842423439, | |
| "rewards/answer_reward_func": 0.8187500238418579, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 197.98214721679688, | |
| "epoch": 82.0, | |
| "grad_norm": 1.1449475824136237, | |
| "kl": 0.0059356689453125, | |
| "learning_rate": 3.6765055290490513e-07, | |
| "loss": 0.0, | |
| "reward": 1.7208333611488342, | |
| "reward_std": 0.40443800389766693, | |
| "rewards/answer_reward_func": 0.7937500774860382, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 279.3407897949219, | |
| "epoch": 83.0, | |
| "grad_norm": 1.0930660569963664, | |
| "kl": 0.00414276123046875, | |
| "learning_rate": 3.644595817401501e-07, | |
| "loss": 0.0, | |
| "reward": 1.6229166984558105, | |
| "reward_std": 0.3718283176422119, | |
| "rewards/answer_reward_func": 0.6854166984558105, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 267.35120391845703, | |
| "epoch": 84.0, | |
| "grad_norm": 0.9881293106299988, | |
| "kl": 0.005523681640625, | |
| "learning_rate": 3.6124484045416483e-07, | |
| "loss": 0.0, | |
| "reward": 1.6041667461395264, | |
| "reward_std": 0.5452571213245392, | |
| "rewards/answer_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 195.10565948486328, | |
| "epoch": 85.0, | |
| "grad_norm": 0.917551086901663, | |
| "kl": 0.0063629150390625, | |
| "learning_rate": 3.580069966606949e-07, | |
| "loss": 0.0, | |
| "reward": 1.5958333015441895, | |
| "reward_std": 0.42350558936595917, | |
| "rewards/answer_reward_func": 0.679166704416275, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 279.7306594848633, | |
| "epoch": 86.0, | |
| "grad_norm": 0.9580308294096218, | |
| "kl": 0.00469970703125, | |
| "learning_rate": 3.547467227712444e-07, | |
| "loss": 0.0, | |
| "reward": 1.6729167699813843, | |
| "reward_std": 0.43303608894348145, | |
| "rewards/answer_reward_func": 0.7458333373069763, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 275.31846618652344, | |
| "epoch": 87.0, | |
| "grad_norm": 1.413142917482677, | |
| "kl": 0.006561279296875, | |
| "learning_rate": 3.5146469585543386e-07, | |
| "loss": 0.0, | |
| "reward": 1.48333340883255, | |
| "reward_std": 0.5618228912353516, | |
| "rewards/answer_reward_func": 0.6291667222976685, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 261.55804443359375, | |
| "epoch": 88.0, | |
| "grad_norm": 1.2298546234313372, | |
| "kl": 0.0056304931640625, | |
| "learning_rate": 3.481615975003922e-07, | |
| "loss": 0.0, | |
| "reward": 1.7395833730697632, | |
| "reward_std": 0.4074627459049225, | |
| "rewards/answer_reward_func": 0.8020833432674408, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 225.52976989746094, | |
| "epoch": 89.0, | |
| "grad_norm": 0.9886590206068148, | |
| "kl": 0.0056610107421875, | |
| "learning_rate": 3.448381136692089e-07, | |
| "loss": 0.0, | |
| "reward": 1.6395833492279053, | |
| "reward_std": 0.46003973484039307, | |
| "rewards/answer_reward_func": 0.7333333790302277, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 233.577392578125, | |
| "epoch": 90.0, | |
| "grad_norm": 1.015531261972698, | |
| "kl": 0.00555419921875, | |
| "learning_rate": 3.4149493455847897e-07, | |
| "loss": 0.0, | |
| "reward": 1.6687501072883606, | |
| "reward_std": 0.45826081931591034, | |
| "rewards/answer_reward_func": 0.7520833313465118, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 259.03424072265625, | |
| "epoch": 91.0, | |
| "grad_norm": 0.979842350756012, | |
| "kl": 0.005523681640625, | |
| "learning_rate": 3.3813275445496766e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625000596046448, | |
| "reward_std": 0.5489896833896637, | |
| "rewards/answer_reward_func": 0.6875000596046448, | |
| "rewards/format_reward_func": 0.875, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 293.6592330932617, | |
| "epoch": 92.0, | |
| "grad_norm": 0.9379840832402985, | |
| "kl": 0.0054168701171875, | |
| "learning_rate": 3.347522715914262e-07, | |
| "loss": 0.0, | |
| "reward": 1.5604167580604553, | |
| "reward_std": 0.5173148959875107, | |
| "rewards/answer_reward_func": 0.6750000417232513, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 231.08185577392578, | |
| "epoch": 93.0, | |
| "grad_norm": 1.3551218453321658, | |
| "kl": 0.0079193115234375, | |
| "learning_rate": 3.313541880015877e-07, | |
| "loss": 0.0, | |
| "reward": 1.6229167580604553, | |
| "reward_std": 0.498469278216362, | |
| "rewards/answer_reward_func": 0.7166667282581329, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 184.6636962890625, | |
| "epoch": 94.0, | |
| "grad_norm": 0.7343039648552588, | |
| "kl": 0.0069122314453125, | |
| "learning_rate": 3.279392093743747e-07, | |
| "loss": 0.0, | |
| "reward": 1.6437500715255737, | |
| "reward_std": 0.40093202888965607, | |
| "rewards/answer_reward_func": 0.7374999821186066, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 164.43750381469727, | |
| "epoch": 95.0, | |
| "grad_norm": 1.3333212202349367, | |
| "kl": 0.009368896484375, | |
| "learning_rate": 3.245080449073459e-07, | |
| "loss": 0.0, | |
| "reward": 1.518750011920929, | |
| "reward_std": 0.5581459701061249, | |
| "rewards/answer_reward_func": 0.6645833551883698, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 243.56846618652344, | |
| "epoch": 96.0, | |
| "grad_norm": 0.9202461820347047, | |
| "kl": 0.004638671875, | |
| "learning_rate": 3.210614071594162e-07, | |
| "loss": 0.0, | |
| "reward": 1.5416667461395264, | |
| "reward_std": 0.6040982007980347, | |
| "rewards/answer_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 0.875, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 226.83928680419922, | |
| "epoch": 97.0, | |
| "grad_norm": 1.0722947522372448, | |
| "kl": 0.0070037841796875, | |
| "learning_rate": 3.1760001190287695e-07, | |
| "loss": 0.0, | |
| "reward": 1.6041668057441711, | |
| "reward_std": 0.4665968716144562, | |
| "rewards/answer_reward_func": 0.7083334028720856, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 174.95387268066406, | |
| "epoch": 98.0, | |
| "grad_norm": 1.3127977169467409, | |
| "kl": 0.00787353515625, | |
| "learning_rate": 3.141245779747502e-07, | |
| "loss": 0.0, | |
| "reward": 1.6520834565162659, | |
| "reward_std": 0.4045562893152237, | |
| "rewards/answer_reward_func": 0.7250000536441803, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 171.74703216552734, | |
| "epoch": 99.0, | |
| "grad_norm": 1.187680476036088, | |
| "kl": 0.007598876953125, | |
| "learning_rate": 3.106358271275056e-07, | |
| "loss": 0.0, | |
| "reward": 1.756250023841858, | |
| "reward_std": 0.37892505526542664, | |
| "rewards/answer_reward_func": 0.8187499940395355, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 250.858642578125, | |
| "epoch": 100.0, | |
| "grad_norm": 0.9052133122410684, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 3.0713448387917227e-07, | |
| "loss": 0.0, | |
| "reward": 1.681249976158142, | |
| "reward_std": 0.4449262470006943, | |
| "rewards/answer_reward_func": 0.7750000357627869, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 163.5416717529297, | |
| "epoch": 101.0, | |
| "grad_norm": 0.8373387718652715, | |
| "kl": 0.0063018798828125, | |
| "learning_rate": 3.0362127536287636e-07, | |
| "loss": 0.0, | |
| "reward": 1.652083396911621, | |
| "reward_std": 0.46857765316963196, | |
| "rewards/answer_reward_func": 0.7354167401790619, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 246.1131134033203, | |
| "epoch": 102.0, | |
| "grad_norm": 0.7374900854502903, | |
| "kl": 0.006195068359375, | |
| "learning_rate": 3.0009693117583523e-07, | |
| "loss": 0.0, | |
| "reward": 1.6479167938232422, | |
| "reward_std": 0.5083845257759094, | |
| "rewards/answer_reward_func": 0.762499988079071, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 231.79911041259766, | |
| "epoch": 103.0, | |
| "grad_norm": 1.180696672190672, | |
| "kl": 0.010528564453125, | |
| "learning_rate": 2.965621832278401e-07, | |
| "loss": 0.0, | |
| "reward": 1.6229166388511658, | |
| "reward_std": 0.6331139504909515, | |
| "rewards/answer_reward_func": 0.7479166984558105, | |
| "rewards/format_reward_func": 0.875, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 237.18602752685547, | |
| "epoch": 104.0, | |
| "grad_norm": 1.0783781256408975, | |
| "kl": 0.0068817138671875, | |
| "learning_rate": 2.9301776558925875e-07, | |
| "loss": 0.0, | |
| "reward": 1.6979168057441711, | |
| "reward_std": 0.3851548731327057, | |
| "rewards/answer_reward_func": 0.7604166865348816, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 194.03274536132812, | |
| "epoch": 105.0, | |
| "grad_norm": 1.3242781951977625, | |
| "kl": 0.008575439453125, | |
| "learning_rate": 2.894644143385885e-07, | |
| "loss": 0.0, | |
| "reward": 1.7208333611488342, | |
| "reward_std": 0.3679712861776352, | |
| "rewards/answer_reward_func": 0.7729166746139526, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 254.84375762939453, | |
| "epoch": 106.0, | |
| "grad_norm": 1.2562417854229282, | |
| "kl": 0.0070953369140625, | |
| "learning_rate": 2.859028674095937e-07, | |
| "loss": 0.0, | |
| "reward": 1.5958333015441895, | |
| "reward_std": 0.36180783808231354, | |
| "rewards/answer_reward_func": 0.6687500178813934, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 220.92262268066406, | |
| "epoch": 107.0, | |
| "grad_norm": 1.1976305491708217, | |
| "kl": 0.0076141357421875, | |
| "learning_rate": 2.823338644380566e-07, | |
| "loss": 0.0, | |
| "reward": 1.5916667580604553, | |
| "reward_std": 0.4970894306898117, | |
| "rewards/answer_reward_func": 0.6854166984558105, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 230.11607360839844, | |
| "epoch": 108.0, | |
| "grad_norm": 1.2521189573673432, | |
| "kl": 0.011016845703125, | |
| "learning_rate": 2.7875814660817504e-07, | |
| "loss": 0.0, | |
| "reward": 1.6437500715255737, | |
| "reward_std": 0.35511599481105804, | |
| "rewards/answer_reward_func": 0.6958333849906921, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 257.1294708251953, | |
| "epoch": 109.0, | |
| "grad_norm": 0.8840586512014985, | |
| "kl": 0.0065155029296875, | |
| "learning_rate": 2.751764564986396e-07, | |
| "loss": 0.0, | |
| "reward": 1.7041666507720947, | |
| "reward_std": 0.4368957430124283, | |
| "rewards/answer_reward_func": 0.7874999940395355, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 180.08036041259766, | |
| "epoch": 110.0, | |
| "grad_norm": 1.3367177987973136, | |
| "kl": 0.008544921875, | |
| "learning_rate": 2.715895379284194e-07, | |
| "loss": 0.0, | |
| "reward": 1.7208334803581238, | |
| "reward_std": 0.3184947222471237, | |
| "rewards/answer_reward_func": 0.7833333611488342, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 174.14583587646484, | |
| "epoch": 111.0, | |
| "grad_norm": 0.9864081299854107, | |
| "kl": 0.009552001953125, | |
| "learning_rate": 2.6799813580229174e-07, | |
| "loss": 0.0, | |
| "reward": 1.6166667938232422, | |
| "reward_std": 0.45926420390605927, | |
| "rewards/answer_reward_func": 0.7104166746139526, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 277.2693634033203, | |
| "epoch": 112.0, | |
| "grad_norm": 0.9956535351844211, | |
| "kl": 0.0081634521484375, | |
| "learning_rate": 2.6440299595614606e-07, | |
| "loss": 0.0, | |
| "reward": 1.675000011920929, | |
| "reward_std": 0.4140470027923584, | |
| "rewards/answer_reward_func": 0.7583333551883698, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 212.07292938232422, | |
| "epoch": 113.0, | |
| "grad_norm": 1.240347599024976, | |
| "kl": 0.006927490234375, | |
| "learning_rate": 2.6080486500209347e-07, | |
| "loss": 0.0, | |
| "reward": 1.6791666746139526, | |
| "reward_std": 0.4041960537433624, | |
| "rewards/answer_reward_func": 0.7729166448116302, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 211.12649536132812, | |
| "epoch": 114.0, | |
| "grad_norm": 1.0191986624515685, | |
| "kl": 0.007720947265625, | |
| "learning_rate": 2.572044901734166e-07, | |
| "loss": 0.0, | |
| "reward": 1.6395833492279053, | |
| "reward_std": 0.5253069698810577, | |
| "rewards/answer_reward_func": 0.7541667222976685, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 253.59524536132812, | |
| "epoch": 115.0, | |
| "grad_norm": 1.1187509727468907, | |
| "kl": 0.0093994140625, | |
| "learning_rate": 2.536026191693893e-07, | |
| "loss": 0.0, | |
| "reward": 1.6500001549720764, | |
| "reward_std": 0.5131012797355652, | |
| "rewards/answer_reward_func": 0.7437500357627869, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 252.3601303100586, | |
| "epoch": 116.0, | |
| "grad_norm": 0.8071311834392373, | |
| "kl": 0.009674072265625, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0, | |
| "reward": 1.6562501192092896, | |
| "reward_std": 0.48235540091991425, | |
| "rewards/answer_reward_func": 0.7708334028720856, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 233.3214340209961, | |
| "epoch": 117.0, | |
| "grad_norm": 1.1656552755712852, | |
| "kl": 0.009185791015625, | |
| "learning_rate": 2.4639738083061073e-07, | |
| "loss": 0.0, | |
| "reward": 1.7062499523162842, | |
| "reward_std": 0.4089523106813431, | |
| "rewards/answer_reward_func": 0.7687500417232513, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 208.43899536132812, | |
| "epoch": 118.0, | |
| "grad_norm": 1.1265120620864038, | |
| "kl": 0.0083465576171875, | |
| "learning_rate": 2.4279550982658345e-07, | |
| "loss": 0.0, | |
| "reward": 1.6166667342185974, | |
| "reward_std": 0.46437712013721466, | |
| "rewards/answer_reward_func": 0.7104166448116302, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 224.84524536132812, | |
| "epoch": 119.0, | |
| "grad_norm": 1.0636630331845314, | |
| "kl": 0.0076141357421875, | |
| "learning_rate": 2.3919513499790646e-07, | |
| "loss": 0.0, | |
| "reward": 1.6333333253860474, | |
| "reward_std": 0.47814565896987915, | |
| "rewards/answer_reward_func": 0.7270833849906921, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 247.66221618652344, | |
| "epoch": 120.0, | |
| "grad_norm": 1.113225205494867, | |
| "kl": 0.009002685546875, | |
| "learning_rate": 2.3559700404385394e-07, | |
| "loss": 0.0, | |
| "reward": 1.6958333253860474, | |
| "reward_std": 0.4592965245246887, | |
| "rewards/answer_reward_func": 0.768750011920929, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 237.17857360839844, | |
| "epoch": 121.0, | |
| "grad_norm": 1.077870033372934, | |
| "kl": 0.006683349609375, | |
| "learning_rate": 2.3200186419770823e-07, | |
| "loss": 0.0, | |
| "reward": 1.5104167461395264, | |
| "reward_std": 0.5212539583444595, | |
| "rewards/answer_reward_func": 0.6562500298023224, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 219.99553680419922, | |
| "epoch": 122.0, | |
| "grad_norm": 1.1297698266097218, | |
| "kl": 0.009765625, | |
| "learning_rate": 2.284104620715807e-07, | |
| "loss": 0.0, | |
| "reward": 1.6541667580604553, | |
| "reward_std": 0.48302122950553894, | |
| "rewards/answer_reward_func": 0.737500011920929, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 184.85714721679688, | |
| "epoch": 123.0, | |
| "grad_norm": 1.01567299552301, | |
| "kl": 0.012451171875, | |
| "learning_rate": 2.2482354350136043e-07, | |
| "loss": 0.0, | |
| "reward": 1.6791667938232422, | |
| "reward_std": 0.4051380306482315, | |
| "rewards/answer_reward_func": 0.7729167342185974, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 204.14137268066406, | |
| "epoch": 124.0, | |
| "grad_norm": 1.1409009201232085, | |
| "kl": 0.0103759765625, | |
| "learning_rate": 2.2124185339182496e-07, | |
| "loss": 0.0, | |
| "reward": 1.5937501192092896, | |
| "reward_std": 0.488675519824028, | |
| "rewards/answer_reward_func": 0.6979166567325592, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 228.09524536132812, | |
| "epoch": 125.0, | |
| "grad_norm": 1.1428973558078321, | |
| "kl": 0.0098876953125, | |
| "learning_rate": 2.1766613556194344e-07, | |
| "loss": 0.0, | |
| "reward": 1.6812500953674316, | |
| "reward_std": 0.4293702244758606, | |
| "rewards/answer_reward_func": 0.7645833194255829, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 265.23958587646484, | |
| "epoch": 126.0, | |
| "grad_norm": 0.9689867414739515, | |
| "kl": 0.010528564453125, | |
| "learning_rate": 2.1409713259040628e-07, | |
| "loss": 0.0, | |
| "reward": 1.6958333849906921, | |
| "reward_std": 0.5003866702318192, | |
| "rewards/answer_reward_func": 0.7895833551883698, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 238.41815948486328, | |
| "epoch": 127.0, | |
| "grad_norm": 1.2313155770579827, | |
| "kl": 0.010711669921875, | |
| "learning_rate": 2.105355856614115e-07, | |
| "loss": 0.0, | |
| "reward": 1.6770833730697632, | |
| "reward_std": 0.47428610920906067, | |
| "rewards/answer_reward_func": 0.7500000298023224, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 232.48661041259766, | |
| "epoch": 128.0, | |
| "grad_norm": 0.9110185027634778, | |
| "kl": 0.008453369140625, | |
| "learning_rate": 2.069822344107413e-07, | |
| "loss": 0.0, | |
| "reward": 1.6916666626930237, | |
| "reward_std": 0.4168810397386551, | |
| "rewards/answer_reward_func": 0.7750000357627869, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 241.36458587646484, | |
| "epoch": 129.0, | |
| "grad_norm": 1.2821986198667763, | |
| "kl": 0.01025390625, | |
| "learning_rate": 2.034378167721599e-07, | |
| "loss": 0.0, | |
| "reward": 1.7645832896232605, | |
| "reward_std": 0.30109934508800507, | |
| "rewards/answer_reward_func": 0.8166667222976685, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 225.42857360839844, | |
| "epoch": 130.0, | |
| "grad_norm": 0.943480606859016, | |
| "kl": 0.01080322265625, | |
| "learning_rate": 1.9990306882416485e-07, | |
| "loss": 0.0, | |
| "reward": 1.683333396911621, | |
| "reward_std": 0.49519842863082886, | |
| "rewards/answer_reward_func": 0.7770833671092987, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 231.6636962890625, | |
| "epoch": 131.0, | |
| "grad_norm": 0.9562007922521701, | |
| "kl": 0.012542724609375, | |
| "learning_rate": 1.9637872463712362e-07, | |
| "loss": 0.0, | |
| "reward": 1.6104167103767395, | |
| "reward_std": 0.5748671591281891, | |
| "rewards/answer_reward_func": 0.7354167103767395, | |
| "rewards/format_reward_func": 0.8750000298023224, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 276.0327453613281, | |
| "epoch": 132.0, | |
| "grad_norm": 0.9641627688993039, | |
| "kl": 0.009033203125, | |
| "learning_rate": 1.9286551612082773e-07, | |
| "loss": 0.0, | |
| "reward": 1.6375000476837158, | |
| "reward_std": 0.4231380522251129, | |
| "rewards/answer_reward_func": 0.7208333611488342, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 256.84078216552734, | |
| "epoch": 133.0, | |
| "grad_norm": 1.0911968919904242, | |
| "kl": 0.009490966796875, | |
| "learning_rate": 1.8936417287249446e-07, | |
| "loss": 0.0, | |
| "reward": 1.6666666269302368, | |
| "reward_std": 0.4258534461259842, | |
| "rewards/answer_reward_func": 0.729166716337204, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 250.56102752685547, | |
| "epoch": 134.0, | |
| "grad_norm": 1.2221245515880175, | |
| "kl": 0.009674072265625, | |
| "learning_rate": 1.8587542202524985e-07, | |
| "loss": 0.0, | |
| "reward": 1.524999976158142, | |
| "reward_std": 0.5299902558326721, | |
| "rewards/answer_reward_func": 0.6395833790302277, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 196.33185577392578, | |
| "epoch": 135.0, | |
| "grad_norm": 1.2840402345310011, | |
| "kl": 0.013519287109375, | |
| "learning_rate": 1.82399988097123e-07, | |
| "loss": 0.0, | |
| "reward": 1.7187500596046448, | |
| "reward_std": 0.3355311304330826, | |
| "rewards/answer_reward_func": 0.78125, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 203.56101989746094, | |
| "epoch": 136.0, | |
| "grad_norm": 1.0538807147223688, | |
| "kl": 0.0118408203125, | |
| "learning_rate": 1.7893859284058378e-07, | |
| "loss": 0.0, | |
| "reward": 1.8416666388511658, | |
| "reward_std": 0.21030117571353912, | |
| "rewards/answer_reward_func": 0.8625000715255737, | |
| "rewards/format_reward_func": 0.9791666865348816, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 254.30953979492188, | |
| "epoch": 137.0, | |
| "grad_norm": 0.9659537232220188, | |
| "kl": 0.011077880859375, | |
| "learning_rate": 1.7549195509265407e-07, | |
| "loss": 0.0, | |
| "reward": 1.6791666746139526, | |
| "reward_std": 0.5740651041269302, | |
| "rewards/answer_reward_func": 0.7937500178813934, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 199.11905670166016, | |
| "epoch": 138.0, | |
| "grad_norm": 1.0523104695752745, | |
| "kl": 0.012451171875, | |
| "learning_rate": 1.7206079062562536e-07, | |
| "loss": 0.0, | |
| "reward": 1.6354166865348816, | |
| "reward_std": 0.45953696966171265, | |
| "rewards/answer_reward_func": 0.7187500298023224, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 253.00596618652344, | |
| "epoch": 139.0, | |
| "grad_norm": 1.2749421525205042, | |
| "kl": 0.01361083984375, | |
| "learning_rate": 1.6864581199841226e-07, | |
| "loss": 0.0, | |
| "reward": 1.5812500715255737, | |
| "reward_std": 0.4656961262226105, | |
| "rewards/answer_reward_func": 0.6749999821186066, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 210.05209350585938, | |
| "epoch": 140.0, | |
| "grad_norm": 0.9197691486950679, | |
| "kl": 0.01361083984375, | |
| "learning_rate": 1.6524772840857388e-07, | |
| "loss": 0.0, | |
| "reward": 1.7916668057441711, | |
| "reward_std": 0.28819574415683746, | |
| "rewards/answer_reward_func": 0.8333334028720856, | |
| "rewards/format_reward_func": 0.9583333432674408, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 225.55357360839844, | |
| "epoch": 141.0, | |
| "grad_norm": 1.0481444334257914, | |
| "kl": 0.0125732421875, | |
| "learning_rate": 1.6186724554503237e-07, | |
| "loss": 0.0, | |
| "reward": 1.5562500953674316, | |
| "reward_std": 0.58814936876297, | |
| "rewards/answer_reward_func": 0.6916666924953461, | |
| "rewards/format_reward_func": 0.8645833432674408, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 220.72024536132812, | |
| "epoch": 142.0, | |
| "grad_norm": 0.8606270979580886, | |
| "kl": 0.012176513671875, | |
| "learning_rate": 1.5850506544152103e-07, | |
| "loss": 0.0, | |
| "reward": 1.7083333134651184, | |
| "reward_std": 0.4496448040008545, | |
| "rewards/answer_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 243.38245391845703, | |
| "epoch": 143.0, | |
| "grad_norm": 1.0698695536011837, | |
| "kl": 0.0106201171875, | |
| "learning_rate": 1.5516188633079107e-07, | |
| "loss": 0.0, | |
| "reward": 1.6020833253860474, | |
| "reward_std": 0.4612347036600113, | |
| "rewards/answer_reward_func": 0.6958333849906921, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 227.4419708251953, | |
| "epoch": 144.0, | |
| "grad_norm": 0.7884961138356886, | |
| "kl": 0.011383056640625, | |
| "learning_rate": 1.5183840249960784e-07, | |
| "loss": 0.0, | |
| "reward": 1.6104166507720947, | |
| "reward_std": 0.4611455947160721, | |
| "rewards/answer_reward_func": 0.7145833671092987, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 231.55506896972656, | |
| "epoch": 145.0, | |
| "grad_norm": 0.8361046861951098, | |
| "kl": 0.011993408203125, | |
| "learning_rate": 1.4853530414456612e-07, | |
| "loss": 0.0, | |
| "reward": 1.6041667461395264, | |
| "reward_std": 0.4646473228931427, | |
| "rewards/answer_reward_func": 0.697916716337204, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 228.49405670166016, | |
| "epoch": 146.0, | |
| "grad_norm": 0.8068563262758177, | |
| "kl": 0.0104217529296875, | |
| "learning_rate": 1.4525327722875568e-07, | |
| "loss": 0.0, | |
| "reward": 1.683333396911621, | |
| "reward_std": 0.47417452931404114, | |
| "rewards/answer_reward_func": 0.7770833373069763, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 194.96875762939453, | |
| "epoch": 147.0, | |
| "grad_norm": 1.1213564709454569, | |
| "kl": 0.013824462890625, | |
| "learning_rate": 1.4199300333930515e-07, | |
| "loss": 0.0, | |
| "reward": 1.7062500715255737, | |
| "reward_std": 0.3444534093141556, | |
| "rewards/answer_reward_func": 0.7687500417232513, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 244.4166717529297, | |
| "epoch": 148.0, | |
| "grad_norm": 0.7888089583197829, | |
| "kl": 0.01177978515625, | |
| "learning_rate": 1.3875515954583523e-07, | |
| "loss": 0.0, | |
| "reward": 1.756250023841858, | |
| "reward_std": 0.3626406341791153, | |
| "rewards/answer_reward_func": 0.8083333671092987, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 199.47173309326172, | |
| "epoch": 149.0, | |
| "grad_norm": 1.5123863638798163, | |
| "kl": 0.01519775390625, | |
| "learning_rate": 1.3554041825985e-07, | |
| "loss": 0.0, | |
| "reward": 1.7041667103767395, | |
| "reward_std": 0.3532189428806305, | |
| "rewards/answer_reward_func": 0.7666667103767395, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 239.6413803100586, | |
| "epoch": 150.0, | |
| "grad_norm": 0.9743053302752174, | |
| "kl": 0.011260986328125, | |
| "learning_rate": 1.323494470950949e-07, | |
| "loss": 0.0, | |
| "reward": 1.67083340883255, | |
| "reward_std": 0.43112409114837646, | |
| "rewards/answer_reward_func": 0.7541666924953461, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 264.875, | |
| "epoch": 151.0, | |
| "grad_norm": 1.084558861013316, | |
| "kl": 0.010162353515625, | |
| "learning_rate": 1.2918290872891236e-07, | |
| "loss": 0.0, | |
| "reward": 1.5729166865348816, | |
| "reward_std": 0.3881361186504364, | |
| "rewards/answer_reward_func": 0.6458333432674408, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 166.75893020629883, | |
| "epoch": 152.0, | |
| "grad_norm": 0.9178971871400314, | |
| "kl": 0.013824462890625, | |
| "learning_rate": 1.260414607646213e-07, | |
| "loss": 0.0, | |
| "reward": 1.6708332896232605, | |
| "reward_std": 0.48874886333942413, | |
| "rewards/answer_reward_func": 0.7645833492279053, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 229.1294708251953, | |
| "epoch": 153.0, | |
| "grad_norm": 0.9051272338707157, | |
| "kl": 0.012451171875, | |
| "learning_rate": 1.2292575559495143e-07, | |
| "loss": 0.0, | |
| "reward": 1.4937500357627869, | |
| "reward_std": 0.5545496791601181, | |
| "rewards/answer_reward_func": 0.6291666924953461, | |
| "rewards/format_reward_func": 0.8645833432674408, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 205.56548309326172, | |
| "epoch": 154.0, | |
| "grad_norm": 1.1418844839154754, | |
| "kl": 0.012786865234375, | |
| "learning_rate": 1.1983644026655835e-07, | |
| "loss": 0.0, | |
| "reward": 1.5958333611488342, | |
| "reward_std": 0.39635278284549713, | |
| "rewards/answer_reward_func": 0.6687500476837158, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 259.0446472167969, | |
| "epoch": 155.0, | |
| "grad_norm": 0.9270214488759818, | |
| "kl": 0.01220703125, | |
| "learning_rate": 1.1677415634565066e-07, | |
| "loss": 0.0, | |
| "reward": 1.6541667580604553, | |
| "reward_std": 0.5274697840213776, | |
| "rewards/answer_reward_func": 0.768750011920929, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 187.2574462890625, | |
| "epoch": 156.0, | |
| "grad_norm": 1.406411477220287, | |
| "kl": 0.015869140625, | |
| "learning_rate": 1.1373953978475353e-07, | |
| "loss": 0.0, | |
| "reward": 1.6708332896232605, | |
| "reward_std": 0.4821990430355072, | |
| "rewards/answer_reward_func": 0.7645833790302277, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 276.9166717529297, | |
| "epoch": 157.0, | |
| "grad_norm": 0.7990683879825865, | |
| "kl": 0.01318359375, | |
| "learning_rate": 1.1073322079063913e-07, | |
| "loss": 0.0, | |
| "reward": 1.620833396911621, | |
| "reward_std": 0.5315083116292953, | |
| "rewards/answer_reward_func": 0.7458333373069763, | |
| "rewards/format_reward_func": 0.875, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 257.94495391845703, | |
| "epoch": 158.0, | |
| "grad_norm": 1.1464111626599285, | |
| "kl": 0.0115966796875, | |
| "learning_rate": 1.0775582369344946e-07, | |
| "loss": 0.0, | |
| "reward": 1.5750000476837158, | |
| "reward_std": 0.4518410414457321, | |
| "rewards/answer_reward_func": 0.679166704416275, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 197.84375, | |
| "epoch": 159.0, | |
| "grad_norm": 0.9180762035814116, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 1.0480796681704077e-07, | |
| "loss": 0.0, | |
| "reward": 1.6875, | |
| "reward_std": 0.5340849161148071, | |
| "rewards/answer_reward_func": 0.7916666567325592, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 214.01339721679688, | |
| "epoch": 160.0, | |
| "grad_norm": 0.9726223680985641, | |
| "kl": 0.012664794921875, | |
| "learning_rate": 1.018902623505741e-07, | |
| "loss": 0.0, | |
| "reward": 1.7437500953674316, | |
| "reward_std": 0.38085436820983887, | |
| "rewards/answer_reward_func": 0.7958333790302277, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 238.88690948486328, | |
| "epoch": 161.0, | |
| "grad_norm": 1.2613531043331283, | |
| "kl": 0.013427734375, | |
| "learning_rate": 9.900331622138063e-08, | |
| "loss": 0.0, | |
| "reward": 1.6770833730697632, | |
| "reward_std": 0.40811336040496826, | |
| "rewards/answer_reward_func": 0.7395834028720856, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 246.1726303100586, | |
| "epoch": 162.0, | |
| "grad_norm": 0.9813919388988234, | |
| "kl": 0.01416015625, | |
| "learning_rate": 9.614772796912681e-08, | |
| "loss": 0.0, | |
| "reward": 1.6854166388511658, | |
| "reward_std": 0.3813091516494751, | |
| "rewards/answer_reward_func": 0.737500011920929, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 279.3869094848633, | |
| "epoch": 163.0, | |
| "grad_norm": 0.9383720906424019, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 9.332409062130686e-08, | |
| "loss": 0.0, | |
| "reward": 1.6874999403953552, | |
| "reward_std": 0.3446161448955536, | |
| "rewards/answer_reward_func": 0.7395833432674408, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 225.8199462890625, | |
| "epoch": 164.0, | |
| "grad_norm": 1.2329291257068657, | |
| "kl": 0.013763427734375, | |
| "learning_rate": 9.053299057008699e-08, | |
| "loss": 0.0, | |
| "reward": 1.6145833730697632, | |
| "reward_std": 0.4202453941106796, | |
| "rewards/answer_reward_func": 0.7187500298023224, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 235.64584350585938, | |
| "epoch": 165.0, | |
| "grad_norm": 1.0271287801634421, | |
| "kl": 0.012237548828125, | |
| "learning_rate": 8.777500745052743e-08, | |
| "loss": 0.0, | |
| "reward": 1.7166666984558105, | |
| "reward_std": 0.43931248784065247, | |
| "rewards/answer_reward_func": 0.8000000417232513, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 232.68006896972656, | |
| "epoch": 166.0, | |
| "grad_norm": 0.915865026213559, | |
| "kl": 0.012603759765625, | |
| "learning_rate": 8.505071402020892e-08, | |
| "loss": 0.0, | |
| "reward": 1.6874999403953552, | |
| "reward_std": 0.5371277630329132, | |
| "rewards/answer_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 217.71131896972656, | |
| "epoch": 167.0, | |
| "grad_norm": 0.9120790002082919, | |
| "kl": 0.013275146484375, | |
| "learning_rate": 8.236067604028562e-08, | |
| "loss": 0.0, | |
| "reward": 1.675000011920929, | |
| "reward_std": 0.563975989818573, | |
| "rewards/answer_reward_func": 0.7895833849906921, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 196.93899536132812, | |
| "epoch": 168.0, | |
| "grad_norm": 1.1307071803466724, | |
| "kl": 0.015533447265625, | |
| "learning_rate": 7.970545215799327e-08, | |
| "loss": 0.0, | |
| "reward": 1.6979166865348816, | |
| "reward_std": 0.45206770300865173, | |
| "rewards/answer_reward_func": 0.7812500596046448, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 211.1324462890625, | |
| "epoch": 169.0, | |
| "grad_norm": 1.0318110502343532, | |
| "kl": 0.014373779296875, | |
| "learning_rate": 7.708559379063204e-08, | |
| "loss": 0.0, | |
| "reward": 1.6083333492279053, | |
| "reward_std": 0.4994523823261261, | |
| "rewards/answer_reward_func": 0.7229166924953461, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 249.483642578125, | |
| "epoch": 170.0, | |
| "grad_norm": 1.0228436471160482, | |
| "kl": 0.01397705078125, | |
| "learning_rate": 7.45016450110534e-08, | |
| "loss": 0.0, | |
| "reward": 1.725000023841858, | |
| "reward_std": 0.4635240435600281, | |
| "rewards/answer_reward_func": 0.8083333671092987, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 181.33928680419922, | |
| "epoch": 171.0, | |
| "grad_norm": 0.9237959973304003, | |
| "kl": 0.013946533203125, | |
| "learning_rate": 7.195414243467029e-08, | |
| "loss": 0.0, | |
| "reward": 1.6895833015441895, | |
| "reward_std": 0.5353528708219528, | |
| "rewards/answer_reward_func": 0.7937500476837158, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 197.4136962890625, | |
| "epoch": 172.0, | |
| "grad_norm": 1.0940015327651587, | |
| "kl": 0.013580322265625, | |
| "learning_rate": 6.944361510801763e-08, | |
| "loss": 0.0, | |
| "reward": 1.8708333373069763, | |
| "reward_std": 0.22116978466510773, | |
| "rewards/answer_reward_func": 0.8916666805744171, | |
| "rewards/format_reward_func": 0.9791666865348816, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 270.08929443359375, | |
| "epoch": 173.0, | |
| "grad_norm": 0.8732317923979456, | |
| "kl": 0.0106201171875, | |
| "learning_rate": 6.697058439888283e-08, | |
| "loss": 0.0, | |
| "reward": 1.6125000715255737, | |
| "reward_std": 0.46661631762981415, | |
| "rewards/answer_reward_func": 0.6958333253860474, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 203.64435577392578, | |
| "epoch": 174.0, | |
| "grad_norm": 1.0885104682621713, | |
| "kl": 0.0135498046875, | |
| "learning_rate": 6.453556388803288e-08, | |
| "loss": 0.0, | |
| "reward": 1.600000023841858, | |
| "reward_std": 0.5343074351549149, | |
| "rewards/answer_reward_func": 0.7041667103767395, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 211.68155670166016, | |
| "epoch": 175.0, | |
| "grad_norm": 1.4782213044122223, | |
| "kl": 0.01861572265625, | |
| "learning_rate": 6.213905926255697e-08, | |
| "loss": 0.0, | |
| "reward": 1.7291668057441711, | |
| "reward_std": 0.3893900662660599, | |
| "rewards/answer_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 156.66071701049805, | |
| "epoch": 176.0, | |
| "grad_norm": 0.9393862503631087, | |
| "kl": 0.013519287109375, | |
| "learning_rate": 5.978156821084987e-08, | |
| "loss": 0.0, | |
| "reward": 1.5104167461395264, | |
| "reward_std": 0.479897677898407, | |
| "rewards/answer_reward_func": 0.6145833432674408, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 199.02828216552734, | |
| "epoch": 177.0, | |
| "grad_norm": 1.3069044202145068, | |
| "kl": 0.01422119140625, | |
| "learning_rate": 5.7463580319254853e-08, | |
| "loss": 0.0, | |
| "reward": 1.6895833015441895, | |
| "reward_std": 0.38866522908210754, | |
| "rewards/answer_reward_func": 0.762499988079071, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 189.39286041259766, | |
| "epoch": 178.0, | |
| "grad_norm": 1.0161007756312446, | |
| "kl": 0.015289306640625, | |
| "learning_rate": 5.518557697039081e-08, | |
| "loss": 0.0, | |
| "reward": 1.706250011920929, | |
| "reward_std": 0.34542496502399445, | |
| "rewards/answer_reward_func": 0.7583333849906921, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 224.97024536132812, | |
| "epoch": 179.0, | |
| "grad_norm": 1.311399818971272, | |
| "kl": 0.01507568359375, | |
| "learning_rate": 5.294803124318145e-08, | |
| "loss": 0.0, | |
| "reward": 1.6791666746139526, | |
| "reward_std": 0.4293544441461563, | |
| "rewards/answer_reward_func": 0.7520833015441895, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 222.889892578125, | |
| "epoch": 180.0, | |
| "grad_norm": 1.1475919639221568, | |
| "kl": 0.013427734375, | |
| "learning_rate": 5.07514078146106e-08, | |
| "loss": 0.0, | |
| "reward": 1.5791667699813843, | |
| "reward_std": 0.47242018580436707, | |
| "rewards/answer_reward_func": 0.6833333373069763, | |
| "rewards/format_reward_func": 0.8958333730697632, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 178.65625, | |
| "epoch": 181.0, | |
| "grad_norm": 1.0645169958717802, | |
| "kl": 0.01220703125, | |
| "learning_rate": 4.859616286322094e-08, | |
| "loss": 0.0, | |
| "reward": 1.7187500596046448, | |
| "reward_std": 0.5216688811779022, | |
| "rewards/answer_reward_func": 0.8020833432674408, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 215.66667938232422, | |
| "epoch": 182.0, | |
| "grad_norm": 0.7634779125291854, | |
| "kl": 0.01507568359375, | |
| "learning_rate": 4.648274397437829e-08, | |
| "loss": 0.0, | |
| "reward": 1.6770833730697632, | |
| "reward_std": 0.45663949847221375, | |
| "rewards/answer_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 255.3482208251953, | |
| "epoch": 183.0, | |
| "grad_norm": 1.1451895327728272, | |
| "kl": 0.01422119140625, | |
| "learning_rate": 4.4411590047320617e-08, | |
| "loss": 0.0, | |
| "reward": 1.7062499523162842, | |
| "reward_std": 0.3319532126188278, | |
| "rewards/answer_reward_func": 0.7583333849906921, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 175.7872085571289, | |
| "epoch": 184.0, | |
| "grad_norm": 1.138275541917393, | |
| "kl": 0.01885986328125, | |
| "learning_rate": 4.2383131204010494e-08, | |
| "loss": 0.0, | |
| "reward": 1.5458333492279053, | |
| "reward_std": 0.5794899761676788, | |
| "rewards/answer_reward_func": 0.6812499761581421, | |
| "rewards/format_reward_func": 0.8645833730697632, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 220.264892578125, | |
| "epoch": 185.0, | |
| "grad_norm": 1.2941468258226874, | |
| "kl": 0.01434326171875, | |
| "learning_rate": 4.039778869981064e-08, | |
| "loss": 0.0, | |
| "reward": 1.6541666388511658, | |
| "reward_std": 0.3757530450820923, | |
| "rewards/answer_reward_func": 0.7062500417232513, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 180.15625762939453, | |
| "epoch": 186.0, | |
| "grad_norm": 1.1191534930075977, | |
| "kl": 0.014617919921875, | |
| "learning_rate": 3.845597483600049e-08, | |
| "loss": 0.0, | |
| "reward": 1.6958333849906921, | |
| "reward_std": 0.42705224454402924, | |
| "rewards/answer_reward_func": 0.768750011920929, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 193.14732360839844, | |
| "epoch": 187.0, | |
| "grad_norm": 1.5659699092387518, | |
| "kl": 0.015594482421875, | |
| "learning_rate": 3.655809287415284e-08, | |
| "loss": 0.0, | |
| "reward": 1.6958333253860474, | |
| "reward_std": 0.3121452108025551, | |
| "rewards/answer_reward_func": 0.7375000417232513, | |
| "rewards/format_reward_func": 0.9583333432674408, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 175.52828216552734, | |
| "epoch": 188.0, | |
| "grad_norm": 1.0098836488478027, | |
| "kl": 0.0152587890625, | |
| "learning_rate": 3.4704536952387285e-08, | |
| "loss": 0.0, | |
| "reward": 1.754166603088379, | |
| "reward_std": 0.35266736149787903, | |
| "rewards/answer_reward_func": 0.8062500059604645, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 264.1756134033203, | |
| "epoch": 189.0, | |
| "grad_norm": 0.9574953446250056, | |
| "kl": 0.0113525390625, | |
| "learning_rate": 3.2895692003518575e-08, | |
| "loss": 0.0, | |
| "reward": 1.7229167222976685, | |
| "reward_std": 0.3303990066051483, | |
| "rewards/answer_reward_func": 0.7750000357627869, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 193.55208587646484, | |
| "epoch": 190.0, | |
| "grad_norm": 1.238270158742224, | |
| "kl": 0.01446533203125, | |
| "learning_rate": 3.113193367511635e-08, | |
| "loss": 0.0, | |
| "reward": 1.6812500357627869, | |
| "reward_std": 0.41038842499256134, | |
| "rewards/answer_reward_func": 0.7541666924953461, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 200.19792938232422, | |
| "epoch": 191.0, | |
| "grad_norm": 0.9743369650595539, | |
| "kl": 0.0125732421875, | |
| "learning_rate": 2.9413628251493934e-08, | |
| "loss": 0.0, | |
| "reward": 1.7062499523162842, | |
| "reward_std": 0.4908797889947891, | |
| "rewards/answer_reward_func": 0.8000000715255737, | |
| "rewards/format_reward_func": 0.90625, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 170.07440948486328, | |
| "epoch": 192.0, | |
| "grad_norm": 1.3063146962963188, | |
| "kl": 0.013519287109375, | |
| "learning_rate": 2.774113257764066e-08, | |
| "loss": 0.0, | |
| "reward": 1.7333333492279053, | |
| "reward_std": 0.3345176726579666, | |
| "rewards/answer_reward_func": 0.7958333790302277, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 247.58482360839844, | |
| "epoch": 193.0, | |
| "grad_norm": 0.9984056006045743, | |
| "kl": 0.0120849609375, | |
| "learning_rate": 2.611479398511518e-08, | |
| "loss": 0.0, | |
| "reward": 1.7083333730697632, | |
| "reward_std": 0.3646298050880432, | |
| "rewards/answer_reward_func": 0.7708333730697632, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 238.2574462890625, | |
| "epoch": 194.0, | |
| "grad_norm": 1.5444205983532193, | |
| "kl": 0.013885498046875, | |
| "learning_rate": 2.4534950219914057e-08, | |
| "loss": 0.0, | |
| "reward": 1.7312500476837158, | |
| "reward_std": 0.38924433290958405, | |
| "rewards/answer_reward_func": 0.8041666448116302, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 226.72173309326172, | |
| "epoch": 195.0, | |
| "grad_norm": 0.8197579373143684, | |
| "kl": 0.009979248046875, | |
| "learning_rate": 2.300192937233128e-08, | |
| "loss": 0.0, | |
| "reward": 1.6791666746139526, | |
| "reward_std": 0.4052896499633789, | |
| "rewards/answer_reward_func": 0.7416667342185974, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 232.91964721679688, | |
| "epoch": 196.0, | |
| "grad_norm": 1.0006804609593394, | |
| "kl": 0.013702392578125, | |
| "learning_rate": 2.1516049808822935e-08, | |
| "loss": 0.0, | |
| "reward": 1.6479166746139526, | |
| "reward_std": 0.5231119990348816, | |
| "rewards/answer_reward_func": 0.7520833611488342, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 191.55803680419922, | |
| "epoch": 197.0, | |
| "grad_norm": 1.1112988487037212, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 2.007762010589098e-08, | |
| "loss": 0.0, | |
| "reward": 1.6854167580604553, | |
| "reward_std": 0.4321441501379013, | |
| "rewards/answer_reward_func": 0.7583333551883698, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 233.3482208251953, | |
| "epoch": 198.0, | |
| "grad_norm": 1.244460904016601, | |
| "kl": 0.013214111328125, | |
| "learning_rate": 1.8686938986000627e-08, | |
| "loss": 0.0, | |
| "reward": 1.6875000596046448, | |
| "reward_std": 0.4832369536161423, | |
| "rewards/answer_reward_func": 0.7604166567325592, | |
| "rewards/format_reward_func": 0.9270833730697632, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 224.22471618652344, | |
| "epoch": 199.0, | |
| "grad_norm": 1.1490919776212416, | |
| "kl": 0.0167236328125, | |
| "learning_rate": 1.734429525554365e-08, | |
| "loss": 0.0, | |
| "reward": 1.5958333611488342, | |
| "reward_std": 0.553360641002655, | |
| "rewards/answer_reward_func": 0.7104166746139526, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 255.5982208251953, | |
| "epoch": 200.0, | |
| "grad_norm": 0.9875662145593327, | |
| "kl": 0.01202392578125, | |
| "learning_rate": 1.604996774486145e-08, | |
| "loss": 0.0, | |
| "reward": 1.7104166746139526, | |
| "reward_std": 0.44989022612571716, | |
| "rewards/answer_reward_func": 0.793749988079071, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 188.99405670166016, | |
| "epoch": 201.0, | |
| "grad_norm": 0.9195215006245523, | |
| "kl": 0.012725830078125, | |
| "learning_rate": 1.4804225250339281e-08, | |
| "loss": 0.0, | |
| "reward": 1.8250000476837158, | |
| "reward_std": 0.39560186117887497, | |
| "rewards/answer_reward_func": 0.8875000476837158, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 187.25298309326172, | |
| "epoch": 202.0, | |
| "grad_norm": 1.6727983005324356, | |
| "kl": 0.01751708984375, | |
| "learning_rate": 1.360732647858498e-08, | |
| "loss": 0.0, | |
| "reward": 1.7270833849906921, | |
| "reward_std": 0.4138026833534241, | |
| "rewards/answer_reward_func": 0.7895833551883698, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 192.87798309326172, | |
| "epoch": 203.0, | |
| "grad_norm": 1.083839122653639, | |
| "kl": 0.014892578125, | |
| "learning_rate": 1.2459519992702311e-08, | |
| "loss": 0.0, | |
| "reward": 1.7666667699813843, | |
| "reward_std": 0.30420470237731934, | |
| "rewards/answer_reward_func": 0.8083333373069763, | |
| "rewards/format_reward_func": 0.9583333432674408, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 242.02084350585938, | |
| "epoch": 204.0, | |
| "grad_norm": 1.1194918345284006, | |
| "kl": 0.01318359375, | |
| "learning_rate": 1.1361044160671629e-08, | |
| "loss": 0.0, | |
| "reward": 1.6937499642372131, | |
| "reward_std": 0.3781377822160721, | |
| "rewards/answer_reward_func": 0.7666666805744171, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 215.93006896972656, | |
| "epoch": 205.0, | |
| "grad_norm": 1.1017859873234446, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 1.0312127105846947e-08, | |
| "loss": 0.0, | |
| "reward": 1.693750023841858, | |
| "reward_std": 0.48744727671146393, | |
| "rewards/answer_reward_func": 0.7770833671092987, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 186.28423309326172, | |
| "epoch": 206.0, | |
| "grad_norm": 1.0624614282595348, | |
| "kl": 0.01776123046875, | |
| "learning_rate": 9.312986659581301e-09, | |
| "loss": 0.0, | |
| "reward": 1.5895833373069763, | |
| "reward_std": 0.45519132912158966, | |
| "rewards/answer_reward_func": 0.6833333373069763, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 233.00894165039062, | |
| "epoch": 207.0, | |
| "grad_norm": 0.8851564318058255, | |
| "kl": 0.012847900390625, | |
| "learning_rate": 8.363830315988945e-09, | |
| "loss": 0.0, | |
| "reward": 1.6791667342185974, | |
| "reward_std": 0.4650019705295563, | |
| "rewards/answer_reward_func": 0.772916704416275, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 194.01190948486328, | |
| "epoch": 208.0, | |
| "grad_norm": 1.155127800385124, | |
| "kl": 0.0128173828125, | |
| "learning_rate": 7.46485518885462e-09, | |
| "loss": 0.0, | |
| "reward": 1.6687501072883606, | |
| "reward_std": 0.34395796060562134, | |
| "rewards/answer_reward_func": 0.7208333611488342, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 169.8482208251953, | |
| "epoch": 209.0, | |
| "grad_norm": 1.1199426121122114, | |
| "kl": 0.017425537109375, | |
| "learning_rate": 6.616247970698319e-09, | |
| "loss": 0.0, | |
| "reward": 1.693750023841858, | |
| "reward_std": 0.3943335711956024, | |
| "rewards/answer_reward_func": 0.7562500536441803, | |
| "rewards/format_reward_func": 0.9375, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 243.14732360839844, | |
| "epoch": 210.0, | |
| "grad_norm": 0.8877069696048447, | |
| "kl": 0.01654052734375, | |
| "learning_rate": 5.8181848940044855e-09, | |
| "loss": 0.0, | |
| "reward": 1.6104167699813843, | |
| "reward_std": 0.4525974839925766, | |
| "rewards/answer_reward_func": 0.7041666805744171, | |
| "rewards/format_reward_func": 0.9062500298023224, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 289.8839416503906, | |
| "epoch": 211.0, | |
| "grad_norm": 0.8870179616904256, | |
| "kl": 0.011932373046875, | |
| "learning_rate": 5.070831694623135e-09, | |
| "loss": 0.0, | |
| "reward": 1.5895834565162659, | |
| "reward_std": 0.5177792310714722, | |
| "rewards/answer_reward_func": 0.6937500536441803, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 151.6934585571289, | |
| "epoch": 212.0, | |
| "grad_norm": 1.1557356844907873, | |
| "kl": 0.01593017578125, | |
| "learning_rate": 4.374343577351336e-09, | |
| "loss": 0.0, | |
| "reward": 1.7083333730697632, | |
| "reward_std": 0.354248970746994, | |
| "rewards/answer_reward_func": 0.7604166865348816, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 181.48959350585938, | |
| "epoch": 213.0, | |
| "grad_norm": 0.9183490231413828, | |
| "kl": 0.015350341796875, | |
| "learning_rate": 3.7288651837012745e-09, | |
| "loss": 0.0, | |
| "reward": 1.7020833492279053, | |
| "reward_std": 0.3603343367576599, | |
| "rewards/answer_reward_func": 0.7645833790302277, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 178.89137268066406, | |
| "epoch": 214.0, | |
| "grad_norm": 1.2202651910052664, | |
| "kl": 0.0166015625, | |
| "learning_rate": 3.134530561862081e-09, | |
| "loss": 0.0, | |
| "reward": 1.6979166865348816, | |
| "reward_std": 0.45207175612449646, | |
| "rewards/answer_reward_func": 0.78125, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 256.625, | |
| "epoch": 215.0, | |
| "grad_norm": 0.9481831734670307, | |
| "kl": 0.013458251953125, | |
| "learning_rate": 2.5914631388619103e-09, | |
| "loss": 0.0, | |
| "reward": 1.7041667103767395, | |
| "reward_std": 0.3960050940513611, | |
| "rewards/answer_reward_func": 0.7666667103767395, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 250.11905670166016, | |
| "epoch": 216.0, | |
| "grad_norm": 1.037124203173195, | |
| "kl": 0.0146484375, | |
| "learning_rate": 2.0997756949353297e-09, | |
| "loss": 0.0, | |
| "reward": 1.7562499642372131, | |
| "reward_std": 0.3770013302564621, | |
| "rewards/answer_reward_func": 0.8187500536441803, | |
| "rewards/format_reward_func": 0.9375000298023224, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 255.68006896972656, | |
| "epoch": 217.0, | |
| "grad_norm": 1.1918462249746316, | |
| "kl": 0.01416015625, | |
| "learning_rate": 1.6595703401020844e-09, | |
| "loss": 0.0, | |
| "reward": 1.6354167461395264, | |
| "reward_std": 0.4074978083372116, | |
| "rewards/answer_reward_func": 0.7187500298023224, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 232.20982360839844, | |
| "epoch": 218.0, | |
| "grad_norm": 0.7812593865128343, | |
| "kl": 0.01055908203125, | |
| "learning_rate": 1.2709384929615596e-09, | |
| "loss": 0.0, | |
| "reward": 1.6208333373069763, | |
| "reward_std": 0.4629315435886383, | |
| "rewards/answer_reward_func": 0.7249999940395355, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 174.2559585571289, | |
| "epoch": 219.0, | |
| "grad_norm": 1.1014720216285023, | |
| "kl": 0.01483154296875, | |
| "learning_rate": 9.339608617077165e-10, | |
| "loss": 0.0, | |
| "reward": 1.6354166865348816, | |
| "reward_std": 0.4915326237678528, | |
| "rewards/answer_reward_func": 0.7187499701976776, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 246.4464340209961, | |
| "epoch": 220.0, | |
| "grad_norm": 1.1926753556498022, | |
| "kl": 0.014129638671875, | |
| "learning_rate": 6.487074273681114e-10, | |
| "loss": 0.0, | |
| "reward": 1.67083340883255, | |
| "reward_std": 0.32807309925556183, | |
| "rewards/answer_reward_func": 0.7437499761581421, | |
| "rewards/format_reward_func": 0.9270833432674408, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 166.12649536132812, | |
| "epoch": 221.0, | |
| "grad_norm": 1.2191019490862662, | |
| "kl": 0.016021728515625, | |
| "learning_rate": 4.152374292708538e-10, | |
| "loss": 0.0, | |
| "reward": 1.6312501430511475, | |
| "reward_std": 0.5295368134975433, | |
| "rewards/answer_reward_func": 0.7354166805744171, | |
| "rewards/format_reward_func": 0.8958333432674408, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 206.5401840209961, | |
| "epoch": 222.0, | |
| "grad_norm": 1.1066549322199848, | |
| "kl": 0.01348876953125, | |
| "learning_rate": 2.3359935274214204e-10, | |
| "loss": 0.0, | |
| "reward": 1.7000001072883606, | |
| "reward_std": 0.4501575529575348, | |
| "rewards/answer_reward_func": 0.7833333611488342, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 198.20089721679688, | |
| "epoch": 223.0, | |
| "grad_norm": 1.126642799573971, | |
| "kl": 0.01629638671875, | |
| "learning_rate": 1.0383091903720665e-10, | |
| "loss": 0.0, | |
| "reward": 1.7145833373069763, | |
| "reward_std": 0.368631511926651, | |
| "rewards/answer_reward_func": 0.7666666507720947, | |
| "rewards/format_reward_func": 0.9479166865348816, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 190.87203216552734, | |
| "epoch": 224.0, | |
| "grad_norm": 0.8176781104321668, | |
| "kl": 0.016876220703125, | |
| "learning_rate": 2.595907750671533e-11, | |
| "loss": 0.0, | |
| "reward": 1.7895833849906921, | |
| "reward_std": 0.31373436748981476, | |
| "rewards/answer_reward_func": 0.831250011920929, | |
| "rewards/format_reward_func": 0.9583333730697632, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 223.83929443359375, | |
| "epoch": 225.0, | |
| "grad_norm": 0.8752615103675867, | |
| "kl": 0.0125732421875, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 1.6062501072883606, | |
| "reward_std": 0.4855257570743561, | |
| "rewards/answer_reward_func": 0.7208333611488342, | |
| "rewards/format_reward_func": 0.8854166865348816, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 225.0, | |
| "step": 450, | |
| "total_flos": 0.0, | |
| "train_loss": 7.9592482841109e-06, | |
| "train_runtime": 6112.3984, | |
| "train_samples_per_second": 3.534, | |
| "train_steps_per_second": 0.074 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 450, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 225, | |
| "save_steps": 32, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |