cc4718's picture
Model save
9cf052e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 225.0,
"eval_steps": 500,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 276.0401916503906,
"epoch": 1.0,
"grad_norm": 1.3174133685527831,
"kl": 0.0,
"learning_rate": 7.142857142857142e-08,
"loss": -0.0,
"reward": 1.6562499403953552,
"reward_std": 0.4393078237771988,
"rewards/answer_reward_func": 0.7499999701976776,
"rewards/format_reward_func": 0.9062500298023224,
"step": 2
},
{
"completion_length": 258.4851303100586,
"epoch": 2.0,
"grad_norm": 1.1213410501938252,
"kl": 0.0002532005310058594,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0,
"reward": 1.6145833730697632,
"reward_std": 0.5048187077045441,
"rewards/answer_reward_func": 0.7395834028720856,
"rewards/format_reward_func": 0.8750000298023224,
"step": 4
},
{
"completion_length": 195.49553680419922,
"epoch": 3.0,
"grad_norm": 1.0016861118507379,
"kl": 0.00025844573974609375,
"learning_rate": 2.1428571428571426e-07,
"loss": 0.0,
"reward": 1.7062500715255737,
"reward_std": 0.35797178000211716,
"rewards/answer_reward_func": 0.768750011920929,
"rewards/format_reward_func": 0.9375000298023224,
"step": 6
},
{
"completion_length": 229.18155670166016,
"epoch": 4.0,
"grad_norm": 1.2018085544480408,
"kl": 0.0002665519714355469,
"learning_rate": 2.857142857142857e-07,
"loss": 0.0,
"reward": 1.5499999523162842,
"reward_std": 0.6482782661914825,
"rewards/answer_reward_func": 0.706250011920929,
"rewards/format_reward_func": 0.8437500298023224,
"step": 8
},
{
"completion_length": 229.14732360839844,
"epoch": 5.0,
"grad_norm": 1.0483380939796498,
"kl": 0.00025463104248046875,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.0,
"reward": 1.5416666865348816,
"reward_std": 0.5995936393737793,
"rewards/answer_reward_func": 0.6666666865348816,
"rewards/format_reward_func": 0.8750000298023224,
"step": 10
},
{
"completion_length": 264.84227752685547,
"epoch": 6.0,
"grad_norm": 0.9353624844800403,
"kl": 0.00026226043701171875,
"learning_rate": 4.285714285714285e-07,
"loss": 0.0,
"reward": 1.6062499284744263,
"reward_std": 0.5437474548816681,
"rewards/answer_reward_func": 0.7104166746139526,
"rewards/format_reward_func": 0.8958333730697632,
"step": 12
},
{
"completion_length": 250.52828216552734,
"epoch": 7.0,
"grad_norm": 0.9582379182354116,
"kl": 0.00023317337036132812,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 1.4354166388511658,
"reward_std": 0.6316270232200623,
"rewards/answer_reward_func": 0.6229166984558105,
"rewards/format_reward_func": 0.8125000298023224,
"step": 14
},
{
"completion_length": 251.17857360839844,
"epoch": 8.0,
"grad_norm": 0.9981178249237231,
"kl": 0.00026416778564453125,
"learning_rate": 4.999740409224932e-07,
"loss": 0.0,
"reward": 1.683333396911621,
"reward_std": 0.4488546848297119,
"rewards/answer_reward_func": 0.7562499940395355,
"rewards/format_reward_func": 0.9270833432674408,
"step": 16
},
{
"completion_length": 193.23214721679688,
"epoch": 9.0,
"grad_norm": 0.9392773566245655,
"kl": 0.0002613067626953125,
"learning_rate": 4.998961690809627e-07,
"loss": 0.0,
"reward": 1.6416666507720947,
"reward_std": 0.5428857207298279,
"rewards/answer_reward_func": 0.7354166507720947,
"rewards/format_reward_func": 0.9062500298023224,
"step": 18
},
{
"completion_length": 260.5014953613281,
"epoch": 10.0,
"grad_norm": 1.1286522636909715,
"kl": 0.00028705596923828125,
"learning_rate": 4.997664006472578e-07,
"loss": 0.0,
"reward": 1.4458333849906921,
"reward_std": 0.6854044795036316,
"rewards/answer_reward_func": 0.6541666686534882,
"rewards/format_reward_func": 0.7916666865348816,
"step": 20
},
{
"completion_length": 227.18006896972656,
"epoch": 11.0,
"grad_norm": 1.207761352187691,
"kl": 0.00022029876708984375,
"learning_rate": 4.995847625707292e-07,
"loss": 0.0,
"reward": 1.7687500715255737,
"reward_std": 0.3790818303823471,
"rewards/answer_reward_func": 0.8208333849906921,
"rewards/format_reward_func": 0.9479166865348816,
"step": 22
},
{
"completion_length": 286.2544708251953,
"epoch": 12.0,
"grad_norm": 0.8620527379171249,
"kl": 0.00023794174194335938,
"learning_rate": 4.993512925726318e-07,
"loss": 0.0,
"reward": 1.5958333611488342,
"reward_std": 0.5891686081886292,
"rewards/answer_reward_func": 0.7208333313465118,
"rewards/format_reward_func": 0.8750000298023224,
"step": 24
},
{
"completion_length": 257.24107360839844,
"epoch": 13.0,
"grad_norm": 1.0091569196984636,
"kl": 0.00029087066650390625,
"learning_rate": 4.990660391382923e-07,
"loss": 0.0,
"reward": 1.645833432674408,
"reward_std": 0.5208363234996796,
"rewards/answer_reward_func": 0.7500000298023224,
"rewards/format_reward_func": 0.8958333432674408,
"step": 26
},
{
"completion_length": 242.4806671142578,
"epoch": 14.0,
"grad_norm": 1.0541615732715437,
"kl": 0.000263214111328125,
"learning_rate": 4.987290615070384e-07,
"loss": 0.0,
"reward": 1.5625,
"reward_std": 0.5553774684667587,
"rewards/answer_reward_func": 0.6770833432674408,
"rewards/format_reward_func": 0.8854166865348816,
"step": 28
},
{
"completion_length": 282.1651916503906,
"epoch": 15.0,
"grad_norm": 0.9687037221617821,
"kl": 0.0002727508544921875,
"learning_rate": 4.983404296598978e-07,
"loss": 0.0,
"reward": 1.618749976158142,
"reward_std": 0.5825705528259277,
"rewards/answer_reward_func": 0.7333332896232605,
"rewards/format_reward_func": 0.8854166865348816,
"step": 30
},
{
"completion_length": 296.96429443359375,
"epoch": 16.0,
"grad_norm": 0.9052560635102206,
"kl": 0.00027942657470703125,
"learning_rate": 4.979002243050646e-07,
"loss": 0.0,
"reward": 1.6041666865348816,
"reward_std": 0.5438288599252701,
"rewards/answer_reward_func": 0.7187500596046448,
"rewards/format_reward_func": 0.8854166865348816,
"step": 32
},
{
"completion_length": 252.49703216552734,
"epoch": 17.0,
"grad_norm": 1.2903003085293443,
"kl": 0.000274658203125,
"learning_rate": 4.974085368611381e-07,
"loss": 0.0,
"reward": 1.4916666746139526,
"reward_std": 0.6277603805065155,
"rewards/answer_reward_func": 0.6895833611488342,
"rewards/format_reward_func": 0.8020833432674408,
"step": 34
},
{
"completion_length": 232.88690948486328,
"epoch": 18.0,
"grad_norm": 1.244565109610492,
"kl": 0.0002675056457519531,
"learning_rate": 4.968654694381379e-07,
"loss": 0.0,
"reward": 1.543749988079071,
"reward_std": 0.5515827536582947,
"rewards/answer_reward_func": 0.6687500178813934,
"rewards/format_reward_func": 0.8750000298023224,
"step": 36
},
{
"completion_length": 243.01786041259766,
"epoch": 19.0,
"grad_norm": 1.1406446283382206,
"kl": 0.000301361083984375,
"learning_rate": 4.962711348162987e-07,
"loss": 0.0,
"reward": 1.5791666507720947,
"reward_std": 0.5238045454025269,
"rewards/answer_reward_func": 0.6937499940395355,
"rewards/format_reward_func": 0.8854166865348816,
"step": 38
},
{
"completion_length": 274.58631896972656,
"epoch": 20.0,
"grad_norm": 0.9070852985537547,
"kl": 0.00029468536376953125,
"learning_rate": 4.956256564226487e-07,
"loss": 0.0,
"reward": 1.583333432674408,
"reward_std": 0.5769022554159164,
"rewards/answer_reward_func": 0.7083333730697632,
"rewards/format_reward_func": 0.8750000298023224,
"step": 40
},
{
"completion_length": 269.13245391845703,
"epoch": 21.0,
"grad_norm": 0.9374825554752583,
"kl": 0.0003261566162109375,
"learning_rate": 4.949291683053768e-07,
"loss": 0.0,
"reward": 1.574999988079071,
"reward_std": 0.459792360663414,
"rewards/answer_reward_func": 0.7000000178813934,
"rewards/format_reward_func": 0.8750000298023224,
"step": 42
},
{
"completion_length": 219.09673309326172,
"epoch": 22.0,
"grad_norm": 1.0791631460921267,
"kl": 0.00034618377685546875,
"learning_rate": 4.941818151059955e-07,
"loss": 0.0,
"reward": 1.6729166507720947,
"reward_std": 0.416482537984848,
"rewards/answer_reward_func": 0.7666666507720947,
"rewards/format_reward_func": 0.9062500298023224,
"step": 44
},
{
"completion_length": 295.5714416503906,
"epoch": 23.0,
"grad_norm": 0.9749875952370675,
"kl": 0.000362396240234375,
"learning_rate": 4.933837520293017e-07,
"loss": 0.0,
"reward": 1.5750000476837158,
"reward_std": 0.46283578872680664,
"rewards/answer_reward_func": 0.6687500476837158,
"rewards/format_reward_func": 0.90625,
"step": 46
},
{
"completion_length": 252.36013793945312,
"epoch": 24.0,
"grad_norm": 1.0280635613370392,
"kl": 0.00028324127197265625,
"learning_rate": 4.925351448111454e-07,
"loss": 0.0,
"reward": 1.6520832777023315,
"reward_std": 0.4525061100721359,
"rewards/answer_reward_func": 0.7458333671092987,
"rewards/format_reward_func": 0.90625,
"step": 48
},
{
"completion_length": 282.2053680419922,
"epoch": 25.0,
"grad_norm": 0.9539175555414532,
"kl": 0.00036907196044921875,
"learning_rate": 4.91636169684011e-07,
"loss": 0.0,
"reward": 1.5645832419395447,
"reward_std": 0.5369424521923065,
"rewards/answer_reward_func": 0.7000000476837158,
"rewards/format_reward_func": 0.8645833432674408,
"step": 50
},
{
"completion_length": 251.30060577392578,
"epoch": 26.0,
"grad_norm": 1.044878374634429,
"kl": 0.00030231475830078125,
"learning_rate": 4.906870133404186e-07,
"loss": 0.0,
"reward": 1.5916666388511658,
"reward_std": 0.4689074903726578,
"rewards/answer_reward_func": 0.7375000417232513,
"rewards/format_reward_func": 0.8541666865348816,
"step": 52
},
{
"completion_length": 299.84674072265625,
"epoch": 27.0,
"grad_norm": 1.2843916568865072,
"kl": 0.00040435791015625,
"learning_rate": 4.896878728941531e-07,
"loss": 0.0,
"reward": 1.508333444595337,
"reward_std": 0.6762565672397614,
"rewards/answer_reward_func": 0.6854166984558105,
"rewards/format_reward_func": 0.8229166865348816,
"step": 54
},
{
"completion_length": 243.796142578125,
"epoch": 28.0,
"grad_norm": 0.900181145652731,
"kl": 0.0004787445068359375,
"learning_rate": 4.886389558393284e-07,
"loss": 0.0,
"reward": 1.7166666984558105,
"reward_std": 0.39391638338565826,
"rewards/answer_reward_func": 0.7895833253860474,
"rewards/format_reward_func": 0.9270833432674408,
"step": 56
},
{
"completion_length": 231.99553680419922,
"epoch": 29.0,
"grad_norm": 1.0630743880615185,
"kl": 0.000530242919921875,
"learning_rate": 4.875404800072976e-07,
"loss": 0.0,
"reward": 1.6895833611488342,
"reward_std": 0.3743758350610733,
"rewards/answer_reward_func": 0.762499988079071,
"rewards/format_reward_func": 0.9270833432674408,
"step": 58
},
{
"completion_length": 269.95982360839844,
"epoch": 30.0,
"grad_norm": 1.0709394734964757,
"kl": 0.00045013427734375,
"learning_rate": 4.86392673521415e-07,
"loss": 0.0,
"reward": 1.5354167222976685,
"reward_std": 0.5249515026807785,
"rewards/answer_reward_func": 0.6604166626930237,
"rewards/format_reward_func": 0.8750000298023224,
"step": 60
},
{
"completion_length": 261.08631896972656,
"epoch": 31.0,
"grad_norm": 0.8641809174103421,
"kl": 0.0004787445068359375,
"learning_rate": 4.851957747496606e-07,
"loss": 0.0,
"reward": 1.5916667580604553,
"reward_std": 0.46947768330574036,
"rewards/answer_reward_func": 0.6958333253860474,
"rewards/format_reward_func": 0.8958333432674408,
"step": 62
},
{
"completion_length": 269.31846618652344,
"epoch": 32.0,
"grad_norm": 0.8600216221217903,
"kl": 0.000507354736328125,
"learning_rate": 4.839500322551386e-07,
"loss": 0.0,
"reward": 1.6083332896232605,
"reward_std": 0.43799301981925964,
"rewards/answer_reward_func": 0.7229167222976685,
"rewards/format_reward_func": 0.8854166865348816,
"step": 64
},
{
"completion_length": 267.78721618652344,
"epoch": 33.0,
"grad_norm": 1.0025448441816613,
"kl": 0.000667572021484375,
"learning_rate": 4.826557047444563e-07,
"loss": 0.0,
"reward": 1.4520833492279053,
"reward_std": 0.6994097530841827,
"rewards/answer_reward_func": 0.6187500357627869,
"rewards/format_reward_func": 0.8333333730697632,
"step": 66
},
{
"completion_length": 188.45387268066406,
"epoch": 34.0,
"grad_norm": 1.126524575820377,
"kl": 0.00066375732421875,
"learning_rate": 4.813130610139993e-07,
"loss": 0.0,
"reward": 1.7291666269302368,
"reward_std": 0.4704573303461075,
"rewards/answer_reward_func": 0.8125,
"rewards/format_reward_func": 0.9166666865348816,
"step": 68
},
{
"completion_length": 261.0133972167969,
"epoch": 35.0,
"grad_norm": 1.022490845344434,
"kl": 0.000644683837890625,
"learning_rate": 4.799223798941089e-07,
"loss": 0.0,
"reward": 1.6062500476837158,
"reward_std": 0.41893763840198517,
"rewards/answer_reward_func": 0.6791667342185974,
"rewards/format_reward_func": 0.9270833432674408,
"step": 70
},
{
"completion_length": 256.7976379394531,
"epoch": 36.0,
"grad_norm": 1.1014000572624154,
"kl": 0.0005702972412109375,
"learning_rate": 4.78483950191177e-07,
"loss": 0.0,
"reward": 1.6583333611488342,
"reward_std": 0.3685748428106308,
"rewards/answer_reward_func": 0.7312500774860382,
"rewards/format_reward_func": 0.9270833432674408,
"step": 72
},
{
"completion_length": 243.47917938232422,
"epoch": 37.0,
"grad_norm": 0.9707014299449478,
"kl": 0.000640869140625,
"learning_rate": 4.769980706276687e-07,
"loss": 0.0,
"reward": 1.6625000834465027,
"reward_std": 0.4419756233692169,
"rewards/answer_reward_func": 0.7458333373069763,
"rewards/format_reward_func": 0.9166666865348816,
"step": 74
},
{
"completion_length": 263.03424072265625,
"epoch": 38.0,
"grad_norm": 1.1820665004678212,
"kl": 0.0007991790771484375,
"learning_rate": 4.7546504978008595e-07,
"loss": 0.0,
"reward": 1.6458333730697632,
"reward_std": 0.4905931055545807,
"rewards/answer_reward_func": 0.7395833730697632,
"rewards/format_reward_func": 0.9062500298023224,
"step": 76
},
{
"completion_length": 261.6458511352539,
"epoch": 39.0,
"grad_norm": 1.1213274436179879,
"kl": 0.0009899139404296875,
"learning_rate": 4.738852060148848e-07,
"loss": 0.0,
"reward": 1.64166659116745,
"reward_std": 0.48026007413864136,
"rewards/answer_reward_func": 0.7354167103767395,
"rewards/format_reward_func": 0.9062500298023224,
"step": 78
},
{
"completion_length": 207.6875,
"epoch": 40.0,
"grad_norm": 1.1448749496253416,
"kl": 0.0008602142333984375,
"learning_rate": 4.722588674223593e-07,
"loss": 0.0,
"reward": 1.5166666507720947,
"reward_std": 0.6429217755794525,
"rewards/answer_reward_func": 0.6729167103767395,
"rewards/format_reward_func": 0.8437500298023224,
"step": 80
},
{
"completion_length": 266.53125,
"epoch": 41.0,
"grad_norm": 1.811926744689833,
"kl": 0.00104522705078125,
"learning_rate": 4.70586371748506e-07,
"loss": 0.0,
"reward": 1.7104166746139526,
"reward_std": 0.4602654129266739,
"rewards/answer_reward_func": 0.783333420753479,
"rewards/format_reward_func": 0.9270833432674408,
"step": 82
},
{
"completion_length": 194.20238876342773,
"epoch": 42.0,
"grad_norm": 1.270434695255441,
"kl": 0.0009632110595703125,
"learning_rate": 4.6886806632488363e-07,
"loss": 0.0,
"reward": 1.5916666984558105,
"reward_std": 0.5647197067737579,
"rewards/answer_reward_func": 0.737500011920929,
"rewards/format_reward_func": 0.8541666865348816,
"step": 84
},
{
"completion_length": 275.46429443359375,
"epoch": 43.0,
"grad_norm": 1.2073773531159788,
"kl": 0.000888824462890625,
"learning_rate": 4.6710430799648143e-07,
"loss": 0.0,
"reward": 1.477083444595337,
"reward_std": 0.5555408447980881,
"rewards/answer_reward_func": 0.612500011920929,
"rewards/format_reward_func": 0.8645833432674408,
"step": 86
},
{
"completion_length": 282.5639953613281,
"epoch": 44.0,
"grad_norm": 1.064636286784791,
"kl": 0.00095367431640625,
"learning_rate": 4.652954630476127e-07,
"loss": 0.0,
"reward": 1.6437500715255737,
"reward_std": 0.3799179792404175,
"rewards/answer_reward_func": 0.7062500715255737,
"rewards/format_reward_func": 0.9375,
"step": 88
},
{
"completion_length": 272.7544708251953,
"epoch": 45.0,
"grad_norm": 1.042110371163498,
"kl": 0.00125885009765625,
"learning_rate": 4.6344190712584713e-07,
"loss": 0.0,
"reward": 1.5416666865348816,
"reward_std": 0.5731079876422882,
"rewards/answer_reward_func": 0.6875,
"rewards/format_reward_func": 0.8541666865348816,
"step": 90
},
{
"completion_length": 256.8943634033203,
"epoch": 46.0,
"grad_norm": 0.9870127861284985,
"kl": 0.001251220703125,
"learning_rate": 4.615440251639995e-07,
"loss": 0.0,
"reward": 1.65625,
"reward_std": 0.4869039058685303,
"rewards/answer_reward_func": 0.7395833432674408,
"rewards/format_reward_func": 0.9166666865348816,
"step": 92
},
{
"completion_length": 211.87054443359375,
"epoch": 47.0,
"grad_norm": 0.9671737851691804,
"kl": 0.001361846923828125,
"learning_rate": 4.596022113001894e-07,
"loss": 0.0,
"reward": 1.6625000834465027,
"reward_std": 0.5325099229812622,
"rewards/answer_reward_func": 0.7666666805744171,
"rewards/format_reward_func": 0.8958333432674408,
"step": 94
},
{
"completion_length": 193.9732208251953,
"epoch": 48.0,
"grad_norm": 0.967143862670542,
"kl": 0.001125335693359375,
"learning_rate": 4.576168687959895e-07,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.5218211710453033,
"rewards/answer_reward_func": 0.6874999701976776,
"rewards/format_reward_func": 0.8958333730697632,
"step": 96
},
{
"completion_length": 235.55209350585938,
"epoch": 49.0,
"grad_norm": 0.8679119146176711,
"kl": 0.00141143798828125,
"learning_rate": 4.555884099526793e-07,
"loss": 0.0,
"reward": 1.6437500715255737,
"reward_std": 0.4265412539243698,
"rewards/answer_reward_func": 0.7479167282581329,
"rewards/format_reward_func": 0.8958333730697632,
"step": 98
},
{
"completion_length": 258.24107360839844,
"epoch": 50.0,
"grad_norm": 0.9735600110189709,
"kl": 0.00128936767578125,
"learning_rate": 4.5351725602562174e-07,
"loss": 0.0,
"reward": 1.5458332896232605,
"reward_std": 0.5863806307315826,
"rewards/answer_reward_func": 0.6812499761581421,
"rewards/format_reward_func": 0.8645833432674408,
"step": 100
},
{
"completion_length": 251.37798309326172,
"epoch": 51.0,
"grad_norm": 1.1010153912626721,
"kl": 0.00257110595703125,
"learning_rate": 4.514038371367791e-07,
"loss": 0.0,
"reward": 1.6875000596046448,
"reward_std": 0.3308330178260803,
"rewards/answer_reward_func": 0.7395833432674408,
"rewards/format_reward_func": 0.9479166865348816,
"step": 102
},
{
"completion_length": 265.4866180419922,
"epoch": 52.0,
"grad_norm": 0.9819207836814513,
"kl": 0.0019073486328125,
"learning_rate": 4.4924859218538936e-07,
"loss": 0.0,
"reward": 1.631250023841858,
"reward_std": 0.4357140362262726,
"rewards/answer_reward_func": 0.7250000238418579,
"rewards/format_reward_func": 0.90625,
"step": 104
},
{
"completion_length": 233.65030670166016,
"epoch": 53.0,
"grad_norm": 1.0300874744860617,
"kl": 0.001468658447265625,
"learning_rate": 4.470519687568185e-07,
"loss": 0.0,
"reward": 1.4812501072883606,
"reward_std": 0.5527896136045456,
"rewards/answer_reward_func": 0.6375000774860382,
"rewards/format_reward_func": 0.84375,
"step": 106
},
{
"completion_length": 206.92857360839844,
"epoch": 54.0,
"grad_norm": 1.1141272522177064,
"kl": 0.001773834228515625,
"learning_rate": 4.4481442302960923e-07,
"loss": 0.0,
"reward": 1.7687500715255737,
"reward_std": 0.3796774446964264,
"rewards/answer_reward_func": 0.820833295583725,
"rewards/format_reward_func": 0.9479166865348816,
"step": 108
},
{
"completion_length": 221.09673309326172,
"epoch": 55.0,
"grad_norm": 1.352829863229064,
"kl": 0.002471923828125,
"learning_rate": 4.4253641968074505e-07,
"loss": 0.0,
"reward": 1.5208333730697632,
"reward_std": 0.5520410537719727,
"rewards/answer_reward_func": 0.6562500298023224,
"rewards/format_reward_func": 0.8645833432674408,
"step": 110
},
{
"completion_length": 257.77232360839844,
"epoch": 56.0,
"grad_norm": 1.1920858052182373,
"kl": 0.00225067138671875,
"learning_rate": 4.402184317891501e-07,
"loss": 0.0,
"reward": 1.5583334565162659,
"reward_std": 0.5900199711322784,
"rewards/answer_reward_func": 0.6937500238418579,
"rewards/format_reward_func": 0.8645833730697632,
"step": 112
},
{
"completion_length": 235.68899536132812,
"epoch": 57.0,
"grad_norm": 1.3653230138170256,
"kl": 0.00194549560546875,
"learning_rate": 4.37860940737443e-07,
"loss": 0.0,
"reward": 1.6520833373069763,
"reward_std": 0.46646520495414734,
"rewards/answer_reward_func": 0.7458333969116211,
"rewards/format_reward_func": 0.9062500298023224,
"step": 114
},
{
"completion_length": 258.796142578125,
"epoch": 58.0,
"grad_norm": 0.8403380498881522,
"kl": 0.001804351806640625,
"learning_rate": 4.354644361119671e-07,
"loss": 0.0,
"reward": 1.600000023841858,
"reward_std": 0.505972146987915,
"rewards/answer_reward_func": 0.7145833671092987,
"rewards/format_reward_func": 0.8854166865348816,
"step": 116
},
{
"completion_length": 249.9494171142578,
"epoch": 59.0,
"grad_norm": 1.0657379909631177,
"kl": 0.001987457275390625,
"learning_rate": 4.3302941560111716e-07,
"loss": 0.0,
"reward": 1.6354166865348816,
"reward_std": 0.4670645296573639,
"rewards/answer_reward_func": 0.71875,
"rewards/format_reward_func": 0.9166666865348816,
"step": 118
},
{
"completion_length": 223.75149536132812,
"epoch": 60.0,
"grad_norm": 1.0364423920157473,
"kl": 0.0026397705078125,
"learning_rate": 4.3055638489198236e-07,
"loss": 0.0,
"reward": 1.6645833849906921,
"reward_std": 0.36297454684972763,
"rewards/answer_reward_func": 0.7270833849906921,
"rewards/format_reward_func": 0.9375000298023224,
"step": 120
},
{
"completion_length": 231.37798309326172,
"epoch": 61.0,
"grad_norm": 0.689843165639308,
"kl": 0.00292205810546875,
"learning_rate": 4.280458575653296e-07,
"loss": 0.0,
"reward": 1.625,
"reward_std": 0.4028606414794922,
"rewards/answer_reward_func": 0.7083333730697632,
"rewards/format_reward_func": 0.9166666865348816,
"step": 122
},
{
"completion_length": 205.21726989746094,
"epoch": 62.0,
"grad_norm": 0.9343619997416344,
"kl": 0.002593994140625,
"learning_rate": 4.2549835498894665e-07,
"loss": 0.0,
"reward": 1.6812500357627869,
"reward_std": 0.3913162499666214,
"rewards/answer_reward_func": 0.7645833492279053,
"rewards/format_reward_func": 0.9166666865348816,
"step": 124
},
{
"completion_length": 244.34524536132812,
"epoch": 63.0,
"grad_norm": 0.8737346807108217,
"kl": 0.00287628173828125,
"learning_rate": 4.229144062093679e-07,
"loss": 0.0,
"reward": 1.6604167222976685,
"reward_std": 0.3475506007671356,
"rewards/answer_reward_func": 0.7229166924953461,
"rewards/format_reward_func": 0.9375,
"step": 126
},
{
"completion_length": 207.37053680419922,
"epoch": 64.0,
"grad_norm": 0.9790196814028996,
"kl": 0.003173828125,
"learning_rate": 4.2029454784200675e-07,
"loss": 0.0,
"reward": 1.5520833730697632,
"reward_std": 0.5492734163999557,
"rewards/answer_reward_func": 0.6770833432674408,
"rewards/format_reward_func": 0.8750000298023224,
"step": 128
},
{
"completion_length": 243.30506896972656,
"epoch": 65.0,
"grad_norm": 0.8985131154749594,
"kl": 0.00261688232421875,
"learning_rate": 4.1763932395971433e-07,
"loss": 0.0,
"reward": 1.5479167103767395,
"reward_std": 0.5215294808149338,
"rewards/answer_reward_func": 0.6729166805744171,
"rewards/format_reward_func": 0.8750000298023224,
"step": 130
},
{
"completion_length": 187.57887268066406,
"epoch": 66.0,
"grad_norm": 1.3006465432399807,
"kl": 0.00403594970703125,
"learning_rate": 4.1494928597979117e-07,
"loss": 0.0,
"reward": 1.7729166746139526,
"reward_std": 0.29627224802970886,
"rewards/answer_reward_func": 0.8250000774860382,
"rewards/format_reward_func": 0.9479166865348816,
"step": 132
},
{
"completion_length": 234.69644165039062,
"epoch": 67.0,
"grad_norm": 1.3361777010265043,
"kl": 0.003692626953125,
"learning_rate": 4.122249925494726e-07,
"loss": 0.0,
"reward": 1.6229167580604553,
"reward_std": 0.42341606318950653,
"rewards/answer_reward_func": 0.7062499821186066,
"rewards/format_reward_func": 0.9166666865348816,
"step": 134
},
{
"completion_length": 202.57589721679688,
"epoch": 68.0,
"grad_norm": 1.17339881745284,
"kl": 0.004425048828125,
"learning_rate": 4.094670094299131e-07,
"loss": 0.0,
"reward": 1.6874999403953552,
"reward_std": 0.44907137751579285,
"rewards/answer_reward_func": 0.7708333134651184,
"rewards/format_reward_func": 0.9166666865348816,
"step": 136
},
{
"completion_length": 213.21875762939453,
"epoch": 69.0,
"grad_norm": 0.7429382258526259,
"kl": 0.00403594970703125,
"learning_rate": 4.066759093786931e-07,
"loss": 0.0,
"reward": 1.756250023841858,
"reward_std": 0.4163903295993805,
"rewards/answer_reward_func": 0.8291666805744171,
"rewards/format_reward_func": 0.9270833432674408,
"step": 138
},
{
"completion_length": 270.60120391845703,
"epoch": 70.0,
"grad_norm": 0.9593428203594789,
"kl": 0.00347137451171875,
"learning_rate": 4.038522720308732e-07,
"loss": 0.0,
"reward": 1.7083333134651184,
"reward_std": 0.5051226913928986,
"rewards/answer_reward_func": 0.8125000298023224,
"rewards/format_reward_func": 0.8958333432674408,
"step": 140
},
{
"completion_length": 278.47471618652344,
"epoch": 71.0,
"grad_norm": 0.9836950616198032,
"kl": 0.00399017333984375,
"learning_rate": 4.009966837786194e-07,
"loss": 0.0,
"reward": 1.5312500596046448,
"reward_std": 0.5006265342235565,
"rewards/answer_reward_func": 0.625,
"rewards/format_reward_func": 0.9062500298023224,
"step": 142
},
{
"completion_length": 262.28125,
"epoch": 72.0,
"grad_norm": 1.007097659012535,
"kl": 0.00365447998046875,
"learning_rate": 3.981097376494259e-07,
"loss": 0.0,
"reward": 1.5854166746139526,
"reward_std": 0.4385734647512436,
"rewards/answer_reward_func": 0.679166704416275,
"rewards/format_reward_func": 0.9062500298023224,
"step": 144
},
{
"completion_length": 201.98065948486328,
"epoch": 73.0,
"grad_norm": 0.9873674045003755,
"kl": 0.00390625,
"learning_rate": 3.951920331829592e-07,
"loss": 0.0,
"reward": 1.7104167342185974,
"reward_std": 0.41417770087718964,
"rewards/answer_reward_func": 0.7833333313465118,
"rewards/format_reward_func": 0.9270833432674408,
"step": 146
},
{
"completion_length": 233.78274536132812,
"epoch": 74.0,
"grad_norm": 0.8019175311994922,
"kl": 0.0042877197265625,
"learning_rate": 3.922441763065506e-07,
"loss": 0.0,
"reward": 1.6625000834465027,
"reward_std": 0.5313624888658524,
"rewards/answer_reward_func": 0.7666666507720947,
"rewards/format_reward_func": 0.8958333432674408,
"step": 148
},
{
"completion_length": 215.09673309326172,
"epoch": 75.0,
"grad_norm": 1.1807131764400134,
"kl": 0.004913330078125,
"learning_rate": 3.8926677920936093e-07,
"loss": 0.0,
"reward": 1.7541666626930237,
"reward_std": 0.37399402260780334,
"rewards/answer_reward_func": 0.8270833790302277,
"rewards/format_reward_func": 0.9270833432674408,
"step": 150
},
{
"completion_length": 236.639892578125,
"epoch": 76.0,
"grad_norm": 0.8847773717166705,
"kl": 0.00482177734375,
"learning_rate": 3.862604602152464e-07,
"loss": 0.0,
"reward": 1.600000023841858,
"reward_std": 0.4837402403354645,
"rewards/answer_reward_func": 0.6937500536441803,
"rewards/format_reward_func": 0.9062500298023224,
"step": 152
},
{
"completion_length": 256.1979217529297,
"epoch": 77.0,
"grad_norm": 1.1407877139333158,
"kl": 0.00395965576171875,
"learning_rate": 3.8322584365434934e-07,
"loss": 0.0,
"reward": 1.6541666984558105,
"reward_std": 0.4243648201227188,
"rewards/answer_reward_func": 0.7479166686534882,
"rewards/format_reward_func": 0.90625,
"step": 154
},
{
"completion_length": 273.0744094848633,
"epoch": 78.0,
"grad_norm": 0.9978906361186654,
"kl": 0.004241943359375,
"learning_rate": 3.8016355973344173e-07,
"loss": 0.0,
"reward": 1.5770833492279053,
"reward_std": 0.43396155536174774,
"rewards/answer_reward_func": 0.6812500357627869,
"rewards/format_reward_func": 0.8958333730697632,
"step": 156
},
{
"completion_length": 184.15774536132812,
"epoch": 79.0,
"grad_norm": 1.6097549828202833,
"kl": 0.006256103515625,
"learning_rate": 3.7707424440504863e-07,
"loss": 0.0,
"reward": 1.5937500596046448,
"reward_std": 0.47065815329551697,
"rewards/answer_reward_func": 0.6770833432674408,
"rewards/format_reward_func": 0.9166666865348816,
"step": 158
},
{
"completion_length": 258.27679443359375,
"epoch": 80.0,
"grad_norm": 1.0827800443504645,
"kl": 0.0040435791015625,
"learning_rate": 3.739585392353787e-07,
"loss": 0.0,
"reward": 1.5458333492279053,
"reward_std": 0.48180142045021057,
"rewards/answer_reward_func": 0.6500000357627869,
"rewards/format_reward_func": 0.8958333432674408,
"step": 160
},
{
"completion_length": 225.63839721679688,
"epoch": 81.0,
"grad_norm": 0.9501823254901924,
"kl": 0.0042724609375,
"learning_rate": 3.7081709127108767e-07,
"loss": 0.0,
"reward": 1.7666666507720947,
"reward_std": 0.300030842423439,
"rewards/answer_reward_func": 0.8187500238418579,
"rewards/format_reward_func": 0.9479166865348816,
"step": 162
},
{
"completion_length": 197.98214721679688,
"epoch": 82.0,
"grad_norm": 1.1449475824136237,
"kl": 0.0059356689453125,
"learning_rate": 3.6765055290490513e-07,
"loss": 0.0,
"reward": 1.7208333611488342,
"reward_std": 0.40443800389766693,
"rewards/answer_reward_func": 0.7937500774860382,
"rewards/format_reward_func": 0.9270833432674408,
"step": 164
},
{
"completion_length": 279.3407897949219,
"epoch": 83.0,
"grad_norm": 1.0930660569963664,
"kl": 0.00414276123046875,
"learning_rate": 3.644595817401501e-07,
"loss": 0.0,
"reward": 1.6229166984558105,
"reward_std": 0.3718283176422119,
"rewards/answer_reward_func": 0.6854166984558105,
"rewards/format_reward_func": 0.9375000298023224,
"step": 166
},
{
"completion_length": 267.35120391845703,
"epoch": 84.0,
"grad_norm": 0.9881293106299988,
"kl": 0.005523681640625,
"learning_rate": 3.6124484045416483e-07,
"loss": 0.0,
"reward": 1.6041667461395264,
"reward_std": 0.5452571213245392,
"rewards/answer_reward_func": 0.71875,
"rewards/format_reward_func": 0.8854166865348816,
"step": 168
},
{
"completion_length": 195.10565948486328,
"epoch": 85.0,
"grad_norm": 0.917551086901663,
"kl": 0.0063629150390625,
"learning_rate": 3.580069966606949e-07,
"loss": 0.0,
"reward": 1.5958333015441895,
"reward_std": 0.42350558936595917,
"rewards/answer_reward_func": 0.679166704416275,
"rewards/format_reward_func": 0.9166666865348816,
"step": 170
},
{
"completion_length": 279.7306594848633,
"epoch": 86.0,
"grad_norm": 0.9580308294096218,
"kl": 0.00469970703125,
"learning_rate": 3.547467227712444e-07,
"loss": 0.0,
"reward": 1.6729167699813843,
"reward_std": 0.43303608894348145,
"rewards/answer_reward_func": 0.7458333373069763,
"rewards/format_reward_func": 0.9270833730697632,
"step": 172
},
{
"completion_length": 275.31846618652344,
"epoch": 87.0,
"grad_norm": 1.413142917482677,
"kl": 0.006561279296875,
"learning_rate": 3.5146469585543386e-07,
"loss": 0.0,
"reward": 1.48333340883255,
"reward_std": 0.5618228912353516,
"rewards/answer_reward_func": 0.6291667222976685,
"rewards/format_reward_func": 0.8541666865348816,
"step": 174
},
{
"completion_length": 261.55804443359375,
"epoch": 88.0,
"grad_norm": 1.2298546234313372,
"kl": 0.0056304931640625,
"learning_rate": 3.481615975003922e-07,
"loss": 0.0,
"reward": 1.7395833730697632,
"reward_std": 0.4074627459049225,
"rewards/answer_reward_func": 0.8020833432674408,
"rewards/format_reward_func": 0.9375000298023224,
"step": 176
},
{
"completion_length": 225.52976989746094,
"epoch": 89.0,
"grad_norm": 0.9886590206068148,
"kl": 0.0056610107421875,
"learning_rate": 3.448381136692089e-07,
"loss": 0.0,
"reward": 1.6395833492279053,
"reward_std": 0.46003973484039307,
"rewards/answer_reward_func": 0.7333333790302277,
"rewards/format_reward_func": 0.9062500298023224,
"step": 178
},
{
"completion_length": 233.577392578125,
"epoch": 90.0,
"grad_norm": 1.015531261972698,
"kl": 0.00555419921875,
"learning_rate": 3.4149493455847897e-07,
"loss": 0.0,
"reward": 1.6687501072883606,
"reward_std": 0.45826081931591034,
"rewards/answer_reward_func": 0.7520833313465118,
"rewards/format_reward_func": 0.9166666865348816,
"step": 180
},
{
"completion_length": 259.03424072265625,
"epoch": 91.0,
"grad_norm": 0.979842350756012,
"kl": 0.005523681640625,
"learning_rate": 3.3813275445496766e-07,
"loss": 0.0,
"reward": 1.5625000596046448,
"reward_std": 0.5489896833896637,
"rewards/answer_reward_func": 0.6875000596046448,
"rewards/format_reward_func": 0.875,
"step": 182
},
{
"completion_length": 293.6592330932617,
"epoch": 92.0,
"grad_norm": 0.9379840832402985,
"kl": 0.0054168701171875,
"learning_rate": 3.347522715914262e-07,
"loss": 0.0,
"reward": 1.5604167580604553,
"reward_std": 0.5173148959875107,
"rewards/answer_reward_func": 0.6750000417232513,
"rewards/format_reward_func": 0.8854166865348816,
"step": 184
},
{
"completion_length": 231.08185577392578,
"epoch": 93.0,
"grad_norm": 1.3551218453321658,
"kl": 0.0079193115234375,
"learning_rate": 3.313541880015877e-07,
"loss": 0.0,
"reward": 1.6229167580604553,
"reward_std": 0.498469278216362,
"rewards/answer_reward_func": 0.7166667282581329,
"rewards/format_reward_func": 0.90625,
"step": 186
},
{
"completion_length": 184.6636962890625,
"epoch": 94.0,
"grad_norm": 0.7343039648552588,
"kl": 0.0069122314453125,
"learning_rate": 3.279392093743747e-07,
"loss": 0.0,
"reward": 1.6437500715255737,
"reward_std": 0.40093202888965607,
"rewards/answer_reward_func": 0.7374999821186066,
"rewards/format_reward_func": 0.90625,
"step": 188
},
{
"completion_length": 164.43750381469727,
"epoch": 95.0,
"grad_norm": 1.3333212202349367,
"kl": 0.009368896484375,
"learning_rate": 3.245080449073459e-07,
"loss": 0.0,
"reward": 1.518750011920929,
"reward_std": 0.5581459701061249,
"rewards/answer_reward_func": 0.6645833551883698,
"rewards/format_reward_func": 0.8541666865348816,
"step": 190
},
{
"completion_length": 243.56846618652344,
"epoch": 96.0,
"grad_norm": 0.9202461820347047,
"kl": 0.004638671875,
"learning_rate": 3.210614071594162e-07,
"loss": 0.0,
"reward": 1.5416667461395264,
"reward_std": 0.6040982007980347,
"rewards/answer_reward_func": 0.6666666865348816,
"rewards/format_reward_func": 0.875,
"step": 192
},
{
"completion_length": 226.83928680419922,
"epoch": 97.0,
"grad_norm": 1.0722947522372448,
"kl": 0.0070037841796875,
"learning_rate": 3.1760001190287695e-07,
"loss": 0.0,
"reward": 1.6041668057441711,
"reward_std": 0.4665968716144562,
"rewards/answer_reward_func": 0.7083334028720856,
"rewards/format_reward_func": 0.8958333432674408,
"step": 194
},
{
"completion_length": 174.95387268066406,
"epoch": 98.0,
"grad_norm": 1.3127977169467409,
"kl": 0.00787353515625,
"learning_rate": 3.141245779747502e-07,
"loss": 0.0,
"reward": 1.6520834565162659,
"reward_std": 0.4045562893152237,
"rewards/answer_reward_func": 0.7250000536441803,
"rewards/format_reward_func": 0.9270833730697632,
"step": 196
},
{
"completion_length": 171.74703216552734,
"epoch": 99.0,
"grad_norm": 1.187680476036088,
"kl": 0.007598876953125,
"learning_rate": 3.106358271275056e-07,
"loss": 0.0,
"reward": 1.756250023841858,
"reward_std": 0.37892505526542664,
"rewards/answer_reward_func": 0.8187499940395355,
"rewards/format_reward_func": 0.9375,
"step": 198
},
{
"completion_length": 250.858642578125,
"epoch": 100.0,
"grad_norm": 0.9052133122410684,
"kl": 0.00677490234375,
"learning_rate": 3.0713448387917227e-07,
"loss": 0.0,
"reward": 1.681249976158142,
"reward_std": 0.4449262470006943,
"rewards/answer_reward_func": 0.7750000357627869,
"rewards/format_reward_func": 0.90625,
"step": 200
},
{
"completion_length": 163.5416717529297,
"epoch": 101.0,
"grad_norm": 0.8373387718652715,
"kl": 0.0063018798828125,
"learning_rate": 3.0362127536287636e-07,
"loss": 0.0,
"reward": 1.652083396911621,
"reward_std": 0.46857765316963196,
"rewards/answer_reward_func": 0.7354167401790619,
"rewards/format_reward_func": 0.9166666865348816,
"step": 202
},
{
"completion_length": 246.1131134033203,
"epoch": 102.0,
"grad_norm": 0.7374900854502903,
"kl": 0.006195068359375,
"learning_rate": 3.0009693117583523e-07,
"loss": 0.0,
"reward": 1.6479167938232422,
"reward_std": 0.5083845257759094,
"rewards/answer_reward_func": 0.762499988079071,
"rewards/format_reward_func": 0.8854166865348816,
"step": 204
},
{
"completion_length": 231.79911041259766,
"epoch": 103.0,
"grad_norm": 1.180696672190672,
"kl": 0.010528564453125,
"learning_rate": 2.965621832278401e-07,
"loss": 0.0,
"reward": 1.6229166388511658,
"reward_std": 0.6331139504909515,
"rewards/answer_reward_func": 0.7479166984558105,
"rewards/format_reward_func": 0.875,
"step": 206
},
{
"completion_length": 237.18602752685547,
"epoch": 104.0,
"grad_norm": 1.0783781256408975,
"kl": 0.0068817138671875,
"learning_rate": 2.9301776558925875e-07,
"loss": 0.0,
"reward": 1.6979168057441711,
"reward_std": 0.3851548731327057,
"rewards/answer_reward_func": 0.7604166865348816,
"rewards/format_reward_func": 0.9375000298023224,
"step": 208
},
{
"completion_length": 194.03274536132812,
"epoch": 105.0,
"grad_norm": 1.3242781951977625,
"kl": 0.008575439453125,
"learning_rate": 2.894644143385885e-07,
"loss": 0.0,
"reward": 1.7208333611488342,
"reward_std": 0.3679712861776352,
"rewards/answer_reward_func": 0.7729166746139526,
"rewards/format_reward_func": 0.9479166865348816,
"step": 210
},
{
"completion_length": 254.84375762939453,
"epoch": 106.0,
"grad_norm": 1.2562417854229282,
"kl": 0.0070953369140625,
"learning_rate": 2.859028674095937e-07,
"loss": 0.0,
"reward": 1.5958333015441895,
"reward_std": 0.36180783808231354,
"rewards/answer_reward_func": 0.6687500178813934,
"rewards/format_reward_func": 0.9270833432674408,
"step": 212
},
{
"completion_length": 220.92262268066406,
"epoch": 107.0,
"grad_norm": 1.1976305491708217,
"kl": 0.0076141357421875,
"learning_rate": 2.823338644380566e-07,
"loss": 0.0,
"reward": 1.5916667580604553,
"reward_std": 0.4970894306898117,
"rewards/answer_reward_func": 0.6854166984558105,
"rewards/format_reward_func": 0.9062500298023224,
"step": 214
},
{
"completion_length": 230.11607360839844,
"epoch": 108.0,
"grad_norm": 1.2521189573673432,
"kl": 0.011016845703125,
"learning_rate": 2.7875814660817504e-07,
"loss": 0.0,
"reward": 1.6437500715255737,
"reward_std": 0.35511599481105804,
"rewards/answer_reward_func": 0.6958333849906921,
"rewards/format_reward_func": 0.9479166865348816,
"step": 216
},
{
"completion_length": 257.1294708251953,
"epoch": 109.0,
"grad_norm": 0.8840586512014985,
"kl": 0.0065155029296875,
"learning_rate": 2.751764564986396e-07,
"loss": 0.0,
"reward": 1.7041666507720947,
"reward_std": 0.4368957430124283,
"rewards/answer_reward_func": 0.7874999940395355,
"rewards/format_reward_func": 0.9166666865348816,
"step": 218
},
{
"completion_length": 180.08036041259766,
"epoch": 110.0,
"grad_norm": 1.3367177987973136,
"kl": 0.008544921875,
"learning_rate": 2.715895379284194e-07,
"loss": 0.0,
"reward": 1.7208334803581238,
"reward_std": 0.3184947222471237,
"rewards/answer_reward_func": 0.7833333611488342,
"rewards/format_reward_func": 0.9375000298023224,
"step": 220
},
{
"completion_length": 174.14583587646484,
"epoch": 111.0,
"grad_norm": 0.9864081299854107,
"kl": 0.009552001953125,
"learning_rate": 2.6799813580229174e-07,
"loss": 0.0,
"reward": 1.6166667938232422,
"reward_std": 0.45926420390605927,
"rewards/answer_reward_func": 0.7104166746139526,
"rewards/format_reward_func": 0.90625,
"step": 222
},
{
"completion_length": 277.2693634033203,
"epoch": 112.0,
"grad_norm": 0.9956535351844211,
"kl": 0.0081634521484375,
"learning_rate": 2.6440299595614606e-07,
"loss": 0.0,
"reward": 1.675000011920929,
"reward_std": 0.4140470027923584,
"rewards/answer_reward_func": 0.7583333551883698,
"rewards/format_reward_func": 0.9166666865348816,
"step": 224
},
{
"completion_length": 212.07292938232422,
"epoch": 113.0,
"grad_norm": 1.240347599024976,
"kl": 0.006927490234375,
"learning_rate": 2.6080486500209347e-07,
"loss": 0.0,
"reward": 1.6791666746139526,
"reward_std": 0.4041960537433624,
"rewards/answer_reward_func": 0.7729166448116302,
"rewards/format_reward_func": 0.9062500298023224,
"step": 226
},
{
"completion_length": 211.12649536132812,
"epoch": 114.0,
"grad_norm": 1.0191986624515685,
"kl": 0.007720947265625,
"learning_rate": 2.572044901734166e-07,
"loss": 0.0,
"reward": 1.6395833492279053,
"reward_std": 0.5253069698810577,
"rewards/answer_reward_func": 0.7541667222976685,
"rewards/format_reward_func": 0.8854166865348816,
"step": 228
},
{
"completion_length": 253.59524536132812,
"epoch": 115.0,
"grad_norm": 1.1187509727468907,
"kl": 0.0093994140625,
"learning_rate": 2.536026191693893e-07,
"loss": 0.0,
"reward": 1.6500001549720764,
"reward_std": 0.5131012797355652,
"rewards/answer_reward_func": 0.7437500357627869,
"rewards/format_reward_func": 0.9062500298023224,
"step": 230
},
{
"completion_length": 252.3601303100586,
"epoch": 116.0,
"grad_norm": 0.8071311834392373,
"kl": 0.009674072265625,
"learning_rate": 2.5e-07,
"loss": 0.0,
"reward": 1.6562501192092896,
"reward_std": 0.48235540091991425,
"rewards/answer_reward_func": 0.7708334028720856,
"rewards/format_reward_func": 0.8854166865348816,
"step": 232
},
{
"completion_length": 233.3214340209961,
"epoch": 117.0,
"grad_norm": 1.1656552755712852,
"kl": 0.009185791015625,
"learning_rate": 2.4639738083061073e-07,
"loss": 0.0,
"reward": 1.7062499523162842,
"reward_std": 0.4089523106813431,
"rewards/answer_reward_func": 0.7687500417232513,
"rewards/format_reward_func": 0.9375000298023224,
"step": 234
},
{
"completion_length": 208.43899536132812,
"epoch": 118.0,
"grad_norm": 1.1265120620864038,
"kl": 0.0083465576171875,
"learning_rate": 2.4279550982658345e-07,
"loss": 0.0,
"reward": 1.6166667342185974,
"reward_std": 0.46437712013721466,
"rewards/answer_reward_func": 0.7104166448116302,
"rewards/format_reward_func": 0.90625,
"step": 236
},
{
"completion_length": 224.84524536132812,
"epoch": 119.0,
"grad_norm": 1.0636630331845314,
"kl": 0.0076141357421875,
"learning_rate": 2.3919513499790646e-07,
"loss": 0.0,
"reward": 1.6333333253860474,
"reward_std": 0.47814565896987915,
"rewards/answer_reward_func": 0.7270833849906921,
"rewards/format_reward_func": 0.9062500298023224,
"step": 238
},
{
"completion_length": 247.66221618652344,
"epoch": 120.0,
"grad_norm": 1.113225205494867,
"kl": 0.009002685546875,
"learning_rate": 2.3559700404385394e-07,
"loss": 0.0,
"reward": 1.6958333253860474,
"reward_std": 0.4592965245246887,
"rewards/answer_reward_func": 0.768750011920929,
"rewards/format_reward_func": 0.9270833730697632,
"step": 240
},
{
"completion_length": 237.17857360839844,
"epoch": 121.0,
"grad_norm": 1.077870033372934,
"kl": 0.006683349609375,
"learning_rate": 2.3200186419770823e-07,
"loss": 0.0,
"reward": 1.5104167461395264,
"reward_std": 0.5212539583444595,
"rewards/answer_reward_func": 0.6562500298023224,
"rewards/format_reward_func": 0.8541666865348816,
"step": 242
},
{
"completion_length": 219.99553680419922,
"epoch": 122.0,
"grad_norm": 1.1297698266097218,
"kl": 0.009765625,
"learning_rate": 2.284104620715807e-07,
"loss": 0.0,
"reward": 1.6541667580604553,
"reward_std": 0.48302122950553894,
"rewards/answer_reward_func": 0.737500011920929,
"rewards/format_reward_func": 0.9166666865348816,
"step": 244
},
{
"completion_length": 184.85714721679688,
"epoch": 123.0,
"grad_norm": 1.01567299552301,
"kl": 0.012451171875,
"learning_rate": 2.2482354350136043e-07,
"loss": 0.0,
"reward": 1.6791667938232422,
"reward_std": 0.4051380306482315,
"rewards/answer_reward_func": 0.7729167342185974,
"rewards/format_reward_func": 0.90625,
"step": 246
},
{
"completion_length": 204.14137268066406,
"epoch": 124.0,
"grad_norm": 1.1409009201232085,
"kl": 0.0103759765625,
"learning_rate": 2.2124185339182496e-07,
"loss": 0.0,
"reward": 1.5937501192092896,
"reward_std": 0.488675519824028,
"rewards/answer_reward_func": 0.6979166567325592,
"rewards/format_reward_func": 0.8958333432674408,
"step": 248
},
{
"completion_length": 228.09524536132812,
"epoch": 125.0,
"grad_norm": 1.1428973558078321,
"kl": 0.0098876953125,
"learning_rate": 2.1766613556194344e-07,
"loss": 0.0,
"reward": 1.6812500953674316,
"reward_std": 0.4293702244758606,
"rewards/answer_reward_func": 0.7645833194255829,
"rewards/format_reward_func": 0.9166666865348816,
"step": 250
},
{
"completion_length": 265.23958587646484,
"epoch": 126.0,
"grad_norm": 0.9689867414739515,
"kl": 0.010528564453125,
"learning_rate": 2.1409713259040628e-07,
"loss": 0.0,
"reward": 1.6958333849906921,
"reward_std": 0.5003866702318192,
"rewards/answer_reward_func": 0.7895833551883698,
"rewards/format_reward_func": 0.90625,
"step": 252
},
{
"completion_length": 238.41815948486328,
"epoch": 127.0,
"grad_norm": 1.2313155770579827,
"kl": 0.010711669921875,
"learning_rate": 2.105355856614115e-07,
"loss": 0.0,
"reward": 1.6770833730697632,
"reward_std": 0.47428610920906067,
"rewards/answer_reward_func": 0.7500000298023224,
"rewards/format_reward_func": 0.9270833432674408,
"step": 254
},
{
"completion_length": 232.48661041259766,
"epoch": 128.0,
"grad_norm": 0.9110185027634778,
"kl": 0.008453369140625,
"learning_rate": 2.069822344107413e-07,
"loss": 0.0,
"reward": 1.6916666626930237,
"reward_std": 0.4168810397386551,
"rewards/answer_reward_func": 0.7750000357627869,
"rewards/format_reward_func": 0.9166666865348816,
"step": 256
},
{
"completion_length": 241.36458587646484,
"epoch": 129.0,
"grad_norm": 1.2821986198667763,
"kl": 0.01025390625,
"learning_rate": 2.034378167721599e-07,
"loss": 0.0,
"reward": 1.7645832896232605,
"reward_std": 0.30109934508800507,
"rewards/answer_reward_func": 0.8166667222976685,
"rewards/format_reward_func": 0.9479166865348816,
"step": 258
},
{
"completion_length": 225.42857360839844,
"epoch": 130.0,
"grad_norm": 0.943480606859016,
"kl": 0.01080322265625,
"learning_rate": 1.9990306882416485e-07,
"loss": 0.0,
"reward": 1.683333396911621,
"reward_std": 0.49519842863082886,
"rewards/answer_reward_func": 0.7770833671092987,
"rewards/format_reward_func": 0.9062500298023224,
"step": 260
},
{
"completion_length": 231.6636962890625,
"epoch": 131.0,
"grad_norm": 0.9562007922521701,
"kl": 0.012542724609375,
"learning_rate": 1.9637872463712362e-07,
"loss": 0.0,
"reward": 1.6104167103767395,
"reward_std": 0.5748671591281891,
"rewards/answer_reward_func": 0.7354167103767395,
"rewards/format_reward_func": 0.8750000298023224,
"step": 262
},
{
"completion_length": 276.0327453613281,
"epoch": 132.0,
"grad_norm": 0.9641627688993039,
"kl": 0.009033203125,
"learning_rate": 1.9286551612082773e-07,
"loss": 0.0,
"reward": 1.6375000476837158,
"reward_std": 0.4231380522251129,
"rewards/answer_reward_func": 0.7208333611488342,
"rewards/format_reward_func": 0.9166666865348816,
"step": 264
},
{
"completion_length": 256.84078216552734,
"epoch": 133.0,
"grad_norm": 1.0911968919904242,
"kl": 0.009490966796875,
"learning_rate": 1.8936417287249446e-07,
"loss": 0.0,
"reward": 1.6666666269302368,
"reward_std": 0.4258534461259842,
"rewards/answer_reward_func": 0.729166716337204,
"rewards/format_reward_func": 0.9375000298023224,
"step": 266
},
{
"completion_length": 250.56102752685547,
"epoch": 134.0,
"grad_norm": 1.2221245515880175,
"kl": 0.009674072265625,
"learning_rate": 1.8587542202524985e-07,
"loss": 0.0,
"reward": 1.524999976158142,
"reward_std": 0.5299902558326721,
"rewards/answer_reward_func": 0.6395833790302277,
"rewards/format_reward_func": 0.8854166865348816,
"step": 268
},
{
"completion_length": 196.33185577392578,
"epoch": 135.0,
"grad_norm": 1.2840402345310011,
"kl": 0.013519287109375,
"learning_rate": 1.82399988097123e-07,
"loss": 0.0,
"reward": 1.7187500596046448,
"reward_std": 0.3355311304330826,
"rewards/answer_reward_func": 0.78125,
"rewards/format_reward_func": 0.9375000298023224,
"step": 270
},
{
"completion_length": 203.56101989746094,
"epoch": 136.0,
"grad_norm": 1.0538807147223688,
"kl": 0.0118408203125,
"learning_rate": 1.7893859284058378e-07,
"loss": 0.0,
"reward": 1.8416666388511658,
"reward_std": 0.21030117571353912,
"rewards/answer_reward_func": 0.8625000715255737,
"rewards/format_reward_func": 0.9791666865348816,
"step": 272
},
{
"completion_length": 254.30953979492188,
"epoch": 137.0,
"grad_norm": 0.9659537232220188,
"kl": 0.011077880859375,
"learning_rate": 1.7549195509265407e-07,
"loss": 0.0,
"reward": 1.6791666746139526,
"reward_std": 0.5740651041269302,
"rewards/answer_reward_func": 0.7937500178813934,
"rewards/format_reward_func": 0.8854166865348816,
"step": 274
},
{
"completion_length": 199.11905670166016,
"epoch": 138.0,
"grad_norm": 1.0523104695752745,
"kl": 0.012451171875,
"learning_rate": 1.7206079062562536e-07,
"loss": 0.0,
"reward": 1.6354166865348816,
"reward_std": 0.45953696966171265,
"rewards/answer_reward_func": 0.7187500298023224,
"rewards/format_reward_func": 0.9166666865348816,
"step": 276
},
{
"completion_length": 253.00596618652344,
"epoch": 139.0,
"grad_norm": 1.2749421525205042,
"kl": 0.01361083984375,
"learning_rate": 1.6864581199841226e-07,
"loss": 0.0,
"reward": 1.5812500715255737,
"reward_std": 0.4656961262226105,
"rewards/answer_reward_func": 0.6749999821186066,
"rewards/format_reward_func": 0.9062500298023224,
"step": 278
},
{
"completion_length": 210.05209350585938,
"epoch": 140.0,
"grad_norm": 0.9197691486950679,
"kl": 0.01361083984375,
"learning_rate": 1.6524772840857388e-07,
"loss": 0.0,
"reward": 1.7916668057441711,
"reward_std": 0.28819574415683746,
"rewards/answer_reward_func": 0.8333334028720856,
"rewards/format_reward_func": 0.9583333432674408,
"step": 280
},
{
"completion_length": 225.55357360839844,
"epoch": 141.0,
"grad_norm": 1.0481444334257914,
"kl": 0.0125732421875,
"learning_rate": 1.6186724554503237e-07,
"loss": 0.0,
"reward": 1.5562500953674316,
"reward_std": 0.58814936876297,
"rewards/answer_reward_func": 0.6916666924953461,
"rewards/format_reward_func": 0.8645833432674408,
"step": 282
},
{
"completion_length": 220.72024536132812,
"epoch": 142.0,
"grad_norm": 0.8606270979580886,
"kl": 0.012176513671875,
"learning_rate": 1.5850506544152103e-07,
"loss": 0.0,
"reward": 1.7083333134651184,
"reward_std": 0.4496448040008545,
"rewards/answer_reward_func": 0.7916666865348816,
"rewards/format_reward_func": 0.9166666865348816,
"step": 284
},
{
"completion_length": 243.38245391845703,
"epoch": 143.0,
"grad_norm": 1.0698695536011837,
"kl": 0.0106201171875,
"learning_rate": 1.5516188633079107e-07,
"loss": 0.0,
"reward": 1.6020833253860474,
"reward_std": 0.4612347036600113,
"rewards/answer_reward_func": 0.6958333849906921,
"rewards/format_reward_func": 0.9062500298023224,
"step": 286
},
{
"completion_length": 227.4419708251953,
"epoch": 144.0,
"grad_norm": 0.7884961138356886,
"kl": 0.011383056640625,
"learning_rate": 1.5183840249960784e-07,
"loss": 0.0,
"reward": 1.6104166507720947,
"reward_std": 0.4611455947160721,
"rewards/answer_reward_func": 0.7145833671092987,
"rewards/format_reward_func": 0.8958333730697632,
"step": 288
},
{
"completion_length": 231.55506896972656,
"epoch": 145.0,
"grad_norm": 0.8361046861951098,
"kl": 0.011993408203125,
"learning_rate": 1.4853530414456612e-07,
"loss": 0.0,
"reward": 1.6041667461395264,
"reward_std": 0.4646473228931427,
"rewards/answer_reward_func": 0.697916716337204,
"rewards/format_reward_func": 0.9062500298023224,
"step": 290
},
{
"completion_length": 228.49405670166016,
"epoch": 146.0,
"grad_norm": 0.8068563262758177,
"kl": 0.0104217529296875,
"learning_rate": 1.4525327722875568e-07,
"loss": 0.0,
"reward": 1.683333396911621,
"reward_std": 0.47417452931404114,
"rewards/answer_reward_func": 0.7770833373069763,
"rewards/format_reward_func": 0.9062500298023224,
"step": 292
},
{
"completion_length": 194.96875762939453,
"epoch": 147.0,
"grad_norm": 1.1213564709454569,
"kl": 0.013824462890625,
"learning_rate": 1.4199300333930515e-07,
"loss": 0.0,
"reward": 1.7062500715255737,
"reward_std": 0.3444534093141556,
"rewards/answer_reward_func": 0.7687500417232513,
"rewards/format_reward_func": 0.9375000298023224,
"step": 294
},
{
"completion_length": 244.4166717529297,
"epoch": 148.0,
"grad_norm": 0.7888089583197829,
"kl": 0.01177978515625,
"learning_rate": 1.3875515954583523e-07,
"loss": 0.0,
"reward": 1.756250023841858,
"reward_std": 0.3626406341791153,
"rewards/answer_reward_func": 0.8083333671092987,
"rewards/format_reward_func": 0.9479166865348816,
"step": 296
},
{
"completion_length": 199.47173309326172,
"epoch": 149.0,
"grad_norm": 1.5123863638798163,
"kl": 0.01519775390625,
"learning_rate": 1.3554041825985e-07,
"loss": 0.0,
"reward": 1.7041667103767395,
"reward_std": 0.3532189428806305,
"rewards/answer_reward_func": 0.7666667103767395,
"rewards/format_reward_func": 0.9375,
"step": 298
},
{
"completion_length": 239.6413803100586,
"epoch": 150.0,
"grad_norm": 0.9743053302752174,
"kl": 0.011260986328125,
"learning_rate": 1.323494470950949e-07,
"loss": 0.0,
"reward": 1.67083340883255,
"reward_std": 0.43112409114837646,
"rewards/answer_reward_func": 0.7541666924953461,
"rewards/format_reward_func": 0.9166666865348816,
"step": 300
},
{
"completion_length": 264.875,
"epoch": 151.0,
"grad_norm": 1.084558861013316,
"kl": 0.010162353515625,
"learning_rate": 1.2918290872891236e-07,
"loss": 0.0,
"reward": 1.5729166865348816,
"reward_std": 0.3881361186504364,
"rewards/answer_reward_func": 0.6458333432674408,
"rewards/format_reward_func": 0.9270833730697632,
"step": 302
},
{
"completion_length": 166.75893020629883,
"epoch": 152.0,
"grad_norm": 0.9178971871400314,
"kl": 0.013824462890625,
"learning_rate": 1.260414607646213e-07,
"loss": 0.0,
"reward": 1.6708332896232605,
"reward_std": 0.48874886333942413,
"rewards/answer_reward_func": 0.7645833492279053,
"rewards/format_reward_func": 0.90625,
"step": 304
},
{
"completion_length": 229.1294708251953,
"epoch": 153.0,
"grad_norm": 0.9051272338707157,
"kl": 0.012451171875,
"learning_rate": 1.2292575559495143e-07,
"loss": 0.0,
"reward": 1.4937500357627869,
"reward_std": 0.5545496791601181,
"rewards/answer_reward_func": 0.6291666924953461,
"rewards/format_reward_func": 0.8645833432674408,
"step": 306
},
{
"completion_length": 205.56548309326172,
"epoch": 154.0,
"grad_norm": 1.1418844839154754,
"kl": 0.012786865234375,
"learning_rate": 1.1983644026655835e-07,
"loss": 0.0,
"reward": 1.5958333611488342,
"reward_std": 0.39635278284549713,
"rewards/answer_reward_func": 0.6687500476837158,
"rewards/format_reward_func": 0.9270833432674408,
"step": 308
},
{
"completion_length": 259.0446472167969,
"epoch": 155.0,
"grad_norm": 0.9270214488759818,
"kl": 0.01220703125,
"learning_rate": 1.1677415634565066e-07,
"loss": 0.0,
"reward": 1.6541667580604553,
"reward_std": 0.5274697840213776,
"rewards/answer_reward_func": 0.768750011920929,
"rewards/format_reward_func": 0.8854166865348816,
"step": 310
},
{
"completion_length": 187.2574462890625,
"epoch": 156.0,
"grad_norm": 1.406411477220287,
"kl": 0.015869140625,
"learning_rate": 1.1373953978475353e-07,
"loss": 0.0,
"reward": 1.6708332896232605,
"reward_std": 0.4821990430355072,
"rewards/answer_reward_func": 0.7645833790302277,
"rewards/format_reward_func": 0.9062500298023224,
"step": 312
},
{
"completion_length": 276.9166717529297,
"epoch": 157.0,
"grad_norm": 0.7990683879825865,
"kl": 0.01318359375,
"learning_rate": 1.1073322079063913e-07,
"loss": 0.0,
"reward": 1.620833396911621,
"reward_std": 0.5315083116292953,
"rewards/answer_reward_func": 0.7458333373069763,
"rewards/format_reward_func": 0.875,
"step": 314
},
{
"completion_length": 257.94495391845703,
"epoch": 158.0,
"grad_norm": 1.1464111626599285,
"kl": 0.0115966796875,
"learning_rate": 1.0775582369344946e-07,
"loss": 0.0,
"reward": 1.5750000476837158,
"reward_std": 0.4518410414457321,
"rewards/answer_reward_func": 0.679166704416275,
"rewards/format_reward_func": 0.8958333432674408,
"step": 316
},
{
"completion_length": 197.84375,
"epoch": 159.0,
"grad_norm": 0.9180762035814116,
"kl": 0.011810302734375,
"learning_rate": 1.0480796681704077e-07,
"loss": 0.0,
"reward": 1.6875,
"reward_std": 0.5340849161148071,
"rewards/answer_reward_func": 0.7916666567325592,
"rewards/format_reward_func": 0.8958333730697632,
"step": 318
},
{
"completion_length": 214.01339721679688,
"epoch": 160.0,
"grad_norm": 0.9726223680985641,
"kl": 0.012664794921875,
"learning_rate": 1.018902623505741e-07,
"loss": 0.0,
"reward": 1.7437500953674316,
"reward_std": 0.38085436820983887,
"rewards/answer_reward_func": 0.7958333790302277,
"rewards/format_reward_func": 0.9479166865348816,
"step": 320
},
{
"completion_length": 238.88690948486328,
"epoch": 161.0,
"grad_norm": 1.2613531043331283,
"kl": 0.013427734375,
"learning_rate": 9.900331622138063e-08,
"loss": 0.0,
"reward": 1.6770833730697632,
"reward_std": 0.40811336040496826,
"rewards/answer_reward_func": 0.7395834028720856,
"rewards/format_reward_func": 0.9375000298023224,
"step": 322
},
{
"completion_length": 246.1726303100586,
"epoch": 162.0,
"grad_norm": 0.9813919388988234,
"kl": 0.01416015625,
"learning_rate": 9.614772796912681e-08,
"loss": 0.0,
"reward": 1.6854166388511658,
"reward_std": 0.3813091516494751,
"rewards/answer_reward_func": 0.737500011920929,
"rewards/format_reward_func": 0.9479166865348816,
"step": 324
},
{
"completion_length": 279.3869094848633,
"epoch": 163.0,
"grad_norm": 0.9383720906424019,
"kl": 0.01165771484375,
"learning_rate": 9.332409062130686e-08,
"loss": 0.0,
"reward": 1.6874999403953552,
"reward_std": 0.3446161448955536,
"rewards/answer_reward_func": 0.7395833432674408,
"rewards/format_reward_func": 0.9479166865348816,
"step": 326
},
{
"completion_length": 225.8199462890625,
"epoch": 164.0,
"grad_norm": 1.2329291257068657,
"kl": 0.013763427734375,
"learning_rate": 9.053299057008699e-08,
"loss": 0.0,
"reward": 1.6145833730697632,
"reward_std": 0.4202453941106796,
"rewards/answer_reward_func": 0.7187500298023224,
"rewards/format_reward_func": 0.8958333730697632,
"step": 328
},
{
"completion_length": 235.64584350585938,
"epoch": 165.0,
"grad_norm": 1.0271287801634421,
"kl": 0.012237548828125,
"learning_rate": 8.777500745052743e-08,
"loss": 0.0,
"reward": 1.7166666984558105,
"reward_std": 0.43931248784065247,
"rewards/answer_reward_func": 0.8000000417232513,
"rewards/format_reward_func": 0.9166666865348816,
"step": 330
},
{
"completion_length": 232.68006896972656,
"epoch": 166.0,
"grad_norm": 0.915865026213559,
"kl": 0.012603759765625,
"learning_rate": 8.505071402020892e-08,
"loss": 0.0,
"reward": 1.6874999403953552,
"reward_std": 0.5371277630329132,
"rewards/answer_reward_func": 0.7916666865348816,
"rewards/format_reward_func": 0.8958333432674408,
"step": 332
},
{
"completion_length": 217.71131896972656,
"epoch": 167.0,
"grad_norm": 0.9120790002082919,
"kl": 0.013275146484375,
"learning_rate": 8.236067604028562e-08,
"loss": 0.0,
"reward": 1.675000011920929,
"reward_std": 0.563975989818573,
"rewards/answer_reward_func": 0.7895833849906921,
"rewards/format_reward_func": 0.8854166865348816,
"step": 334
},
{
"completion_length": 196.93899536132812,
"epoch": 168.0,
"grad_norm": 1.1307071803466724,
"kl": 0.015533447265625,
"learning_rate": 7.970545215799327e-08,
"loss": 0.0,
"reward": 1.6979166865348816,
"reward_std": 0.45206770300865173,
"rewards/answer_reward_func": 0.7812500596046448,
"rewards/format_reward_func": 0.9166666865348816,
"step": 336
},
{
"completion_length": 211.1324462890625,
"epoch": 169.0,
"grad_norm": 1.0318110502343532,
"kl": 0.014373779296875,
"learning_rate": 7.708559379063204e-08,
"loss": 0.0,
"reward": 1.6083333492279053,
"reward_std": 0.4994523823261261,
"rewards/answer_reward_func": 0.7229166924953461,
"rewards/format_reward_func": 0.8854166865348816,
"step": 338
},
{
"completion_length": 249.483642578125,
"epoch": 170.0,
"grad_norm": 1.0228436471160482,
"kl": 0.01397705078125,
"learning_rate": 7.45016450110534e-08,
"loss": 0.0,
"reward": 1.725000023841858,
"reward_std": 0.4635240435600281,
"rewards/answer_reward_func": 0.8083333671092987,
"rewards/format_reward_func": 0.9166666865348816,
"step": 340
},
{
"completion_length": 181.33928680419922,
"epoch": 171.0,
"grad_norm": 0.9237959973304003,
"kl": 0.013946533203125,
"learning_rate": 7.195414243467029e-08,
"loss": 0.0,
"reward": 1.6895833015441895,
"reward_std": 0.5353528708219528,
"rewards/answer_reward_func": 0.7937500476837158,
"rewards/format_reward_func": 0.8958333432674408,
"step": 342
},
{
"completion_length": 197.4136962890625,
"epoch": 172.0,
"grad_norm": 1.0940015327651587,
"kl": 0.013580322265625,
"learning_rate": 6.944361510801763e-08,
"loss": 0.0,
"reward": 1.8708333373069763,
"reward_std": 0.22116978466510773,
"rewards/answer_reward_func": 0.8916666805744171,
"rewards/format_reward_func": 0.9791666865348816,
"step": 344
},
{
"completion_length": 270.08929443359375,
"epoch": 173.0,
"grad_norm": 0.8732317923979456,
"kl": 0.0106201171875,
"learning_rate": 6.697058439888283e-08,
"loss": 0.0,
"reward": 1.6125000715255737,
"reward_std": 0.46661631762981415,
"rewards/answer_reward_func": 0.6958333253860474,
"rewards/format_reward_func": 0.9166666865348816,
"step": 346
},
{
"completion_length": 203.64435577392578,
"epoch": 174.0,
"grad_norm": 1.0885104682621713,
"kl": 0.0135498046875,
"learning_rate": 6.453556388803288e-08,
"loss": 0.0,
"reward": 1.600000023841858,
"reward_std": 0.5343074351549149,
"rewards/answer_reward_func": 0.7041667103767395,
"rewards/format_reward_func": 0.8958333432674408,
"step": 348
},
{
"completion_length": 211.68155670166016,
"epoch": 175.0,
"grad_norm": 1.4782213044122223,
"kl": 0.01861572265625,
"learning_rate": 6.213905926255697e-08,
"loss": 0.0,
"reward": 1.7291668057441711,
"reward_std": 0.3893900662660599,
"rewards/answer_reward_func": 0.7916666865348816,
"rewards/format_reward_func": 0.9375,
"step": 350
},
{
"completion_length": 156.66071701049805,
"epoch": 176.0,
"grad_norm": 0.9393862503631087,
"kl": 0.013519287109375,
"learning_rate": 5.978156821084987e-08,
"loss": 0.0,
"reward": 1.5104167461395264,
"reward_std": 0.479897677898407,
"rewards/answer_reward_func": 0.6145833432674408,
"rewards/format_reward_func": 0.8958333432674408,
"step": 352
},
{
"completion_length": 199.02828216552734,
"epoch": 177.0,
"grad_norm": 1.3069044202145068,
"kl": 0.01422119140625,
"learning_rate": 5.7463580319254853e-08,
"loss": 0.0,
"reward": 1.6895833015441895,
"reward_std": 0.38866522908210754,
"rewards/answer_reward_func": 0.762499988079071,
"rewards/format_reward_func": 0.9270833730697632,
"step": 354
},
{
"completion_length": 189.39286041259766,
"epoch": 178.0,
"grad_norm": 1.0161007756312446,
"kl": 0.015289306640625,
"learning_rate": 5.518557697039081e-08,
"loss": 0.0,
"reward": 1.706250011920929,
"reward_std": 0.34542496502399445,
"rewards/answer_reward_func": 0.7583333849906921,
"rewards/format_reward_func": 0.9479166865348816,
"step": 356
},
{
"completion_length": 224.97024536132812,
"epoch": 179.0,
"grad_norm": 1.311399818971272,
"kl": 0.01507568359375,
"learning_rate": 5.294803124318145e-08,
"loss": 0.0,
"reward": 1.6791666746139526,
"reward_std": 0.4293544441461563,
"rewards/answer_reward_func": 0.7520833015441895,
"rewards/format_reward_func": 0.9270833432674408,
"step": 358
},
{
"completion_length": 222.889892578125,
"epoch": 180.0,
"grad_norm": 1.1475919639221568,
"kl": 0.013427734375,
"learning_rate": 5.07514078146106e-08,
"loss": 0.0,
"reward": 1.5791667699813843,
"reward_std": 0.47242018580436707,
"rewards/answer_reward_func": 0.6833333373069763,
"rewards/format_reward_func": 0.8958333730697632,
"step": 360
},
{
"completion_length": 178.65625,
"epoch": 181.0,
"grad_norm": 1.0645169958717802,
"kl": 0.01220703125,
"learning_rate": 4.859616286322094e-08,
"loss": 0.0,
"reward": 1.7187500596046448,
"reward_std": 0.5216688811779022,
"rewards/answer_reward_func": 0.8020833432674408,
"rewards/format_reward_func": 0.9166666865348816,
"step": 362
},
{
"completion_length": 215.66667938232422,
"epoch": 182.0,
"grad_norm": 0.7634779125291854,
"kl": 0.01507568359375,
"learning_rate": 4.648274397437829e-08,
"loss": 0.0,
"reward": 1.6770833730697632,
"reward_std": 0.45663949847221375,
"rewards/answer_reward_func": 0.75,
"rewards/format_reward_func": 0.9270833730697632,
"step": 364
},
{
"completion_length": 255.3482208251953,
"epoch": 183.0,
"grad_norm": 1.1451895327728272,
"kl": 0.01422119140625,
"learning_rate": 4.4411590047320617e-08,
"loss": 0.0,
"reward": 1.7062499523162842,
"reward_std": 0.3319532126188278,
"rewards/answer_reward_func": 0.7583333849906921,
"rewards/format_reward_func": 0.9479166865348816,
"step": 366
},
{
"completion_length": 175.7872085571289,
"epoch": 184.0,
"grad_norm": 1.138275541917393,
"kl": 0.01885986328125,
"learning_rate": 4.2383131204010494e-08,
"loss": 0.0,
"reward": 1.5458333492279053,
"reward_std": 0.5794899761676788,
"rewards/answer_reward_func": 0.6812499761581421,
"rewards/format_reward_func": 0.8645833730697632,
"step": 368
},
{
"completion_length": 220.264892578125,
"epoch": 185.0,
"grad_norm": 1.2941468258226874,
"kl": 0.01434326171875,
"learning_rate": 4.039778869981064e-08,
"loss": 0.0,
"reward": 1.6541666388511658,
"reward_std": 0.3757530450820923,
"rewards/answer_reward_func": 0.7062500417232513,
"rewards/format_reward_func": 0.9479166865348816,
"step": 370
},
{
"completion_length": 180.15625762939453,
"epoch": 186.0,
"grad_norm": 1.1191534930075977,
"kl": 0.014617919921875,
"learning_rate": 3.845597483600049e-08,
"loss": 0.0,
"reward": 1.6958333849906921,
"reward_std": 0.42705224454402924,
"rewards/answer_reward_func": 0.768750011920929,
"rewards/format_reward_func": 0.9270833432674408,
"step": 372
},
{
"completion_length": 193.14732360839844,
"epoch": 187.0,
"grad_norm": 1.5659699092387518,
"kl": 0.015594482421875,
"learning_rate": 3.655809287415284e-08,
"loss": 0.0,
"reward": 1.6958333253860474,
"reward_std": 0.3121452108025551,
"rewards/answer_reward_func": 0.7375000417232513,
"rewards/format_reward_func": 0.9583333432674408,
"step": 374
},
{
"completion_length": 175.52828216552734,
"epoch": 188.0,
"grad_norm": 1.0098836488478027,
"kl": 0.0152587890625,
"learning_rate": 3.4704536952387285e-08,
"loss": 0.0,
"reward": 1.754166603088379,
"reward_std": 0.35266736149787903,
"rewards/answer_reward_func": 0.8062500059604645,
"rewards/format_reward_func": 0.9479166865348816,
"step": 376
},
{
"completion_length": 264.1756134033203,
"epoch": 189.0,
"grad_norm": 0.9574953446250056,
"kl": 0.0113525390625,
"learning_rate": 3.2895692003518575e-08,
"loss": 0.0,
"reward": 1.7229167222976685,
"reward_std": 0.3303990066051483,
"rewards/answer_reward_func": 0.7750000357627869,
"rewards/format_reward_func": 0.9479166865348816,
"step": 378
},
{
"completion_length": 193.55208587646484,
"epoch": 190.0,
"grad_norm": 1.238270158742224,
"kl": 0.01446533203125,
"learning_rate": 3.113193367511635e-08,
"loss": 0.0,
"reward": 1.6812500357627869,
"reward_std": 0.41038842499256134,
"rewards/answer_reward_func": 0.7541666924953461,
"rewards/format_reward_func": 0.9270833432674408,
"step": 380
},
{
"completion_length": 200.19792938232422,
"epoch": 191.0,
"grad_norm": 0.9743369650595539,
"kl": 0.0125732421875,
"learning_rate": 2.9413628251493934e-08,
"loss": 0.0,
"reward": 1.7062499523162842,
"reward_std": 0.4908797889947891,
"rewards/answer_reward_func": 0.8000000715255737,
"rewards/format_reward_func": 0.90625,
"step": 382
},
{
"completion_length": 170.07440948486328,
"epoch": 192.0,
"grad_norm": 1.3063146962963188,
"kl": 0.013519287109375,
"learning_rate": 2.774113257764066e-08,
"loss": 0.0,
"reward": 1.7333333492279053,
"reward_std": 0.3345176726579666,
"rewards/answer_reward_func": 0.7958333790302277,
"rewards/format_reward_func": 0.9375000298023224,
"step": 384
},
{
"completion_length": 247.58482360839844,
"epoch": 193.0,
"grad_norm": 0.9984056006045743,
"kl": 0.0120849609375,
"learning_rate": 2.611479398511518e-08,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.3646298050880432,
"rewards/answer_reward_func": 0.7708333730697632,
"rewards/format_reward_func": 0.9375,
"step": 386
},
{
"completion_length": 238.2574462890625,
"epoch": 194.0,
"grad_norm": 1.5444205983532193,
"kl": 0.013885498046875,
"learning_rate": 2.4534950219914057e-08,
"loss": 0.0,
"reward": 1.7312500476837158,
"reward_std": 0.38924433290958405,
"rewards/answer_reward_func": 0.8041666448116302,
"rewards/format_reward_func": 0.9270833432674408,
"step": 388
},
{
"completion_length": 226.72173309326172,
"epoch": 195.0,
"grad_norm": 0.8197579373143684,
"kl": 0.009979248046875,
"learning_rate": 2.300192937233128e-08,
"loss": 0.0,
"reward": 1.6791666746139526,
"reward_std": 0.4052896499633789,
"rewards/answer_reward_func": 0.7416667342185974,
"rewards/format_reward_func": 0.9375,
"step": 390
},
{
"completion_length": 232.91964721679688,
"epoch": 196.0,
"grad_norm": 1.0006804609593394,
"kl": 0.013702392578125,
"learning_rate": 2.1516049808822935e-08,
"loss": 0.0,
"reward": 1.6479166746139526,
"reward_std": 0.5231119990348816,
"rewards/answer_reward_func": 0.7520833611488342,
"rewards/format_reward_func": 0.8958333432674408,
"step": 392
},
{
"completion_length": 191.55803680419922,
"epoch": 197.0,
"grad_norm": 1.1112988487037212,
"kl": 0.011810302734375,
"learning_rate": 2.007762010589098e-08,
"loss": 0.0,
"reward": 1.6854167580604553,
"reward_std": 0.4321441501379013,
"rewards/answer_reward_func": 0.7583333551883698,
"rewards/format_reward_func": 0.9270833432674408,
"step": 394
},
{
"completion_length": 233.3482208251953,
"epoch": 198.0,
"grad_norm": 1.244460904016601,
"kl": 0.013214111328125,
"learning_rate": 1.8686938986000627e-08,
"loss": 0.0,
"reward": 1.6875000596046448,
"reward_std": 0.4832369536161423,
"rewards/answer_reward_func": 0.7604166567325592,
"rewards/format_reward_func": 0.9270833730697632,
"step": 396
},
{
"completion_length": 224.22471618652344,
"epoch": 199.0,
"grad_norm": 1.1490919776212416,
"kl": 0.0167236328125,
"learning_rate": 1.734429525554365e-08,
"loss": 0.0,
"reward": 1.5958333611488342,
"reward_std": 0.553360641002655,
"rewards/answer_reward_func": 0.7104166746139526,
"rewards/format_reward_func": 0.8854166865348816,
"step": 398
},
{
"completion_length": 255.5982208251953,
"epoch": 200.0,
"grad_norm": 0.9875662145593327,
"kl": 0.01202392578125,
"learning_rate": 1.604996774486145e-08,
"loss": 0.0,
"reward": 1.7104166746139526,
"reward_std": 0.44989022612571716,
"rewards/answer_reward_func": 0.793749988079071,
"rewards/format_reward_func": 0.9166666865348816,
"step": 400
},
{
"completion_length": 188.99405670166016,
"epoch": 201.0,
"grad_norm": 0.9195215006245523,
"kl": 0.012725830078125,
"learning_rate": 1.4804225250339281e-08,
"loss": 0.0,
"reward": 1.8250000476837158,
"reward_std": 0.39560186117887497,
"rewards/answer_reward_func": 0.8875000476837158,
"rewards/format_reward_func": 0.9375000298023224,
"step": 402
},
{
"completion_length": 187.25298309326172,
"epoch": 202.0,
"grad_norm": 1.6727983005324356,
"kl": 0.01751708984375,
"learning_rate": 1.360732647858498e-08,
"loss": 0.0,
"reward": 1.7270833849906921,
"reward_std": 0.4138026833534241,
"rewards/answer_reward_func": 0.7895833551883698,
"rewards/format_reward_func": 0.9375000298023224,
"step": 404
},
{
"completion_length": 192.87798309326172,
"epoch": 203.0,
"grad_norm": 1.083839122653639,
"kl": 0.014892578125,
"learning_rate": 1.2459519992702311e-08,
"loss": 0.0,
"reward": 1.7666667699813843,
"reward_std": 0.30420470237731934,
"rewards/answer_reward_func": 0.8083333373069763,
"rewards/format_reward_func": 0.9583333432674408,
"step": 406
},
{
"completion_length": 242.02084350585938,
"epoch": 204.0,
"grad_norm": 1.1194918345284006,
"kl": 0.01318359375,
"learning_rate": 1.1361044160671629e-08,
"loss": 0.0,
"reward": 1.6937499642372131,
"reward_std": 0.3781377822160721,
"rewards/answer_reward_func": 0.7666666805744171,
"rewards/format_reward_func": 0.9270833432674408,
"step": 408
},
{
"completion_length": 215.93006896972656,
"epoch": 205.0,
"grad_norm": 1.1017859873234446,
"kl": 0.011810302734375,
"learning_rate": 1.0312127105846947e-08,
"loss": 0.0,
"reward": 1.693750023841858,
"reward_std": 0.48744727671146393,
"rewards/answer_reward_func": 0.7770833671092987,
"rewards/format_reward_func": 0.9166666865348816,
"step": 410
},
{
"completion_length": 186.28423309326172,
"epoch": 206.0,
"grad_norm": 1.0624614282595348,
"kl": 0.01776123046875,
"learning_rate": 9.312986659581301e-09,
"loss": 0.0,
"reward": 1.5895833373069763,
"reward_std": 0.45519132912158966,
"rewards/answer_reward_func": 0.6833333373069763,
"rewards/format_reward_func": 0.9062500298023224,
"step": 412
},
{
"completion_length": 233.00894165039062,
"epoch": 207.0,
"grad_norm": 0.8851564318058255,
"kl": 0.012847900390625,
"learning_rate": 8.363830315988945e-09,
"loss": 0.0,
"reward": 1.6791667342185974,
"reward_std": 0.4650019705295563,
"rewards/answer_reward_func": 0.772916704416275,
"rewards/format_reward_func": 0.9062500298023224,
"step": 414
},
{
"completion_length": 194.01190948486328,
"epoch": 208.0,
"grad_norm": 1.155127800385124,
"kl": 0.0128173828125,
"learning_rate": 7.46485518885462e-09,
"loss": 0.0,
"reward": 1.6687501072883606,
"reward_std": 0.34395796060562134,
"rewards/answer_reward_func": 0.7208333611488342,
"rewards/format_reward_func": 0.9479166865348816,
"step": 416
},
{
"completion_length": 169.8482208251953,
"epoch": 209.0,
"grad_norm": 1.1199426121122114,
"kl": 0.017425537109375,
"learning_rate": 6.616247970698319e-09,
"loss": 0.0,
"reward": 1.693750023841858,
"reward_std": 0.3943335711956024,
"rewards/answer_reward_func": 0.7562500536441803,
"rewards/format_reward_func": 0.9375,
"step": 418
},
{
"completion_length": 243.14732360839844,
"epoch": 210.0,
"grad_norm": 0.8877069696048447,
"kl": 0.01654052734375,
"learning_rate": 5.8181848940044855e-09,
"loss": 0.0,
"reward": 1.6104167699813843,
"reward_std": 0.4525974839925766,
"rewards/answer_reward_func": 0.7041666805744171,
"rewards/format_reward_func": 0.9062500298023224,
"step": 420
},
{
"completion_length": 289.8839416503906,
"epoch": 211.0,
"grad_norm": 0.8870179616904256,
"kl": 0.011932373046875,
"learning_rate": 5.070831694623135e-09,
"loss": 0.0,
"reward": 1.5895834565162659,
"reward_std": 0.5177792310714722,
"rewards/answer_reward_func": 0.6937500536441803,
"rewards/format_reward_func": 0.8958333432674408,
"step": 422
},
{
"completion_length": 151.6934585571289,
"epoch": 212.0,
"grad_norm": 1.1557356844907873,
"kl": 0.01593017578125,
"learning_rate": 4.374343577351336e-09,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.354248970746994,
"rewards/answer_reward_func": 0.7604166865348816,
"rewards/format_reward_func": 0.9479166865348816,
"step": 424
},
{
"completion_length": 181.48959350585938,
"epoch": 213.0,
"grad_norm": 0.9183490231413828,
"kl": 0.015350341796875,
"learning_rate": 3.7288651837012745e-09,
"loss": 0.0,
"reward": 1.7020833492279053,
"reward_std": 0.3603343367576599,
"rewards/answer_reward_func": 0.7645833790302277,
"rewards/format_reward_func": 0.9375000298023224,
"step": 426
},
{
"completion_length": 178.89137268066406,
"epoch": 214.0,
"grad_norm": 1.2202651910052664,
"kl": 0.0166015625,
"learning_rate": 3.134530561862081e-09,
"loss": 0.0,
"reward": 1.6979166865348816,
"reward_std": 0.45207175612449646,
"rewards/answer_reward_func": 0.78125,
"rewards/format_reward_func": 0.9166666865348816,
"step": 428
},
{
"completion_length": 256.625,
"epoch": 215.0,
"grad_norm": 0.9481831734670307,
"kl": 0.013458251953125,
"learning_rate": 2.5914631388619103e-09,
"loss": 0.0,
"reward": 1.7041667103767395,
"reward_std": 0.3960050940513611,
"rewards/answer_reward_func": 0.7666667103767395,
"rewards/format_reward_func": 0.9375000298023224,
"step": 430
},
{
"completion_length": 250.11905670166016,
"epoch": 216.0,
"grad_norm": 1.037124203173195,
"kl": 0.0146484375,
"learning_rate": 2.0997756949353297e-09,
"loss": 0.0,
"reward": 1.7562499642372131,
"reward_std": 0.3770013302564621,
"rewards/answer_reward_func": 0.8187500536441803,
"rewards/format_reward_func": 0.9375000298023224,
"step": 432
},
{
"completion_length": 255.68006896972656,
"epoch": 217.0,
"grad_norm": 1.1918462249746316,
"kl": 0.01416015625,
"learning_rate": 1.6595703401020844e-09,
"loss": 0.0,
"reward": 1.6354167461395264,
"reward_std": 0.4074978083372116,
"rewards/answer_reward_func": 0.7187500298023224,
"rewards/format_reward_func": 0.9166666865348816,
"step": 434
},
{
"completion_length": 232.20982360839844,
"epoch": 218.0,
"grad_norm": 0.7812593865128343,
"kl": 0.01055908203125,
"learning_rate": 1.2709384929615596e-09,
"loss": 0.0,
"reward": 1.6208333373069763,
"reward_std": 0.4629315435886383,
"rewards/answer_reward_func": 0.7249999940395355,
"rewards/format_reward_func": 0.8958333432674408,
"step": 436
},
{
"completion_length": 174.2559585571289,
"epoch": 219.0,
"grad_norm": 1.1014720216285023,
"kl": 0.01483154296875,
"learning_rate": 9.339608617077165e-10,
"loss": 0.0,
"reward": 1.6354166865348816,
"reward_std": 0.4915326237678528,
"rewards/answer_reward_func": 0.7187499701976776,
"rewards/format_reward_func": 0.9166666865348816,
"step": 438
},
{
"completion_length": 246.4464340209961,
"epoch": 220.0,
"grad_norm": 1.1926753556498022,
"kl": 0.014129638671875,
"learning_rate": 6.487074273681114e-10,
"loss": 0.0,
"reward": 1.67083340883255,
"reward_std": 0.32807309925556183,
"rewards/answer_reward_func": 0.7437499761581421,
"rewards/format_reward_func": 0.9270833432674408,
"step": 440
},
{
"completion_length": 166.12649536132812,
"epoch": 221.0,
"grad_norm": 1.2191019490862662,
"kl": 0.016021728515625,
"learning_rate": 4.152374292708538e-10,
"loss": 0.0,
"reward": 1.6312501430511475,
"reward_std": 0.5295368134975433,
"rewards/answer_reward_func": 0.7354166805744171,
"rewards/format_reward_func": 0.8958333432674408,
"step": 442
},
{
"completion_length": 206.5401840209961,
"epoch": 222.0,
"grad_norm": 1.1066549322199848,
"kl": 0.01348876953125,
"learning_rate": 2.3359935274214204e-10,
"loss": 0.0,
"reward": 1.7000001072883606,
"reward_std": 0.4501575529575348,
"rewards/answer_reward_func": 0.7833333611488342,
"rewards/format_reward_func": 0.9166666865348816,
"step": 444
},
{
"completion_length": 198.20089721679688,
"epoch": 223.0,
"grad_norm": 1.126642799573971,
"kl": 0.01629638671875,
"learning_rate": 1.0383091903720665e-10,
"loss": 0.0,
"reward": 1.7145833373069763,
"reward_std": 0.368631511926651,
"rewards/answer_reward_func": 0.7666666507720947,
"rewards/format_reward_func": 0.9479166865348816,
"step": 446
},
{
"completion_length": 190.87203216552734,
"epoch": 224.0,
"grad_norm": 0.8176781104321668,
"kl": 0.016876220703125,
"learning_rate": 2.595907750671533e-11,
"loss": 0.0,
"reward": 1.7895833849906921,
"reward_std": 0.31373436748981476,
"rewards/answer_reward_func": 0.831250011920929,
"rewards/format_reward_func": 0.9583333730697632,
"step": 448
},
{
"completion_length": 223.83929443359375,
"epoch": 225.0,
"grad_norm": 0.8752615103675867,
"kl": 0.0125732421875,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 1.6062501072883606,
"reward_std": 0.4855257570743561,
"rewards/answer_reward_func": 0.7208333611488342,
"rewards/format_reward_func": 0.8854166865348816,
"step": 450
},
{
"epoch": 225.0,
"step": 450,
"total_flos": 0.0,
"train_loss": 7.9592482841109e-06,
"train_runtime": 6112.3984,
"train_samples_per_second": 3.534,
"train_steps_per_second": 0.074
}
],
"logging_steps": 2,
"max_steps": 450,
"num_input_tokens_seen": 0,
"num_train_epochs": 225,
"save_steps": 32,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}