{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0002, "grad_norm": 0.5344884991645813, "kl": 0.0005174986290512607, "learning_rate": 0.0, "loss": 0.0001, "num_tokens": 6480.0, "reward": 0.84515380859375, "reward_std": 0.014680828899145126, "rewards//mean": 0.84515380859375, "rewards//std": 0.027360284700989723, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0004, "grad_norm": 0.5381270051002502, "kl": 0.0006077195721445605, "learning_rate": 1.0000000000000001e-07, "loss": 0.0001, "num_tokens": 12960.0, "reward": 0.815673828125, "reward_std": 0.016982976347208023, "rewards//mean": 0.815673828125, "rewards//std": 0.02536199241876602, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0006, "grad_norm": 0.5261220335960388, "kl": 0.000551698001800105, "learning_rate": 2.0000000000000002e-07, "loss": 0.0001, "num_tokens": 19520.0, "reward": 0.81927490234375, "reward_std": 0.010811060667037964, "rewards//mean": 0.81927490234375, "rewards//std": 0.025441773235797882, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0008, "grad_norm": 0.5289216637611389, "kl": 0.0005863762053195387, "learning_rate": 3.0000000000000004e-07, "loss": 0.0001, "num_tokens": 26128.0, "reward": 0.82012939453125, "reward_std": 0.02173474058508873, "rewards//mean": 0.82012939453125, "rewards//std": 0.04268558695912361, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.001, "grad_norm": 0.5722470283508301, "kl": 0.000628954017884098, "learning_rate": 4.0000000000000003e-07, "loss": 0.0001, "num_tokens": 32704.0, "reward": 0.7650146484375, "reward_std": 0.015487657859921455, "rewards//mean": 0.7650146484375, "rewards//std": 0.023481056094169617, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0012, "grad_norm": 0.5234053730964661, "kl": 0.0005751600110670552, "learning_rate": 5.000000000000001e-07, "loss": 0.0001, "num_tokens": 39264.0, "reward": 0.8218994140625, "reward_std": 0.014946818351745605, "rewards//mean": 0.8218994140625, "rewards//std": 0.025350946933031082, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0014, "grad_norm": 0.5195151567459106, "kl": 0.0005390266160247847, "learning_rate": 6.000000000000001e-07, "loss": 0.0001, "num_tokens": 45856.0, "reward": 0.81597900390625, "reward_std": 0.015014132484793663, "rewards//mean": 0.81597900390625, "rewards//std": 0.020927488803863525, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0016, "grad_norm": 0.4819026589393616, "kl": 0.0005648515070788562, "learning_rate": 7.000000000000001e-07, "loss": 0.0001, "num_tokens": 52376.0, "reward": 0.802490234375, "reward_std": 0.009616490453481674, "rewards//mean": 0.802490234375, "rewards//std": 0.021797746419906616, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0018, "grad_norm": 0.49622002243995667, "kl": 0.0005116216343594715, "learning_rate": 8.000000000000001e-07, "loss": 0.0001, "num_tokens": 58928.0, "reward": 0.838623046875, "reward_std": 0.010705415159463882, "rewards//mean": 0.838623046875, "rewards//std": 0.01565403863787651, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.002, "grad_norm": 0.5537323355674744, "kl": 0.0006005062314216048, "learning_rate": 9.000000000000001e-07, "loss": 0.0001, "num_tokens": 65448.0, "reward": 0.8486328125, "reward_std": 0.014971021562814713, "rewards//mean": 0.8486328125, "rewards//std": 0.024861659854650497, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0022, "grad_norm": 0.4782222807407379, "kl": 0.0005192094249650836, "learning_rate": 1.0000000000000002e-06, "loss": 0.0001, "num_tokens": 72144.0, "reward": 0.8231201171875, "reward_std": 0.014505397528409958, "rewards//mean": 0.8231201171875, "rewards//std": 0.023432008922100067, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0024, "grad_norm": 0.5115165710449219, "kl": 0.000583281711442396, "learning_rate": 1.1e-06, "loss": 0.0001, "num_tokens": 78672.0, "reward": 0.7879638671875, "reward_std": 0.014534728601574898, "rewards//mean": 0.7879638671875, "rewards//std": 0.030848022550344467, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0026, "grad_norm": 0.5358304977416992, "kl": 0.0005532236391445622, "learning_rate": 1.2000000000000002e-06, "loss": 0.0001, "num_tokens": 85160.0, "reward": 0.8404541015625, "reward_std": 0.01879037171602249, "rewards//mean": 0.8404541015625, "rewards//std": 0.021690186113119125, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0028, "grad_norm": 0.5308247804641724, "kl": 0.0005572287191171199, "learning_rate": 1.3e-06, "loss": 0.0001, "num_tokens": 91664.0, "reward": 0.81329345703125, "reward_std": 0.011631859466433525, "rewards//mean": 0.81329345703125, "rewards//std": 0.022916875779628754, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.003, "grad_norm": 0.5254385471343994, "kl": 0.0005984247691230848, "learning_rate": 1.4000000000000001e-06, "loss": 0.0001, "num_tokens": 98256.0, "reward": 0.82110595703125, "reward_std": 0.015688244253396988, "rewards//mean": 0.82110595703125, "rewards//std": 0.04342671483755112, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0032, "grad_norm": 0.5159713625907898, "kl": 0.0006229286955203861, "learning_rate": 1.5e-06, "loss": 0.0001, "num_tokens": 104744.0, "reward": 0.78802490234375, "reward_std": 0.015156297013163567, "rewards//mean": 0.78802490234375, "rewards//std": 0.024192556738853455, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0034, "grad_norm": 0.5559095144271851, "kl": 0.0005670633545378223, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "num_tokens": 111272.0, "reward": 0.8494873046875, "reward_std": 0.013256147503852844, "rewards//mean": 0.8494873046875, "rewards//std": 0.02326083369553089, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0036, "grad_norm": 0.5672537684440613, "kl": 0.0005939321563346311, "learning_rate": 1.7000000000000002e-06, "loss": 0.0001, "num_tokens": 117816.0, "reward": 0.854248046875, "reward_std": 0.018103815615177155, "rewards//mean": 0.854248046875, "rewards//std": 0.024517083540558815, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0038, "grad_norm": 0.4956085681915283, "kl": 0.0005725373921450227, "learning_rate": 1.8000000000000001e-06, "loss": 0.0001, "num_tokens": 124328.0, "reward": 0.82659912109375, "reward_std": 0.015440911054611206, "rewards//mean": 0.82659912109375, "rewards//std": 0.02957451343536377, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.004, "grad_norm": 0.5425844788551331, "kl": 0.00057974286028184, "learning_rate": 1.9000000000000002e-06, "loss": 0.0001, "num_tokens": 130768.0, "reward": 0.84521484375, "reward_std": 0.016270218417048454, "rewards//mean": 0.84521484375, "rewards//std": 0.03373591601848602, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0042, "grad_norm": 0.5052315592765808, "kl": 0.0005288290412863716, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "num_tokens": 137264.0, "reward": 0.83184814453125, "reward_std": 0.013612732291221619, "rewards//mean": 0.83184814453125, "rewards//std": 0.028505386784672737, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0044, "grad_norm": 0.6783661842346191, "kl": 0.0005678313900716603, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "num_tokens": 143744.0, "reward": 0.82196044921875, "reward_std": 0.01590130478143692, "rewards//mean": 0.82196044921875, "rewards//std": 0.02282818965613842, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0046, "grad_norm": 0.5427769422531128, "kl": 0.0006147136591607705, "learning_rate": 2.2e-06, "loss": 0.0001, "num_tokens": 150320.0, "reward": 0.83453369140625, "reward_std": 0.013559719547629356, "rewards//mean": 0.83453369140625, "rewards//std": 0.020104041323065758, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0048, "grad_norm": 0.5730366706848145, "kl": 0.0006162305653560907, "learning_rate": 2.3000000000000004e-06, "loss": 0.0001, "num_tokens": 156832.0, "reward": 0.79132080078125, "reward_std": 0.016593754291534424, "rewards//mean": 0.79132080078125, "rewards//std": 0.0344906747341156, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.005, "grad_norm": 0.5671719312667847, "kl": 0.0006081125320633873, "learning_rate": 2.4000000000000003e-06, "loss": 0.0001, "num_tokens": 163296.0, "reward": 0.83221435546875, "reward_std": 0.015442395582795143, "rewards//mean": 0.83221435546875, "rewards//std": 0.032700031995773315, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0052, "grad_norm": 0.5694089531898499, "kl": 0.000564752466743812, "learning_rate": 2.5e-06, "loss": 0.0001, "num_tokens": 169848.0, "reward": 0.8280029296875, "reward_std": 0.013243570923805237, "rewards//mean": 0.8280029296875, "rewards//std": 0.025084422901272774, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0054, "grad_norm": 0.5058358311653137, "kl": 0.0005670115642715245, "learning_rate": 2.6e-06, "loss": 0.0001, "num_tokens": 176272.0, "reward": 0.84521484375, "reward_std": 0.013745347037911415, "rewards//mean": 0.84521484375, "rewards//std": 0.030698729678988457, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0056, "grad_norm": 0.5333991050720215, "kl": 0.0006260298541747034, "learning_rate": 2.7000000000000004e-06, "loss": 0.0001, "num_tokens": 182896.0, "reward": 0.84588623046875, "reward_std": 0.02072429470717907, "rewards//mean": 0.84588623046875, "rewards//std": 0.028569040820002556, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0058, "grad_norm": 0.5217524170875549, "kl": 0.0006107169174356386, "learning_rate": 2.8000000000000003e-06, "loss": 0.0001, "num_tokens": 189472.0, "reward": 0.85723876953125, "reward_std": 0.012601524591445923, "rewards//mean": 0.85723876953125, "rewards//std": 0.025985149666666985, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.006, "grad_norm": 0.5388394594192505, "kl": 0.0006163410871522501, "learning_rate": 2.9e-06, "loss": 0.0001, "num_tokens": 196008.0, "reward": 0.8251953125, "reward_std": 0.01776113733649254, "rewards//mean": 0.8251953125, "rewards//std": 0.04279082641005516, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0062, "grad_norm": 0.5640242099761963, "kl": 0.0006344770517898723, "learning_rate": 3e-06, "loss": 0.0001, "num_tokens": 202440.0, "reward": 0.8143310546875, "reward_std": 0.011227348819375038, "rewards//mean": 0.8143310546875, "rewards//std": 0.032421380281448364, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0064, "grad_norm": 0.530994713306427, "kl": 0.0006286523857852444, "learning_rate": 3.1000000000000004e-06, "loss": 0.0001, "num_tokens": 208936.0, "reward": 0.86181640625, "reward_std": 0.01134884636849165, "rewards//mean": 0.86181640625, "rewards//std": 0.02666206657886505, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0066, "grad_norm": 0.5347341895103455, "kl": 0.0006648319977102801, "learning_rate": 3.2000000000000003e-06, "loss": 0.0001, "num_tokens": 215520.0, "reward": 0.80657958984375, "reward_std": 0.014637555927038193, "rewards//mean": 0.80657958984375, "rewards//std": 0.025391744449734688, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0068, "grad_norm": 0.5251972675323486, "kl": 0.0005861666286364198, "learning_rate": 3.3000000000000006e-06, "loss": 0.0001, "num_tokens": 222032.0, "reward": 0.8531494140625, "reward_std": 0.018488118425011635, "rewards//mean": 0.8531494140625, "rewards//std": 0.035555075854063034, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.007, "grad_norm": 0.5468869209289551, "kl": 0.0006114646239439026, "learning_rate": 3.4000000000000005e-06, "loss": 0.0001, "num_tokens": 228656.0, "reward": 0.81988525390625, "reward_std": 0.014370636083185673, "rewards//mean": 0.81988525390625, "rewards//std": 0.03541446104645729, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0072, "grad_norm": 0.5589239597320557, "kl": 0.0006090118695283309, "learning_rate": 3.5e-06, "loss": 0.0001, "num_tokens": 235144.0, "reward": 0.83056640625, "reward_std": 0.01737598143517971, "rewards//mean": 0.83056640625, "rewards//std": 0.044120125472545624, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0074, "grad_norm": 0.5322961211204529, "kl": 0.0006877124687889591, "learning_rate": 3.6000000000000003e-06, "loss": 0.0001, "num_tokens": 241728.0, "reward": 0.8135986328125, "reward_std": 0.01702769845724106, "rewards//mean": 0.8135986328125, "rewards//std": 0.032633595168590546, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0076, "grad_norm": 0.6947181224822998, "kl": 0.0007269596826517954, "learning_rate": 3.7e-06, "loss": 0.0001, "num_tokens": 248288.0, "reward": 0.8291015625, "reward_std": 0.017653653398156166, "rewards//mean": 0.8291015625, "rewards//std": 0.025770539417862892, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0078, "grad_norm": 0.5198093056678772, "kl": 0.000663173821521923, "learning_rate": 3.8000000000000005e-06, "loss": 0.0001, "num_tokens": 254888.0, "reward": 0.87103271484375, "reward_std": 0.013572394847869873, "rewards//mean": 0.87103271484375, "rewards//std": 0.025012129917740822, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.008, "grad_norm": 0.5395210385322571, "kl": 0.0005934182700002566, "learning_rate": 3.900000000000001e-06, "loss": 0.0001, "num_tokens": 261408.0, "reward": 0.79693603515625, "reward_std": 0.009941511787474155, "rewards//mean": 0.79693603515625, "rewards//std": 0.027546655386686325, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0082, "grad_norm": 0.5234987735748291, "kl": 0.000650742367724888, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "num_tokens": 267944.0, "reward": 0.8211669921875, "reward_std": 0.01747434213757515, "rewards//mean": 0.8211669921875, "rewards//std": 0.02766927145421505, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0084, "grad_norm": 0.5588589310646057, "kl": 0.0006299799279076979, "learning_rate": 4.1e-06, "loss": 0.0001, "num_tokens": 274448.0, "reward": 0.8140869140625, "reward_std": 0.013739854097366333, "rewards//mean": 0.8140869140625, "rewards//std": 0.025465337559580803, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0086, "grad_norm": 0.5930436849594116, "kl": 0.0006357767269946635, "learning_rate": 4.2000000000000004e-06, "loss": 0.0001, "num_tokens": 280936.0, "reward": 0.81170654296875, "reward_std": 0.011782532557845116, "rewards//mean": 0.81170654296875, "rewards//std": 0.0325273722410202, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0088, "grad_norm": 0.5557746291160583, "kl": 0.0007158647204050794, "learning_rate": 4.3e-06, "loss": 0.0001, "num_tokens": 287528.0, "reward": 0.85687255859375, "reward_std": 0.014486387372016907, "rewards//mean": 0.85687255859375, "rewards//std": 0.027903733775019646, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.009, "grad_norm": 0.6580216884613037, "kl": 0.0006931709067430347, "learning_rate": 4.4e-06, "loss": 0.0001, "num_tokens": 294064.0, "reward": 0.86663818359375, "reward_std": 0.014136701822280884, "rewards//mean": 0.86663818359375, "rewards//std": 0.023703305050730705, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0092, "grad_norm": 0.5328194499015808, "kl": 0.0006727315339958295, "learning_rate": 4.5e-06, "loss": 0.0001, "num_tokens": 300624.0, "reward": 0.8414306640625, "reward_std": 0.01816842518746853, "rewards//mean": 0.8414306640625, "rewards//std": 0.03057396598160267, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0094, "grad_norm": 0.5084638595581055, "kl": 0.0007027584506431594, "learning_rate": 4.600000000000001e-06, "loss": 0.0001, "num_tokens": 307144.0, "reward": 0.8560791015625, "reward_std": 0.010555820539593697, "rewards//mean": 0.8560791015625, "rewards//std": 0.026080874726176262, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0096, "grad_norm": 0.5603559017181396, "kl": 0.000706536418874748, "learning_rate": 4.7e-06, "loss": 0.0001, "num_tokens": 313696.0, "reward": 0.82489013671875, "reward_std": 0.01863221451640129, "rewards//mean": 0.82489013671875, "rewards//std": 0.030989699065685272, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0098, "grad_norm": 0.5202544331550598, "kl": 0.0006658075581071898, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "num_tokens": 320144.0, "reward": 0.864501953125, "reward_std": 0.017458315938711166, "rewards//mean": 0.864501953125, "rewards//std": 0.02970735915005207, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.01, "grad_norm": 0.5430282354354858, "kl": 0.0007079422211972997, "learning_rate": 4.9000000000000005e-06, "loss": 0.0001, "num_tokens": 326632.0, "reward": 0.8289794921875, "reward_std": 0.015223849564790726, "rewards//mean": 0.8289794921875, "rewards//std": 0.028258241713047028, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0102, "grad_norm": 0.5527967214584351, "kl": 0.00068058175384067, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 333176.0, "reward": 0.85009765625, "reward_std": 0.014322711154818535, "rewards//mean": 0.85009765625, "rewards//std": 0.03962577506899834, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0104, "grad_norm": 0.49573835730552673, "kl": 0.0007503277156502008, "learning_rate": 4.9999994965001495e-06, "loss": 0.0001, "num_tokens": 339680.0, "reward": 0.82861328125, "reward_std": 0.012963730841875076, "rewards//mean": 0.82861328125, "rewards//std": 0.017378928139805794, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0106, "grad_norm": 0.6121876239776611, "kl": 0.000837808009237051, "learning_rate": 4.999997986000801e-06, "loss": 0.0001, "num_tokens": 346120.0, "reward": 0.8382568359375, "reward_std": 0.011654852889478207, "rewards//mean": 0.8382568359375, "rewards//std": 0.02849084511399269, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0108, "grad_norm": 0.5221219062805176, "kl": 0.0007825934153515846, "learning_rate": 4.999995468502563e-06, "loss": 0.0001, "num_tokens": 352528.0, "reward": 0.82977294921875, "reward_std": 0.01969888061285019, "rewards//mean": 0.82977294921875, "rewards//std": 0.035405483096838, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.011, "grad_norm": 0.5531036257743835, "kl": 0.0007164527050917968, "learning_rate": 4.9999919440064484e-06, "loss": 0.0001, "num_tokens": 359080.0, "reward": 0.816162109375, "reward_std": 0.011350625194609165, "rewards//mean": 0.816162109375, "rewards//std": 0.03247293457388878, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0112, "grad_norm": 0.5199887752532959, "kl": 0.0007260760758072138, "learning_rate": 4.999987412513878e-06, "loss": 0.0001, "num_tokens": 365600.0, "reward": 0.82843017578125, "reward_std": 0.019472327083349228, "rewards//mean": 0.82843017578125, "rewards//std": 0.03167574107646942, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0114, "grad_norm": 0.5673383474349976, "kl": 0.0008793429587967694, "learning_rate": 4.999981874026677e-06, "loss": 0.0001, "num_tokens": 372120.0, "reward": 0.82391357421875, "reward_std": 0.012286683544516563, "rewards//mean": 0.82391357421875, "rewards//std": 0.024570664390921593, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0116, "grad_norm": 0.5173213481903076, "kl": 0.0006918876169947907, "learning_rate": 4.9999753285470756e-06, "loss": 0.0001, "num_tokens": 378632.0, "reward": 0.83428955078125, "reward_std": 0.015794314444065094, "rewards//mean": 0.83428955078125, "rewards//std": 0.022453775629401207, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0118, "grad_norm": 0.570139467716217, "kl": 0.0007772536046104506, "learning_rate": 4.9999677760777114e-06, "loss": 0.0001, "num_tokens": 385168.0, "reward": 0.85089111328125, "reward_std": 0.015498969703912735, "rewards//mean": 0.85089111328125, "rewards//std": 0.028978342190384865, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.012, "grad_norm": 0.5766065716743469, "kl": 0.000741052339435555, "learning_rate": 4.999959216621626e-06, "loss": 0.0001, "num_tokens": 391712.0, "reward": 0.83740234375, "reward_std": 0.013245906680822372, "rewards//mean": 0.83740234375, "rewards//std": 0.02006947062909603, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0122, "grad_norm": 0.538689136505127, "kl": 0.0007515538309235126, "learning_rate": 4.999949650182267e-06, "loss": 0.0001, "num_tokens": 398168.0, "reward": 0.81243896484375, "reward_std": 0.0122340964153409, "rewards//mean": 0.81243896484375, "rewards//std": 0.031715378165245056, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0124, "grad_norm": 0.537291944026947, "kl": 0.0008048277377383783, "learning_rate": 4.999939076763487e-06, "loss": 0.0001, "num_tokens": 404648.0, "reward": 0.84130859375, "reward_std": 0.01757432520389557, "rewards//mean": 0.84130859375, "rewards//std": 0.028463469818234444, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0126, "grad_norm": 0.5939363837242126, "kl": 0.0007771121745463461, "learning_rate": 4.999927496369547e-06, "loss": 0.0001, "num_tokens": 411160.0, "reward": 0.83380126953125, "reward_std": 0.01802542805671692, "rewards//mean": 0.83380126953125, "rewards//std": 0.02362206019461155, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0128, "grad_norm": 0.5537134408950806, "kl": 0.0008526271412847564, "learning_rate": 4.99991490900511e-06, "loss": 0.0001, "num_tokens": 417712.0, "reward": 0.81298828125, "reward_std": 0.009979894384741783, "rewards//mean": 0.81298828125, "rewards//std": 0.02068750187754631, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.013, "grad_norm": 0.5816908478736877, "kl": 0.0007632242341060191, "learning_rate": 4.999901314675246e-06, "loss": 0.0001, "num_tokens": 424232.0, "reward": 0.8087158203125, "reward_std": 0.012353556230664253, "rewards//mean": 0.8087158203125, "rewards//std": 0.01686212420463562, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0132, "grad_norm": 0.5981035232543945, "kl": 0.0008245876524597406, "learning_rate": 4.999886713385432e-06, "loss": 0.0001, "num_tokens": 430712.0, "reward": 0.81884765625, "reward_std": 0.02527492679655552, "rewards//mean": 0.81884765625, "rewards//std": 0.05187131464481354, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0134, "grad_norm": 0.549401044845581, "kl": 0.000775572014390491, "learning_rate": 4.999871105141549e-06, "loss": 0.0001, "num_tokens": 437296.0, "reward": 0.8375244140625, "reward_std": 0.014890830963850021, "rewards//mean": 0.8375244140625, "rewards//std": 0.028397180140018463, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0136, "grad_norm": 0.541130542755127, "kl": 0.0007868314132792875, "learning_rate": 4.9998544899498845e-06, "loss": 0.0001, "num_tokens": 443816.0, "reward": 0.8046875, "reward_std": 0.012027734890580177, "rewards//mean": 0.8046875, "rewards//std": 0.021776903420686722, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0138, "grad_norm": 0.6027575135231018, "kl": 0.0009221135260304436, "learning_rate": 4.999836867817129e-06, "loss": 0.0001, "num_tokens": 450312.0, "reward": 0.816162109375, "reward_std": 0.016808371990919113, "rewards//mean": 0.816162109375, "rewards//std": 0.024358505383133888, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.014, "grad_norm": 0.5539107322692871, "kl": 0.0007876981690060347, "learning_rate": 4.9998182387503825e-06, "loss": 0.0001, "num_tokens": 456840.0, "reward": 0.77813720703125, "reward_std": 0.015146953985095024, "rewards//mean": 0.77813720703125, "rewards//std": 0.024361535906791687, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0142, "grad_norm": 0.5014827847480774, "kl": 0.0008181602897820994, "learning_rate": 4.999798602757149e-06, "loss": 0.0001, "num_tokens": 463288.0, "reward": 0.82305908203125, "reward_std": 0.019664010033011436, "rewards//mean": 0.82305908203125, "rewards//std": 0.035274408757686615, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0144, "grad_norm": 0.5452041029930115, "kl": 0.0008986780885607004, "learning_rate": 4.9997779598453365e-06, "loss": 0.0001, "num_tokens": 469680.0, "reward": 0.83221435546875, "reward_std": 0.016070978716015816, "rewards//mean": 0.83221435546875, "rewards//std": 0.038161925971508026, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0146, "grad_norm": 0.5421676635742188, "kl": 0.0008441337413387373, "learning_rate": 4.999756310023261e-06, "loss": 0.0001, "num_tokens": 476208.0, "reward": 0.7965087890625, "reward_std": 0.010674255900084972, "rewards//mean": 0.7965087890625, "rewards//std": 0.021038319915533066, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0148, "grad_norm": 0.49503278732299805, "kl": 0.0008629800722701475, "learning_rate": 4.999733653299643e-06, "loss": 0.0001, "num_tokens": 482752.0, "reward": 0.8499755859375, "reward_std": 0.017572959885001183, "rewards//mean": 0.8499755859375, "rewards//std": 0.02942756749689579, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.015, "grad_norm": 0.5961833000183105, "kl": 0.0009816785313887522, "learning_rate": 4.9997099896836076e-06, "loss": 0.0001, "num_tokens": 489336.0, "reward": 0.816650390625, "reward_std": 0.014112787321209908, "rewards//mean": 0.816650390625, "rewards//std": 0.021797746419906616, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0152, "grad_norm": 0.5034902095794678, "kl": 0.001062475799699314, "learning_rate": 4.999685319184688e-06, "loss": 0.0001, "num_tokens": 495976.0, "reward": 0.85406494140625, "reward_std": 0.01799612119793892, "rewards//mean": 0.85406494140625, "rewards//std": 0.026914028450846672, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0154, "grad_norm": 0.5780143141746521, "kl": 0.0010181776160607114, "learning_rate": 4.999659641812821e-06, "loss": 0.0001, "num_tokens": 502496.0, "reward": 0.79010009765625, "reward_std": 0.009412910789251328, "rewards//mean": 0.79010009765625, "rewards//std": 0.03928040713071823, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0156, "grad_norm": 0.536464273929596, "kl": 0.0010662138811312616, "learning_rate": 4.9996329575783486e-06, "loss": 0.0001, "num_tokens": 508992.0, "reward": 0.8341064453125, "reward_std": 0.015705348923802376, "rewards//mean": 0.8341064453125, "rewards//std": 0.025295954197645187, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0158, "grad_norm": 0.5415508151054382, "kl": 0.0009792676428332925, "learning_rate": 4.99960526649202e-06, "loss": 0.0001, "num_tokens": 515512.0, "reward": 0.8585205078125, "reward_std": 0.01360266376286745, "rewards//mean": 0.8585205078125, "rewards//std": 0.02985856868326664, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.016, "grad_norm": 0.5641584396362305, "kl": 0.0010558558569755405, "learning_rate": 4.999576568564989e-06, "loss": 0.0001, "num_tokens": 522056.0, "reward": 0.8646240234375, "reward_std": 0.012548411265015602, "rewards//mean": 0.8646240234375, "rewards//std": 0.03710509091615677, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0162, "grad_norm": 0.4885479807853699, "kl": 0.0010145479027414694, "learning_rate": 4.999546863808815e-06, "loss": 0.0001, "num_tokens": 528560.0, "reward": 0.83837890625, "reward_std": 0.013274097815155983, "rewards//mean": 0.83837890625, "rewards//std": 0.021242039278149605, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0164, "grad_norm": 0.5320591330528259, "kl": 0.0011373023444321007, "learning_rate": 4.999516152235463e-06, "loss": 0.0001, "num_tokens": 535144.0, "reward": 0.8441162109375, "reward_std": 0.01578804850578308, "rewards//mean": 0.8441162109375, "rewards//std": 0.02666405588388443, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0166, "grad_norm": 0.5232922434806824, "kl": 0.0012273474567336962, "learning_rate": 4.999484433857305e-06, "loss": 0.0001, "num_tokens": 541600.0, "reward": 0.8199462890625, "reward_std": 0.012842075899243355, "rewards//mean": 0.8199462890625, "rewards//std": 0.03780028596520424, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0168, "grad_norm": 0.6083173155784607, "kl": 0.0010711704671848565, "learning_rate": 4.999451708687114e-06, "loss": 0.0001, "num_tokens": 548224.0, "reward": 0.84564208984375, "reward_std": 0.015799537301063538, "rewards//mean": 0.84564208984375, "rewards//std": 0.01983264647424221, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.017, "grad_norm": 0.6593310236930847, "kl": 0.0012267531710676849, "learning_rate": 4.999417976738075e-06, "loss": 0.0001, "num_tokens": 554800.0, "reward": 0.86431884765625, "reward_std": 0.009114849381148815, "rewards//mean": 0.86431884765625, "rewards//std": 0.0262292567640543, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0172, "grad_norm": 0.6140724420547485, "kl": 0.0011840108927572146, "learning_rate": 4.999383238023773e-06, "loss": 0.0001, "num_tokens": 561328.0, "reward": 0.83831787109375, "reward_std": 0.01039905659854412, "rewards//mean": 0.83831787109375, "rewards//std": 0.024514535441994667, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0174, "grad_norm": 0.5758453607559204, "kl": 0.0012705906701739877, "learning_rate": 4.999347492558203e-06, "loss": 0.0001, "num_tokens": 567976.0, "reward": 0.85235595703125, "reward_std": 0.012438876554369926, "rewards//mean": 0.85235595703125, "rewards//std": 0.03644623979926109, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0176, "grad_norm": 0.5718148350715637, "kl": 0.0012508028012234718, "learning_rate": 4.999310740355761e-06, "loss": 0.0001, "num_tokens": 574488.0, "reward": 0.841064453125, "reward_std": 0.013601240701973438, "rewards//mean": 0.841064453125, "rewards//std": 0.039395131170749664, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0178, "grad_norm": 0.6675286889076233, "kl": 0.0014662631147075444, "learning_rate": 4.9992729814312514e-06, "loss": 0.0001, "num_tokens": 580992.0, "reward": 0.82659912109375, "reward_std": 0.016967086121439934, "rewards//mean": 0.82659912109375, "rewards//std": 0.03539735823869705, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.018, "grad_norm": 0.5523439645767212, "kl": 0.0013510131393559277, "learning_rate": 4.999234215799884e-06, "loss": 0.0001, "num_tokens": 587448.0, "reward": 0.81683349609375, "reward_std": 0.011323179118335247, "rewards//mean": 0.81683349609375, "rewards//std": 0.024706443771719933, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0182, "grad_norm": 0.5677604079246521, "kl": 0.0014052088954485953, "learning_rate": 4.999194443477273e-06, "loss": 0.0001, "num_tokens": 593944.0, "reward": 0.81878662109375, "reward_std": 0.011167924851179123, "rewards//mean": 0.81878662109375, "rewards//std": 0.024750519543886185, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0184, "grad_norm": 0.579730212688446, "kl": 0.001485287953983061, "learning_rate": 4.99915366447944e-06, "loss": 0.0001, "num_tokens": 600520.0, "reward": 0.83245849609375, "reward_std": 0.01508173905313015, "rewards//mean": 0.83245849609375, "rewards//std": 0.028220918029546738, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0186, "grad_norm": 0.5412017107009888, "kl": 0.001436470469343476, "learning_rate": 4.999111878822809e-06, "loss": 0.0001, "num_tokens": 606960.0, "reward": 0.8790283203125, "reward_std": 0.014004024676978588, "rewards//mean": 0.8790283203125, "rewards//std": 0.03480638191103935, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0188, "grad_norm": 0.5232703685760498, "kl": 0.0014006335695739836, "learning_rate": 4.999069086524212e-06, "loss": 0.0001, "num_tokens": 613488.0, "reward": 0.85296630859375, "reward_std": 0.010906463488936424, "rewards//mean": 0.85296630859375, "rewards//std": 0.028076795861124992, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.019, "grad_norm": 0.4935658276081085, "kl": 0.0014153113588690758, "learning_rate": 4.999025287600886e-06, "loss": 0.0001, "num_tokens": 620008.0, "reward": 0.8734130859375, "reward_std": 0.009302475489675999, "rewards//mean": 0.8734130859375, "rewards//std": 0.024985257536172867, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0192, "grad_norm": 0.6382635831832886, "kl": 0.001757641599397175, "learning_rate": 4.998980482070473e-06, "loss": 0.0002, "num_tokens": 626448.0, "reward": 0.84674072265625, "reward_std": 0.014600001275539398, "rewards//mean": 0.84674072265625, "rewards//std": 0.02444651536643505, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0194, "grad_norm": 0.5889910459518433, "kl": 0.0018631023849593475, "learning_rate": 4.9989346699510215e-06, "loss": 0.0002, "num_tokens": 632888.0, "reward": 0.846435546875, "reward_std": 0.012425427325069904, "rewards//mean": 0.846435546875, "rewards//std": 0.02704427018761635, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0196, "grad_norm": 0.5496983528137207, "kl": 0.0017451888270443305, "learning_rate": 4.9988878512609825e-06, "loss": 0.0002, "num_tokens": 639376.0, "reward": 0.8331298828125, "reward_std": 0.012676231563091278, "rewards//mean": 0.8331298828125, "rewards//std": 0.031408242881298065, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0198, "grad_norm": 0.5105761289596558, "kl": 0.0015143363270908594, "learning_rate": 4.998840026019217e-06, "loss": 0.0002, "num_tokens": 645936.0, "reward": 0.8331298828125, "reward_std": 0.010076148435473442, "rewards//mean": 0.8331298828125, "rewards//std": 0.019506044685840607, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.02, "grad_norm": 0.5801912546157837, "kl": 0.0020606104080798104, "learning_rate": 4.998791194244988e-06, "loss": 0.0002, "num_tokens": 652504.0, "reward": 0.8634033203125, "reward_std": 0.013777434825897217, "rewards//mean": 0.8634033203125, "rewards//std": 0.03736365959048271, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0202, "grad_norm": 0.6097487211227417, "kl": 0.0017242246540263295, "learning_rate": 4.998741355957963e-06, "loss": 0.0002, "num_tokens": 659024.0, "reward": 0.845703125, "reward_std": 0.012556832283735275, "rewards//mean": 0.845703125, "rewards//std": 0.02684764750301838, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0204, "grad_norm": 0.5697188377380371, "kl": 0.0018468490161467344, "learning_rate": 4.99869051117822e-06, "loss": 0.0002, "num_tokens": 665568.0, "reward": 0.85235595703125, "reward_std": 0.01241546031087637, "rewards//mean": 0.85235595703125, "rewards//std": 0.029282856732606888, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0206, "grad_norm": 0.5179392695426941, "kl": 0.0018879670387832448, "learning_rate": 4.998638659926238e-06, "loss": 0.0002, "num_tokens": 671992.0, "reward": 0.84124755859375, "reward_std": 0.01206475030630827, "rewards//mean": 0.84124755859375, "rewards//std": 0.023223303258419037, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0208, "grad_norm": 0.5548343062400818, "kl": 0.0020817798358621076, "learning_rate": 4.998585802222902e-06, "loss": 0.0002, "num_tokens": 678640.0, "reward": 0.79302978515625, "reward_std": 0.011504091322422028, "rewards//mean": 0.79302978515625, "rewards//std": 0.024054504930973053, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.021, "grad_norm": 0.5501526594161987, "kl": 0.0019515428430167958, "learning_rate": 4.9985319380895035e-06, "loss": 0.0002, "num_tokens": 685200.0, "reward": 0.85882568359375, "reward_std": 0.012049192562699318, "rewards//mean": 0.85882568359375, "rewards//std": 0.029772449284791946, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0212, "grad_norm": 0.5171511769294739, "kl": 0.0022368501813616604, "learning_rate": 4.99847706754774e-06, "loss": 0.0002, "num_tokens": 691664.0, "reward": 0.83056640625, "reward_std": 0.015448026359081268, "rewards//mean": 0.83056640625, "rewards//std": 0.018577925860881805, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0214, "grad_norm": 0.4974002242088318, "kl": 0.001978852436877787, "learning_rate": 4.998421190619712e-06, "loss": 0.0002, "num_tokens": 698152.0, "reward": 0.8232421875, "reward_std": 0.009565056301653385, "rewards//mean": 0.8232421875, "rewards//std": 0.022380255162715912, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0216, "grad_norm": 0.5393596291542053, "kl": 0.0022539695783052593, "learning_rate": 4.998364307327927e-06, "loss": 0.0002, "num_tokens": 704704.0, "reward": 0.806396484375, "reward_std": 0.011443907395005226, "rewards//mean": 0.806396484375, "rewards//std": 0.022850144654512405, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0218, "grad_norm": 0.5512969493865967, "kl": 0.0018980851164087653, "learning_rate": 4.998306417695298e-06, "loss": 0.0002, "num_tokens": 711256.0, "reward": 0.79241943359375, "reward_std": 0.012490647844970226, "rewards//mean": 0.79241943359375, "rewards//std": 0.0348195917904377, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.022, "grad_norm": 0.5473505258560181, "kl": 0.002349850197788328, "learning_rate": 4.998247521745142e-06, "loss": 0.0002, "num_tokens": 717784.0, "reward": 0.83465576171875, "reward_std": 0.008640991523861885, "rewards//mean": 0.83465576171875, "rewards//std": 0.017218681052327156, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0222, "grad_norm": 0.6422269344329834, "kl": 0.0024210367555497214, "learning_rate": 4.998187619501185e-06, "loss": 0.0002, "num_tokens": 724336.0, "reward": 0.82794189453125, "reward_std": 0.011137901805341244, "rewards//mean": 0.82794189453125, "rewards//std": 0.03503153845667839, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0224, "grad_norm": 0.5594554543495178, "kl": 0.002010446216445416, "learning_rate": 4.998126710987552e-06, "loss": 0.0002, "num_tokens": 730888.0, "reward": 0.86468505859375, "reward_std": 0.015423553064465523, "rewards//mean": 0.86468505859375, "rewards//std": 0.03212417662143707, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0226, "grad_norm": 0.5730808973312378, "kl": 0.0022841215832158923, "learning_rate": 4.998064796228779e-06, "loss": 0.0002, "num_tokens": 737352.0, "reward": 0.86993408203125, "reward_std": 0.014314034953713417, "rewards//mean": 0.86993408203125, "rewards//std": 0.025540942326188087, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0228, "grad_norm": 0.5489108562469482, "kl": 0.0020574367081280798, "learning_rate": 4.998001875249804e-06, "loss": 0.0002, "num_tokens": 743840.0, "reward": 0.84295654296875, "reward_std": 0.013090233318507671, "rewards//mean": 0.84295654296875, "rewards//std": 0.025018785148859024, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.023, "grad_norm": 0.5044224262237549, "kl": 0.0021600704931188375, "learning_rate": 4.997937948075973e-06, "loss": 0.0002, "num_tokens": 750432.0, "reward": 0.8189697265625, "reward_std": 0.010550140403211117, "rewards//mean": 0.8189697265625, "rewards//std": 0.01952776312828064, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0232, "grad_norm": 0.5633189678192139, "kl": 0.002223424904514104, "learning_rate": 4.997873014733036e-06, "loss": 0.0002, "num_tokens": 756888.0, "reward": 0.85516357421875, "reward_std": 0.011582260951399803, "rewards//mean": 0.85516357421875, "rewards//std": 0.04037541151046753, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0234, "grad_norm": 0.5373753309249878, "kl": 0.0026258028228767216, "learning_rate": 4.997807075247147e-06, "loss": 0.0003, "num_tokens": 763392.0, "reward": 0.81365966796875, "reward_std": 0.011771637946367264, "rewards//mean": 0.81365966796875, "rewards//std": 0.021346449851989746, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0236, "grad_norm": 0.5237939953804016, "kl": 0.0023264801857294515, "learning_rate": 4.9977401296448655e-06, "loss": 0.0002, "num_tokens": 769928.0, "reward": 0.8460693359375, "reward_std": 0.008590873330831528, "rewards//mean": 0.8460693359375, "rewards//std": 0.017849450930953026, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0238, "grad_norm": 0.5332798361778259, "kl": 0.002536685249651782, "learning_rate": 4.99767217795316e-06, "loss": 0.0003, "num_tokens": 776408.0, "reward": 0.85760498046875, "reward_std": 0.01757902279496193, "rewards//mean": 0.85760498046875, "rewards//std": 0.028330666944384575, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.024, "grad_norm": 0.5661578178405762, "kl": 0.002972067624796182, "learning_rate": 4.997603220199399e-06, "loss": 0.0003, "num_tokens": 782976.0, "reward": 0.84490966796875, "reward_std": 0.011383035220205784, "rewards//mean": 0.84490966796875, "rewards//std": 0.01630287617444992, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0242, "grad_norm": 0.5735255479812622, "kl": 0.0025082347565330565, "learning_rate": 4.99753325641136e-06, "loss": 0.0003, "num_tokens": 789624.0, "reward": 0.8165283203125, "reward_std": 0.012074325233697891, "rewards//mean": 0.8165283203125, "rewards//std": 0.023745177313685417, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0244, "grad_norm": 0.5703085660934448, "kl": 0.0030434352956945077, "learning_rate": 4.997462286617224e-06, "loss": 0.0003, "num_tokens": 796152.0, "reward": 0.84613037109375, "reward_std": 0.013441948220133781, "rewards//mean": 0.84613037109375, "rewards//std": 0.030390407890081406, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0246, "grad_norm": 0.5246081352233887, "kl": 0.00253114165388979, "learning_rate": 4.997390310845578e-06, "loss": 0.0003, "num_tokens": 802712.0, "reward": 0.85833740234375, "reward_std": 0.012656005099415779, "rewards//mean": 0.85833740234375, "rewards//std": 0.02223086729645729, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0248, "grad_norm": 0.5539637804031372, "kl": 0.003131673962343484, "learning_rate": 4.997317329125413e-06, "loss": 0.0003, "num_tokens": 809208.0, "reward": 0.85931396484375, "reward_std": 0.015795918181538582, "rewards//mean": 0.85931396484375, "rewards//std": 0.029177729040384293, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.025, "grad_norm": 0.6289098858833313, "kl": 0.002920530881965533, "learning_rate": 4.997243341486126e-06, "loss": 0.0003, "num_tokens": 815736.0, "reward": 0.87042236328125, "reward_std": 0.010492103174328804, "rewards//mean": 0.87042236328125, "rewards//std": 0.02300521731376648, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0252, "grad_norm": 0.604350745677948, "kl": 0.0029226918559288606, "learning_rate": 4.997168347957521e-06, "loss": 0.0003, "num_tokens": 822232.0, "reward": 0.8529052734375, "reward_std": 0.014255398884415627, "rewards//mean": 0.8529052734375, "rewards//std": 0.04210163280367851, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0254, "grad_norm": 0.5126016736030579, "kl": 0.0027541381714399904, "learning_rate": 4.997092348569802e-06, "loss": 0.0003, "num_tokens": 828736.0, "reward": 0.86907958984375, "reward_std": 0.009864596650004387, "rewards//mean": 0.86907958984375, "rewards//std": 0.020902881398797035, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0256, "grad_norm": 0.5625630617141724, "kl": 0.0037071430997457355, "learning_rate": 4.9970153433535855e-06, "loss": 0.0004, "num_tokens": 835208.0, "reward": 0.83758544921875, "reward_std": 0.012011556886136532, "rewards//mean": 0.83758544921875, "rewards//std": 0.026298996061086655, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0258, "grad_norm": 0.5449907779693604, "kl": 0.003065660857828334, "learning_rate": 4.996937332339887e-06, "loss": 0.0003, "num_tokens": 841672.0, "reward": 0.8533935546875, "reward_std": 0.015462837181985378, "rewards//mean": 0.8533935546875, "rewards//std": 0.021865351125597954, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.026, "grad_norm": 0.532476007938385, "kl": 0.0030444328440353274, "learning_rate": 4.996858315560129e-06, "loss": 0.0003, "num_tokens": 848168.0, "reward": 0.83038330078125, "reward_std": 0.011759311892092228, "rewards//mean": 0.83038330078125, "rewards//std": 0.026123428717255592, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0262, "grad_norm": 0.5832347273826599, "kl": 0.0031748518522363156, "learning_rate": 4.9967782930461405e-06, "loss": 0.0003, "num_tokens": 854768.0, "reward": 0.82232666015625, "reward_std": 0.012866346165537834, "rewards//mean": 0.82232666015625, "rewards//std": 0.022574130445718765, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0264, "grad_norm": 0.5763130784034729, "kl": 0.002997577394125983, "learning_rate": 4.9966972648301535e-06, "loss": 0.0003, "num_tokens": 861288.0, "reward": 0.852294921875, "reward_std": 0.008953496813774109, "rewards//mean": 0.852294921875, "rewards//std": 0.019210852682590485, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0266, "grad_norm": 0.5979419946670532, "kl": 0.0031551278370898217, "learning_rate": 4.996615230944808e-06, "loss": 0.0003, "num_tokens": 867752.0, "reward": 0.8076171875, "reward_std": 0.014883074909448624, "rewards//mean": 0.8076171875, "rewards//std": 0.02626393362879753, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0268, "grad_norm": 0.5739002227783203, "kl": 0.003316704591270536, "learning_rate": 4.996532191423145e-06, "loss": 0.0003, "num_tokens": 874240.0, "reward": 0.85858154296875, "reward_std": 0.013418575748801231, "rewards//mean": 0.85858154296875, "rewards//std": 0.025583580136299133, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.027, "grad_norm": 0.5882933735847473, "kl": 0.003152552613755688, "learning_rate": 4.996448146298615e-06, "loss": 0.0003, "num_tokens": 880704.0, "reward": 0.84490966796875, "reward_std": 0.013387042097747326, "rewards//mean": 0.84490966796875, "rewards//std": 0.023289043456315994, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0272, "grad_norm": 0.636451005935669, "kl": 0.0033634064893703908, "learning_rate": 4.996363095605069e-06, "loss": 0.0003, "num_tokens": 887200.0, "reward": 0.8017578125, "reward_std": 0.008450044319033623, "rewards//mean": 0.8017578125, "rewards//std": 0.01984495110809803, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0274, "grad_norm": 0.6109657287597656, "kl": 0.0032750566606409848, "learning_rate": 4.996277039376767e-06, "loss": 0.0003, "num_tokens": 893696.0, "reward": 0.87811279296875, "reward_std": 0.01004757173359394, "rewards//mean": 0.87811279296875, "rewards//std": 0.0377134345471859, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0276, "grad_norm": 0.5534079074859619, "kl": 0.003138502681395039, "learning_rate": 4.9961899776483725e-06, "loss": 0.0003, "num_tokens": 900264.0, "reward": 0.82147216796875, "reward_std": 0.013819834217429161, "rewards//mean": 0.82147216796875, "rewards//std": 0.030263124033808708, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0278, "grad_norm": 0.5728569626808167, "kl": 0.0036148553481325507, "learning_rate": 4.996101910454953e-06, "loss": 0.0004, "num_tokens": 906792.0, "reward": 0.8438720703125, "reward_std": 0.014269351959228516, "rewards//mean": 0.8438720703125, "rewards//std": 0.025920186191797256, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.028, "grad_norm": 0.591044008731842, "kl": 0.003646017052233219, "learning_rate": 4.996012837831983e-06, "loss": 0.0004, "num_tokens": 913376.0, "reward": 0.85736083984375, "reward_std": 0.010473604314029217, "rewards//mean": 0.85736083984375, "rewards//std": 0.013709330931305885, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0282, "grad_norm": 0.5964400172233582, "kl": 0.0035551346372812986, "learning_rate": 4.9959227598153395e-06, "loss": 0.0004, "num_tokens": 919960.0, "reward": 0.7999267578125, "reward_std": 0.009113901294767857, "rewards//mean": 0.7999267578125, "rewards//std": 0.015694117173552513, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0284, "grad_norm": 0.6457265019416809, "kl": 0.004052484320709482, "learning_rate": 4.995831676441307e-06, "loss": 0.0004, "num_tokens": 926600.0, "reward": 0.83953857421875, "reward_std": 0.017153846099972725, "rewards//mean": 0.83953857421875, "rewards//std": 0.028196770697832108, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0286, "grad_norm": 0.5374157428741455, "kl": 0.004182952019618824, "learning_rate": 4.995739587746574e-06, "loss": 0.0004, "num_tokens": 933120.0, "reward": 0.85748291015625, "reward_std": 0.012008757330477238, "rewards//mean": 0.85748291015625, "rewards//std": 0.019123880192637444, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0288, "grad_norm": 0.5459393262863159, "kl": 0.004459843650693074, "learning_rate": 4.995646493768234e-06, "loss": 0.0004, "num_tokens": 939712.0, "reward": 0.8717041015625, "reward_std": 0.012871598824858665, "rewards//mean": 0.8717041015625, "rewards//std": 0.02747381664812565, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.029, "grad_norm": 0.6245104074478149, "kl": 0.0036559294094331563, "learning_rate": 4.995552394543784e-06, "loss": 0.0004, "num_tokens": 946152.0, "reward": 0.84735107421875, "reward_std": 0.010359259322285652, "rewards//mean": 0.84735107421875, "rewards//std": 0.025145526975393295, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0292, "grad_norm": 0.49289852380752563, "kl": 0.004074933909578249, "learning_rate": 4.995457290111129e-06, "loss": 0.0004, "num_tokens": 952760.0, "reward": 0.829833984375, "reward_std": 0.01499673631042242, "rewards//mean": 0.829833984375, "rewards//std": 0.02499646320939064, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0294, "grad_norm": 0.5312600135803223, "kl": 0.004150361695792526, "learning_rate": 4.995361180508575e-06, "loss": 0.0004, "num_tokens": 959336.0, "reward": 0.8187255859375, "reward_std": 0.014176027849316597, "rewards//mean": 0.8187255859375, "rewards//std": 0.025973858311772346, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0296, "grad_norm": 0.5217401385307312, "kl": 0.0042341178632341325, "learning_rate": 4.995264065774837e-06, "loss": 0.0004, "num_tokens": 965904.0, "reward": 0.8221435546875, "reward_std": 0.014102162793278694, "rewards//mean": 0.8221435546875, "rewards//std": 0.029886946082115173, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0298, "grad_norm": 0.5702940225601196, "kl": 0.004433232854353264, "learning_rate": 4.99516594594903e-06, "loss": 0.0004, "num_tokens": 972480.0, "reward": 0.83148193359375, "reward_std": 0.009684868156909943, "rewards//mean": 0.83148193359375, "rewards//std": 0.030368482694029808, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.03, "grad_norm": 0.5658935308456421, "kl": 0.004318653082009405, "learning_rate": 4.9950668210706795e-06, "loss": 0.0004, "num_tokens": 979032.0, "reward": 0.81292724609375, "reward_std": 0.018283039331436157, "rewards//mean": 0.81292724609375, "rewards//std": 0.03353874757885933, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0302, "grad_norm": 0.6088890433311462, "kl": 0.003863652906147763, "learning_rate": 4.994966691179712e-06, "loss": 0.0004, "num_tokens": 985520.0, "reward": 0.8228759765625, "reward_std": 0.012609913945198059, "rewards//mean": 0.8228759765625, "rewards//std": 0.02047235146164894, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0304, "grad_norm": 0.5766634345054626, "kl": 0.0043726901058107615, "learning_rate": 4.9948655563164585e-06, "loss": 0.0004, "num_tokens": 992048.0, "reward": 0.83502197265625, "reward_std": 0.012382203713059425, "rewards//mean": 0.83502197265625, "rewards//std": 0.023486455902457237, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0306, "grad_norm": 0.6495277881622314, "kl": 0.004681104823248461, "learning_rate": 4.994763416521658e-06, "loss": 0.0005, "num_tokens": 998624.0, "reward": 0.8575439453125, "reward_std": 0.01804346963763237, "rewards//mean": 0.8575439453125, "rewards//std": 0.03466693311929703, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0308, "grad_norm": 0.5628495812416077, "kl": 0.005010787746869028, "learning_rate": 4.994660271836452e-06, "loss": 0.0005, "num_tokens": 1005144.0, "reward": 0.78619384765625, "reward_std": 0.014029724523425102, "rewards//mean": 0.78619384765625, "rewards//std": 0.027605941519141197, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.031, "grad_norm": 0.5648722052574158, "kl": 0.005263536557322368, "learning_rate": 4.994556122302387e-06, "loss": 0.0005, "num_tokens": 1011776.0, "reward": 0.81500244140625, "reward_std": 0.00935292523354292, "rewards//mean": 0.81500244140625, "rewards//std": 0.0195171982049942, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0312, "grad_norm": 0.5549405813217163, "kl": 0.004852844664128497, "learning_rate": 4.994450967961413e-06, "loss": 0.0005, "num_tokens": 1018328.0, "reward": 0.82122802734375, "reward_std": 0.01424551010131836, "rewards//mean": 0.82122802734375, "rewards//std": 0.017228348180651665, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0314, "grad_norm": 0.6064804196357727, "kl": 0.0048009168822318316, "learning_rate": 4.994344808855888e-06, "loss": 0.0005, "num_tokens": 1024736.0, "reward": 0.851318359375, "reward_std": 0.014092957600951195, "rewards//mean": 0.851318359375, "rewards//std": 0.027951501309871674, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0316, "grad_norm": 0.5940186381340027, "kl": 0.005103438044898212, "learning_rate": 4.994237645028573e-06, "loss": 0.0005, "num_tokens": 1031280.0, "reward": 0.83880615234375, "reward_std": 0.014840067364275455, "rewards//mean": 0.83880615234375, "rewards//std": 0.03041131980717182, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0318, "grad_norm": 0.6087404489517212, "kl": 0.005231126502621919, "learning_rate": 4.994129476522632e-06, "loss": 0.0005, "num_tokens": 1037776.0, "reward": 0.84613037109375, "reward_std": 0.017233435064554214, "rewards//mean": 0.84613037109375, "rewards//std": 0.03191426768898964, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.032, "grad_norm": 0.5607631802558899, "kl": 0.0044527553545776755, "learning_rate": 4.994020303381636e-06, "loss": 0.0004, "num_tokens": 1044264.0, "reward": 0.8514404296875, "reward_std": 0.01844841055572033, "rewards//mean": 0.8514404296875, "rewards//std": 0.030922522768378258, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0322, "grad_norm": 0.5657513737678528, "kl": 0.005017978633986786, "learning_rate": 4.993910125649561e-06, "loss": 0.0005, "num_tokens": 1050792.0, "reward": 0.856201171875, "reward_std": 0.012632312253117561, "rewards//mean": 0.856201171875, "rewards//std": 0.02604980394244194, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0324, "grad_norm": 0.5447162389755249, "kl": 0.004984280851203948, "learning_rate": 4.993798943370785e-06, "loss": 0.0005, "num_tokens": 1057344.0, "reward": 0.82562255859375, "reward_std": 0.014443015679717064, "rewards//mean": 0.82562255859375, "rewards//std": 0.024904167279601097, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0326, "grad_norm": 0.5358566045761108, "kl": 0.005205701396334916, "learning_rate": 4.993686756590093e-06, "loss": 0.0005, "num_tokens": 1063784.0, "reward": 0.85723876953125, "reward_std": 0.01433575339615345, "rewards//mean": 0.85723876953125, "rewards//std": 0.02944575995206833, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0328, "grad_norm": 0.52525395154953, "kl": 0.005318704032106325, "learning_rate": 4.993573565352674e-06, "loss": 0.0005, "num_tokens": 1070232.0, "reward": 0.836181640625, "reward_std": 0.009636202827095985, "rewards//mean": 0.836181640625, "rewards//std": 0.02487991936504841, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.033, "grad_norm": 0.6540741920471191, "kl": 0.005362637952202931, "learning_rate": 4.993459369704121e-06, "loss": 0.0005, "num_tokens": 1076808.0, "reward": 0.84423828125, "reward_std": 0.015483209863305092, "rewards//mean": 0.84423828125, "rewards//std": 0.03575059771537781, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0332, "grad_norm": 0.5869854092597961, "kl": 0.005901694006752223, "learning_rate": 4.9933441696904315e-06, "loss": 0.0006, "num_tokens": 1083304.0, "reward": 0.84246826171875, "reward_std": 0.015546703711152077, "rewards//mean": 0.84246826171875, "rewards//std": 0.02686617709696293, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0334, "grad_norm": 0.575697660446167, "kl": 0.0057476553483866155, "learning_rate": 4.993227965358008e-06, "loss": 0.0006, "num_tokens": 1089800.0, "reward": 0.86322021484375, "reward_std": 0.01910999044775963, "rewards//mean": 0.86322021484375, "rewards//std": 0.03419933468103409, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0336, "grad_norm": 0.6858865022659302, "kl": 0.005201482155825943, "learning_rate": 4.99311075675366e-06, "loss": 0.0005, "num_tokens": 1096344.0, "reward": 0.86181640625, "reward_std": 0.013126470148563385, "rewards//mean": 0.86181640625, "rewards//std": 0.02323511429131031, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0338, "grad_norm": 0.5925734043121338, "kl": 0.004758396476972848, "learning_rate": 4.992992543924597e-06, "loss": 0.0005, "num_tokens": 1102928.0, "reward": 0.8076171875, "reward_std": 0.017161305993795395, "rewards//mean": 0.8076171875, "rewards//std": 0.02628237009048462, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.034, "grad_norm": 0.5349700450897217, "kl": 0.005817261466290802, "learning_rate": 4.992873326918434e-06, "loss": 0.0006, "num_tokens": 1109536.0, "reward": 0.85833740234375, "reward_std": 0.010428598150610924, "rewards//mean": 0.85833740234375, "rewards//std": 0.03860246390104294, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0342, "grad_norm": 0.6442819237709045, "kl": 0.006201874581165612, "learning_rate": 4.992753105783194e-06, "loss": 0.0006, "num_tokens": 1116056.0, "reward": 0.83734130859375, "reward_std": 0.012758218683302402, "rewards//mean": 0.83734130859375, "rewards//std": 0.02916008234024048, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0344, "grad_norm": 0.5286543369293213, "kl": 0.006252227758523077, "learning_rate": 4.992631880567301e-06, "loss": 0.0006, "num_tokens": 1122600.0, "reward": 0.85650634765625, "reward_std": 0.01419912651181221, "rewards//mean": 0.85650634765625, "rewards//std": 0.034500766545534134, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0346, "grad_norm": 0.6253844499588013, "kl": 0.00583338420256041, "learning_rate": 4.992509651319585e-06, "loss": 0.0006, "num_tokens": 1129104.0, "reward": 0.814208984375, "reward_std": 0.015387440100312233, "rewards//mean": 0.814208984375, "rewards//std": 0.026428259909152985, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0348, "grad_norm": 0.5459834337234497, "kl": 0.006313475081697106, "learning_rate": 4.992386418089279e-06, "loss": 0.0006, "num_tokens": 1135800.0, "reward": 0.83197021484375, "reward_std": 0.011926204897463322, "rewards//mean": 0.83197021484375, "rewards//std": 0.02585022896528244, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.035, "grad_norm": 0.5601797699928284, "kl": 0.005816609016619623, "learning_rate": 4.992262180926022e-06, "loss": 0.0006, "num_tokens": 1142352.0, "reward": 0.84552001953125, "reward_std": 0.011603476479649544, "rewards//mean": 0.84552001953125, "rewards//std": 0.02518041804432869, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0352, "grad_norm": 0.49983859062194824, "kl": 0.006052123644622043, "learning_rate": 4.992136939879857e-06, "loss": 0.0006, "num_tokens": 1148984.0, "reward": 0.8468017578125, "reward_std": 0.015041794627904892, "rewards//mean": 0.8468017578125, "rewards//std": 0.033024612814188004, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0354, "grad_norm": 0.5256810188293457, "kl": 0.006643216998782009, "learning_rate": 4.992010695001229e-06, "loss": 0.0007, "num_tokens": 1155608.0, "reward": 0.8421630859375, "reward_std": 0.011597627773880959, "rewards//mean": 0.8421630859375, "rewards//std": 0.02555079385638237, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0356, "grad_norm": 0.6244553327560425, "kl": 0.007153651211410761, "learning_rate": 4.9918834463409925e-06, "loss": 0.0007, "num_tokens": 1162168.0, "reward": 0.85235595703125, "reward_std": 0.012248958460986614, "rewards//mean": 0.85235595703125, "rewards//std": 0.029620032757520676, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0358, "grad_norm": 0.5499500036239624, "kl": 0.0060980761190876365, "learning_rate": 4.991755193950401e-06, "loss": 0.0006, "num_tokens": 1168704.0, "reward": 0.83935546875, "reward_std": 0.011832846328616142, "rewards//mean": 0.83935546875, "rewards//std": 0.033120047301054, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.036, "grad_norm": 0.5794421434402466, "kl": 0.0063534324581269175, "learning_rate": 4.991625937881117e-06, "loss": 0.0006, "num_tokens": 1175256.0, "reward": 0.85357666015625, "reward_std": 0.016541291028261185, "rewards//mean": 0.85357666015625, "rewards//std": 0.039579860866069794, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0362, "grad_norm": 0.6126803159713745, "kl": 0.006563348462805152, "learning_rate": 4.991495678185202e-06, "loss": 0.0007, "num_tokens": 1181776.0, "reward": 0.8543701171875, "reward_std": 0.010336179286241531, "rewards//mean": 0.8543701171875, "rewards//std": 0.025021584704518318, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0364, "grad_norm": 0.5989588499069214, "kl": 0.00655812764307484, "learning_rate": 4.991364414915126e-06, "loss": 0.0007, "num_tokens": 1188352.0, "reward": 0.85723876953125, "reward_std": 0.015126184560358524, "rewards//mean": 0.85723876953125, "rewards//std": 0.03535842150449753, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0366, "grad_norm": 0.5920560956001282, "kl": 0.007293358852621168, "learning_rate": 4.9912321481237616e-06, "loss": 0.0007, "num_tokens": 1194944.0, "reward": 0.826416015625, "reward_std": 0.009397894144058228, "rewards//mean": 0.826416015625, "rewards//std": 0.019122395664453506, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0368, "grad_norm": 0.609233558177948, "kl": 0.0069352322607301176, "learning_rate": 4.991098877864386e-06, "loss": 0.0007, "num_tokens": 1201344.0, "reward": 0.8267822265625, "reward_std": 0.013857333920896053, "rewards//mean": 0.8267822265625, "rewards//std": 0.018535098060965538, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.037, "grad_norm": 0.6113260984420776, "kl": 0.007776582497172058, "learning_rate": 4.99096460419068e-06, "loss": 0.0008, "num_tokens": 1207896.0, "reward": 0.81707763671875, "reward_std": 0.014834532514214516, "rewards//mean": 0.81707763671875, "rewards//std": 0.035852447152137756, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0372, "grad_norm": 0.537739098072052, "kl": 0.00674009119393304, "learning_rate": 4.990829327156729e-06, "loss": 0.0007, "num_tokens": 1214408.0, "reward": 0.8321533203125, "reward_std": 0.009646743535995483, "rewards//mean": 0.8321533203125, "rewards//std": 0.022872328758239746, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0374, "grad_norm": 0.6183601021766663, "kl": 0.008413400035351515, "learning_rate": 4.990693046817023e-06, "loss": 0.0008, "num_tokens": 1220976.0, "reward": 0.81475830078125, "reward_std": 0.0088888481259346, "rewards//mean": 0.81475830078125, "rewards//std": 0.01764591969549656, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0376, "grad_norm": 0.579095184803009, "kl": 0.0072022924432531, "learning_rate": 4.990555763226456e-06, "loss": 0.0007, "num_tokens": 1227456.0, "reward": 0.8326416015625, "reward_std": 0.014683970250189304, "rewards//mean": 0.8326416015625, "rewards//std": 0.023900222033262253, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0378, "grad_norm": 0.5864057540893555, "kl": 0.008137135358992964, "learning_rate": 4.990417476440326e-06, "loss": 0.0008, "num_tokens": 1233904.0, "reward": 0.85552978515625, "reward_std": 0.012473128736019135, "rewards//mean": 0.85552978515625, "rewards//std": 0.03617484122514725, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.038, "grad_norm": 0.6523225903511047, "kl": 0.008292198239360005, "learning_rate": 4.9902781865143326e-06, "loss": 0.0008, "num_tokens": 1240416.0, "reward": 0.80804443359375, "reward_std": 0.014624349772930145, "rewards//mean": 0.80804443359375, "rewards//std": 0.026068320497870445, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0382, "grad_norm": 0.5711731314659119, "kl": 0.007915714057162404, "learning_rate": 4.990137893504585e-06, "loss": 0.0008, "num_tokens": 1246944.0, "reward": 0.8541259765625, "reward_std": 0.01398186944425106, "rewards//mean": 0.8541259765625, "rewards//std": 0.02810138463973999, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0384, "grad_norm": 0.575506329536438, "kl": 0.007600511948112398, "learning_rate": 4.989996597467591e-06, "loss": 0.0008, "num_tokens": 1253440.0, "reward": 0.8515625, "reward_std": 0.015058934688568115, "rewards//mean": 0.8515625, "rewards//std": 0.029297908768057823, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0386, "grad_norm": 0.6336308717727661, "kl": 0.007927103200927377, "learning_rate": 4.989854298460265e-06, "loss": 0.0008, "num_tokens": 1259976.0, "reward": 0.81378173828125, "reward_std": 0.010543026961386204, "rewards//mean": 0.81378173828125, "rewards//std": 0.01570315845310688, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0388, "grad_norm": 0.5761043429374695, "kl": 0.008717037679161876, "learning_rate": 4.989710996539926e-06, "loss": 0.0009, "num_tokens": 1266520.0, "reward": 0.889404296875, "reward_std": 0.013920702040195465, "rewards//mean": 0.889404296875, "rewards//std": 0.019436465576291084, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.039, "grad_norm": 0.5974484086036682, "kl": 0.008872057602275163, "learning_rate": 4.989566691764296e-06, "loss": 0.0009, "num_tokens": 1273072.0, "reward": 0.8302001953125, "reward_std": 0.016194570809602737, "rewards//mean": 0.8302001953125, "rewards//std": 0.03371773660182953, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0392, "grad_norm": 0.6319258809089661, "kl": 0.009591609938070178, "learning_rate": 4.9894213841914994e-06, "loss": 0.001, "num_tokens": 1279576.0, "reward": 0.80279541015625, "reward_std": 0.008850542828440666, "rewards//mean": 0.80279541015625, "rewards//std": 0.0178802739828825, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0394, "grad_norm": 0.585383951663971, "kl": 0.009328377433121204, "learning_rate": 4.989275073880067e-06, "loss": 0.0009, "num_tokens": 1286072.0, "reward": 0.8275146484375, "reward_std": 0.01107935793697834, "rewards//mean": 0.8275146484375, "rewards//std": 0.025997160002589226, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0396, "grad_norm": 0.5403591990470886, "kl": 0.00814232334960252, "learning_rate": 4.989127760888932e-06, "loss": 0.0008, "num_tokens": 1292584.0, "reward": 0.851318359375, "reward_std": 0.0181453637778759, "rewards//mean": 0.851318359375, "rewards//std": 0.02438831701874733, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0398, "grad_norm": 0.61063551902771, "kl": 0.008625458052847534, "learning_rate": 4.988979445277433e-06, "loss": 0.0009, "num_tokens": 1299088.0, "reward": 0.8294677734375, "reward_std": 0.010040882974863052, "rewards//mean": 0.8294677734375, "rewards//std": 0.01912674866616726, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.04, "grad_norm": 0.6044790744781494, "kl": 0.008569077064748853, "learning_rate": 4.988830127105312e-06, "loss": 0.0009, "num_tokens": 1305592.0, "reward": 0.84759521484375, "reward_std": 0.012233874760568142, "rewards//mean": 0.84759521484375, "rewards//std": 0.026906151324510574, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0402, "grad_norm": 0.6054474115371704, "kl": 0.009775987244211137, "learning_rate": 4.988679806432712e-06, "loss": 0.001, "num_tokens": 1312104.0, "reward": 0.8240966796875, "reward_std": 0.013713570311665535, "rewards//mean": 0.8240966796875, "rewards//std": 0.0317419171333313, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0404, "grad_norm": 0.5946995615959167, "kl": 0.009700121358036995, "learning_rate": 4.988528483320184e-06, "loss": 0.001, "num_tokens": 1318600.0, "reward": 0.842041015625, "reward_std": 0.01567421853542328, "rewards//mean": 0.842041015625, "rewards//std": 0.047219280153512955, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0406, "grad_norm": 0.5984341502189636, "kl": 0.009171057841740549, "learning_rate": 4.9883761578286805e-06, "loss": 0.0009, "num_tokens": 1325088.0, "reward": 0.8309326171875, "reward_std": 0.017868168652057648, "rewards//mean": 0.8309326171875, "rewards//std": 0.039114974439144135, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0408, "grad_norm": 0.5118701457977295, "kl": 0.009044189820997417, "learning_rate": 4.988222830019559e-06, "loss": 0.0009, "num_tokens": 1331568.0, "reward": 0.83575439453125, "reward_std": 0.010470295324921608, "rewards//mean": 0.83575439453125, "rewards//std": 0.01437237299978733, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.041, "grad_norm": 0.6392531991004944, "kl": 0.010742165730334818, "learning_rate": 4.988068499954578e-06, "loss": 0.0011, "num_tokens": 1338136.0, "reward": 0.85009765625, "reward_std": 0.012784970924258232, "rewards//mean": 0.85009765625, "rewards//std": 0.03285572677850723, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0412, "grad_norm": 0.5961043238639832, "kl": 0.011020470352377743, "learning_rate": 4.987913167695904e-06, "loss": 0.0011, "num_tokens": 1344584.0, "reward": 0.84033203125, "reward_std": 0.011852873489260674, "rewards//mean": 0.84033203125, "rewards//std": 0.02290969155728817, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0414, "grad_norm": 0.5800420045852661, "kl": 0.009589398512616754, "learning_rate": 4.987756833306103e-06, "loss": 0.001, "num_tokens": 1351184.0, "reward": 0.849853515625, "reward_std": 0.012520735152065754, "rewards//mean": 0.849853515625, "rewards//std": 0.03702443093061447, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0416, "grad_norm": 0.5232331156730652, "kl": 0.01012984523549676, "learning_rate": 4.987599496848147e-06, "loss": 0.001, "num_tokens": 1357760.0, "reward": 0.84149169921875, "reward_std": 0.016268065199255943, "rewards//mean": 0.84149169921875, "rewards//std": 0.024248182773590088, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0418, "grad_norm": 0.613732099533081, "kl": 0.010579521884210408, "learning_rate": 4.987441158385411e-06, "loss": 0.0011, "num_tokens": 1364392.0, "reward": 0.854736328125, "reward_std": 0.016416363418102264, "rewards//mean": 0.854736328125, "rewards//std": 0.03490995615720749, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.042, "grad_norm": 0.5997706651687622, "kl": 0.010741343721747398, "learning_rate": 4.987281817981674e-06, "loss": 0.0011, "num_tokens": 1370944.0, "reward": 0.86395263671875, "reward_std": 0.013692582026124, "rewards//mean": 0.86395263671875, "rewards//std": 0.030517160892486572, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0422, "grad_norm": 0.5470620393753052, "kl": 0.013648364343680441, "learning_rate": 4.987121475701118e-06, "loss": 0.0014, "num_tokens": 1377408.0, "reward": 0.84954833984375, "reward_std": 0.011153290048241615, "rewards//mean": 0.84954833984375, "rewards//std": 0.019954398274421692, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0424, "grad_norm": 0.7061495184898376, "kl": 0.011234368314035237, "learning_rate": 4.986960131608329e-06, "loss": 0.0011, "num_tokens": 1383864.0, "reward": 0.82733154296875, "reward_std": 0.011791674420237541, "rewards//mean": 0.82733154296875, "rewards//std": 0.022948559373617172, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0426, "grad_norm": 0.5791126489639282, "kl": 0.011082913028076291, "learning_rate": 4.986797785768296e-06, "loss": 0.0011, "num_tokens": 1390304.0, "reward": 0.851318359375, "reward_std": 0.018234960734844208, "rewards//mean": 0.851318359375, "rewards//std": 0.04792189970612526, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0428, "grad_norm": 0.6446351408958435, "kl": 0.010612470097839832, "learning_rate": 4.986634438246413e-06, "loss": 0.0011, "num_tokens": 1396776.0, "reward": 0.836181640625, "reward_std": 0.00901946984231472, "rewards//mean": 0.836181640625, "rewards//std": 0.017145667225122452, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.043, "grad_norm": 0.6024380922317505, "kl": 0.010903890011832118, "learning_rate": 4.986470089108476e-06, "loss": 0.0011, "num_tokens": 1403280.0, "reward": 0.82220458984375, "reward_std": 0.010884717106819153, "rewards//mean": 0.82220458984375, "rewards//std": 0.019161837175488472, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0432, "grad_norm": 0.5589469075202942, "kl": 0.011347507941536605, "learning_rate": 4.986304738420684e-06, "loss": 0.0011, "num_tokens": 1409752.0, "reward": 0.84674072265625, "reward_std": 0.01086362637579441, "rewards//mean": 0.84674072265625, "rewards//std": 0.016987673938274384, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0434, "grad_norm": 0.6330822706222534, "kl": 0.011820332612842321, "learning_rate": 4.986138386249641e-06, "loss": 0.0012, "num_tokens": 1416320.0, "reward": 0.84808349609375, "reward_std": 0.013128334656357765, "rewards//mean": 0.84808349609375, "rewards//std": 0.026347875595092773, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0436, "grad_norm": 0.6010637283325195, "kl": 0.00960783491609618, "learning_rate": 4.985971032662352e-06, "loss": 0.001, "num_tokens": 1422800.0, "reward": 0.83551025390625, "reward_std": 0.014542114920914173, "rewards//mean": 0.83551025390625, "rewards//std": 0.030082007870078087, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0438, "grad_norm": 0.5061883926391602, "kl": 0.010777416347991675, "learning_rate": 4.98580267772623e-06, "loss": 0.0011, "num_tokens": 1429392.0, "reward": 0.8599853515625, "reward_std": 0.013834359124302864, "rewards//mean": 0.8599853515625, "rewards//std": 0.025753792375326157, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.044, "grad_norm": 0.58171147108078, "kl": 0.011898035183548927, "learning_rate": 4.985633321509086e-06, "loss": 0.0012, "num_tokens": 1435816.0, "reward": 0.8372802734375, "reward_std": 0.01282973401248455, "rewards//mean": 0.8372802734375, "rewards//std": 0.026541143655776978, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0442, "grad_norm": 0.6470307111740112, "kl": 0.011951541411690414, "learning_rate": 4.985462964079137e-06, "loss": 0.0012, "num_tokens": 1442424.0, "reward": 0.8245849609375, "reward_std": 0.014490412548184395, "rewards//mean": 0.8245849609375, "rewards//std": 0.026085518300533295, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0444, "grad_norm": 0.6614448428153992, "kl": 0.011769724893383682, "learning_rate": 4.985291605505004e-06, "loss": 0.0012, "num_tokens": 1449072.0, "reward": 0.850341796875, "reward_std": 0.013671314343810081, "rewards//mean": 0.850341796875, "rewards//std": 0.020503751933574677, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0446, "grad_norm": 0.6548160910606384, "kl": 0.011074570356868207, "learning_rate": 4.9851192458557084e-06, "loss": 0.0011, "num_tokens": 1455600.0, "reward": 0.82940673828125, "reward_std": 0.012141291052103043, "rewards//mean": 0.82940673828125, "rewards//std": 0.03643876314163208, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0448, "grad_norm": 0.6014297008514404, "kl": 0.010654145269654691, "learning_rate": 4.984945885200679e-06, "loss": 0.0011, "num_tokens": 1462256.0, "reward": 0.85821533203125, "reward_std": 0.008870145305991173, "rewards//mean": 0.85821533203125, "rewards//std": 0.016872331500053406, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.045, "grad_norm": 0.5984156131744385, "kl": 0.011732026003301144, "learning_rate": 4.984771523609744e-06, "loss": 0.0012, "num_tokens": 1468736.0, "reward": 0.854248046875, "reward_std": 0.01087274868041277, "rewards//mean": 0.854248046875, "rewards//std": 0.03552206605672836, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0452, "grad_norm": 0.6330734491348267, "kl": 0.012012874241918325, "learning_rate": 4.9845961611531356e-06, "loss": 0.0012, "num_tokens": 1475256.0, "reward": 0.8272705078125, "reward_std": 0.012549136765301228, "rewards//mean": 0.8272705078125, "rewards//std": 0.027049588039517403, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0454, "grad_norm": 0.6217445135116577, "kl": 0.010952474898658693, "learning_rate": 4.984419797901491e-06, "loss": 0.0011, "num_tokens": 1481752.0, "reward": 0.80938720703125, "reward_std": 0.01106027141213417, "rewards//mean": 0.80938720703125, "rewards//std": 0.015995418652892113, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0456, "grad_norm": 0.5331505537033081, "kl": 0.011694666813127697, "learning_rate": 4.984242433925849e-06, "loss": 0.0012, "num_tokens": 1488312.0, "reward": 0.842041015625, "reward_std": 0.015316024422645569, "rewards//mean": 0.842041015625, "rewards//std": 0.055478423833847046, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0458, "grad_norm": 0.5423309803009033, "kl": 0.011228133691474795, "learning_rate": 4.984064069297652e-06, "loss": 0.0011, "num_tokens": 1494848.0, "reward": 0.8193359375, "reward_std": 0.01109884213656187, "rewards//mean": 0.8193359375, "rewards//std": 0.0388573482632637, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.046, "grad_norm": 0.6137152910232544, "kl": 0.013376098999287933, "learning_rate": 4.983884704088745e-06, "loss": 0.0013, "num_tokens": 1501336.0, "reward": 0.80975341796875, "reward_std": 0.01384538970887661, "rewards//mean": 0.80975341796875, "rewards//std": 0.032224390655756, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0462, "grad_norm": 0.578719973564148, "kl": 0.012490722816437483, "learning_rate": 4.983704338371375e-06, "loss": 0.0012, "num_tokens": 1507872.0, "reward": 0.8492431640625, "reward_std": 0.013666566461324692, "rewards//mean": 0.8492431640625, "rewards//std": 0.03480464220046997, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0464, "grad_norm": 0.5728334784507751, "kl": 0.013004794949665666, "learning_rate": 4.983522972218196e-06, "loss": 0.0013, "num_tokens": 1514504.0, "reward": 0.85906982421875, "reward_std": 0.015117382630705833, "rewards//mean": 0.85906982421875, "rewards//std": 0.02659662440419197, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0466, "grad_norm": 0.5708733201026917, "kl": 0.010296467458829284, "learning_rate": 4.983340605702261e-06, "loss": 0.001, "num_tokens": 1521152.0, "reward": 0.79461669921875, "reward_std": 0.011611716821789742, "rewards//mean": 0.79461669921875, "rewards//std": 0.027777044102549553, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0468, "grad_norm": 0.5398491621017456, "kl": 0.010386907437350601, "learning_rate": 4.983157238897026e-06, "loss": 0.001, "num_tokens": 1527624.0, "reward": 0.82073974609375, "reward_std": 0.009576220065355301, "rewards//mean": 0.82073974609375, "rewards//std": 0.01762274280190468, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.047, "grad_norm": 0.6520707011222839, "kl": 0.011667132377624512, "learning_rate": 4.982972871876353e-06, "loss": 0.0012, "num_tokens": 1534072.0, "reward": 0.877685546875, "reward_std": 0.014000840485095978, "rewards//mean": 0.877685546875, "rewards//std": 0.0195234976708889, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0472, "grad_norm": 0.5892413854598999, "kl": 0.010782511322759092, "learning_rate": 4.982787504714503e-06, "loss": 0.0011, "num_tokens": 1540600.0, "reward": 0.8544921875, "reward_std": 0.015170978382229805, "rewards//mean": 0.8544921875, "rewards//std": 0.03643839806318283, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0474, "grad_norm": 0.6138961315155029, "kl": 0.01105735485907644, "learning_rate": 4.982601137486144e-06, "loss": 0.0011, "num_tokens": 1547024.0, "reward": 0.8653564453125, "reward_std": 0.015798892825841904, "rewards//mean": 0.8653564453125, "rewards//std": 0.0336889922618866, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0476, "grad_norm": 0.5267355442047119, "kl": 0.011293756659142673, "learning_rate": 4.9824137702663424e-06, "loss": 0.0011, "num_tokens": 1553528.0, "reward": 0.856689453125, "reward_std": 0.013259888626635075, "rewards//mean": 0.856689453125, "rewards//std": 0.02487991936504841, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0478, "grad_norm": 0.5977954864501953, "kl": 0.012308461940847337, "learning_rate": 4.982225403130572e-06, "loss": 0.0012, "num_tokens": 1560032.0, "reward": 0.8436279296875, "reward_std": 0.01567743718624115, "rewards//mean": 0.8436279296875, "rewards//std": 0.028501469641923904, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.048, "grad_norm": 0.5983859300613403, "kl": 0.011517034086864442, "learning_rate": 4.982036036154706e-06, "loss": 0.0012, "num_tokens": 1566608.0, "reward": 0.81439208984375, "reward_std": 0.015477526932954788, "rewards//mean": 0.81439208984375, "rewards//std": 0.034139085561037064, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0482, "grad_norm": 0.6366744637489319, "kl": 0.010967534384690225, "learning_rate": 4.981845669415022e-06, "loss": 0.0011, "num_tokens": 1573104.0, "reward": 0.79840087890625, "reward_std": 0.013367927633225918, "rewards//mean": 0.79840087890625, "rewards//std": 0.030470000579953194, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0484, "grad_norm": 0.5575968027114868, "kl": 0.010500523203518242, "learning_rate": 4.981654302988198e-06, "loss": 0.0011, "num_tokens": 1579664.0, "reward": 0.80682373046875, "reward_std": 0.010329650714993477, "rewards//mean": 0.80682373046875, "rewards//std": 0.016217226162552834, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0486, "grad_norm": 0.5449930429458618, "kl": 0.013868011650629342, "learning_rate": 4.9814619369513184e-06, "loss": 0.0014, "num_tokens": 1586088.0, "reward": 0.8521728515625, "reward_std": 0.015307075344026089, "rewards//mean": 0.8521728515625, "rewards//std": 0.03274288401007652, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0488, "grad_norm": 0.6359149813652039, "kl": 0.011965149664320052, "learning_rate": 4.981268571381867e-06, "loss": 0.0012, "num_tokens": 1592592.0, "reward": 0.85205078125, "reward_std": 0.009656844660639763, "rewards//mean": 0.85205078125, "rewards//std": 0.02580341137945652, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.049, "grad_norm": 0.6533293724060059, "kl": 0.011329148430377245, "learning_rate": 4.981074206357732e-06, "loss": 0.0011, "num_tokens": 1599216.0, "reward": 0.80035400390625, "reward_std": 0.013833885081112385, "rewards//mean": 0.80035400390625, "rewards//std": 0.026191137731075287, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0492, "grad_norm": 0.5970718264579773, "kl": 0.01282750372774899, "learning_rate": 4.980878841957203e-06, "loss": 0.0013, "num_tokens": 1605792.0, "reward": 0.85223388671875, "reward_std": 0.01925882138311863, "rewards//mean": 0.85223388671875, "rewards//std": 0.029467342421412468, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0494, "grad_norm": 0.5749726295471191, "kl": 0.015270262490957975, "learning_rate": 4.980682478258973e-06, "loss": 0.0015, "num_tokens": 1612288.0, "reward": 0.86114501953125, "reward_std": 0.01426261942833662, "rewards//mean": 0.86114501953125, "rewards//std": 0.030289122834801674, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0496, "grad_norm": 0.6399112939834595, "kl": 0.012950829579494894, "learning_rate": 4.980485115342138e-06, "loss": 0.0013, "num_tokens": 1618808.0, "reward": 0.84722900390625, "reward_std": 0.010733772069215775, "rewards//mean": 0.84722900390625, "rewards//std": 0.015186639502644539, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0498, "grad_norm": 0.5356336236000061, "kl": 0.015245730872265995, "learning_rate": 4.980286753286196e-06, "loss": 0.0015, "num_tokens": 1625264.0, "reward": 0.79901123046875, "reward_std": 0.008075348101556301, "rewards//mean": 0.79901123046875, "rewards//std": 0.018752193078398705, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.05, "grad_norm": 0.6941496729850769, "kl": 0.013328676926903427, "learning_rate": 4.980087392171045e-06, "loss": 0.0013, "num_tokens": 1631784.0, "reward": 0.83270263671875, "reward_std": 0.015555602498352528, "rewards//mean": 0.83270263671875, "rewards//std": 0.04173377901315689, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0502, "grad_norm": 0.5972203612327576, "kl": 0.01535372855141759, "learning_rate": 4.9798870320769884e-06, "loss": 0.0015, "num_tokens": 1638272.0, "reward": 0.86041259765625, "reward_std": 0.01315420214086771, "rewards//mean": 0.86041259765625, "rewards//std": 0.031080909073352814, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0504, "grad_norm": 0.5607619881629944, "kl": 0.012482833117246628, "learning_rate": 4.979685673084733e-06, "loss": 0.0012, "num_tokens": 1644856.0, "reward": 0.8389892578125, "reward_std": 0.0162490364164114, "rewards//mean": 0.8389892578125, "rewards//std": 0.027370035648345947, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0506, "grad_norm": 0.6199642419815063, "kl": 0.013209497090429068, "learning_rate": 4.979483315275385e-06, "loss": 0.0013, "num_tokens": 1651312.0, "reward": 0.85308837890625, "reward_std": 0.015295417048037052, "rewards//mean": 0.85308837890625, "rewards//std": 0.024375824257731438, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0508, "grad_norm": 0.6028778553009033, "kl": 0.015642864862456918, "learning_rate": 4.979279958730454e-06, "loss": 0.0016, "num_tokens": 1657896.0, "reward": 0.8355712890625, "reward_std": 0.013927731662988663, "rewards//mean": 0.8355712890625, "rewards//std": 0.03703648969531059, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.051, "grad_norm": 0.6312702298164368, "kl": 0.014434279641136527, "learning_rate": 4.979075603531852e-06, "loss": 0.0014, "num_tokens": 1664464.0, "reward": 0.81414794921875, "reward_std": 0.011265669949352741, "rewards//mean": 0.81414794921875, "rewards//std": 0.01821327768266201, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0512, "grad_norm": 0.5943956971168518, "kl": 0.01590067707002163, "learning_rate": 4.978870249761893e-06, "loss": 0.0016, "num_tokens": 1670912.0, "reward": 0.88873291015625, "reward_std": 0.011189533397555351, "rewards//mean": 0.88873291015625, "rewards//std": 0.02351994626224041, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0514, "grad_norm": 0.6058782935142517, "kl": 0.015274307108484209, "learning_rate": 4.978663897503294e-06, "loss": 0.0015, "num_tokens": 1677448.0, "reward": 0.87750244140625, "reward_std": 0.013855671510100365, "rewards//mean": 0.87750244140625, "rewards//std": 0.029026882722973824, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0516, "grad_norm": 0.6686677932739258, "kl": 0.015571855707094073, "learning_rate": 4.978456546839175e-06, "loss": 0.0016, "num_tokens": 1683936.0, "reward": 0.83636474609375, "reward_std": 0.014321016147732735, "rewards//mean": 0.83636474609375, "rewards//std": 0.03732571005821228, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0518, "grad_norm": 0.5977317094802856, "kl": 0.015480601461604238, "learning_rate": 4.978248197853053e-06, "loss": 0.0015, "num_tokens": 1690536.0, "reward": 0.87762451171875, "reward_std": 0.014677740633487701, "rewards//mean": 0.87762451171875, "rewards//std": 0.029107604175806046, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.052, "grad_norm": 0.5967756509780884, "kl": 0.013380717020481825, "learning_rate": 4.978038850628855e-06, "loss": 0.0013, "num_tokens": 1697112.0, "reward": 0.8309326171875, "reward_std": 0.015575399622321129, "rewards//mean": 0.8309326171875, "rewards//std": 0.030181292444467545, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0522, "grad_norm": 0.6308935284614563, "kl": 0.017400558106601238, "learning_rate": 4.977828505250903e-06, "loss": 0.0017, "num_tokens": 1703664.0, "reward": 0.8321533203125, "reward_std": 0.00893273763358593, "rewards//mean": 0.8321533203125, "rewards//std": 0.016703374683856964, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0524, "grad_norm": 0.585950493812561, "kl": 0.01643908827099949, "learning_rate": 4.977617161803927e-06, "loss": 0.0016, "num_tokens": 1710240.0, "reward": 0.82623291015625, "reward_std": 0.012724083848297596, "rewards//mean": 0.82623291015625, "rewards//std": 0.02370458096265793, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0526, "grad_norm": 0.6090091466903687, "kl": 0.017514563747681677, "learning_rate": 4.977404820373053e-06, "loss": 0.0018, "num_tokens": 1716792.0, "reward": 0.86749267578125, "reward_std": 0.013316700235009193, "rewards//mean": 0.86749267578125, "rewards//std": 0.02335784211754799, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0528, "grad_norm": 0.653965413570404, "kl": 0.014788356143981218, "learning_rate": 4.977191481043814e-06, "loss": 0.0015, "num_tokens": 1723320.0, "reward": 0.850830078125, "reward_std": 0.016372665762901306, "rewards//mean": 0.850830078125, "rewards//std": 0.05733340233564377, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.053, "grad_norm": 0.6236534714698792, "kl": 0.020398969296365976, "learning_rate": 4.976977143902143e-06, "loss": 0.002, "num_tokens": 1729736.0, "reward": 0.8720703125, "reward_std": 0.017627835273742676, "rewards//mean": 0.8720703125, "rewards//std": 0.02900712378323078, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0532, "grad_norm": 0.6153638958930969, "kl": 0.022930864011868834, "learning_rate": 4.976761809034375e-06, "loss": 0.0023, "num_tokens": 1736176.0, "reward": 0.8203125, "reward_std": 0.010127585381269455, "rewards//mean": 0.8203125, "rewards//std": 0.023313162848353386, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0534, "grad_norm": 0.6875045895576477, "kl": 0.02023582032416016, "learning_rate": 4.976545476527246e-06, "loss": 0.002, "num_tokens": 1742640.0, "reward": 0.84423828125, "reward_std": 0.01115519367158413, "rewards//mean": 0.84423828125, "rewards//std": 0.03886669874191284, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0536, "grad_norm": 0.5966421365737915, "kl": 0.018444908084347844, "learning_rate": 4.976328146467895e-06, "loss": 0.0018, "num_tokens": 1749224.0, "reward": 0.8155517578125, "reward_std": 0.011006256565451622, "rewards//mean": 0.8155517578125, "rewards//std": 0.02147979475557804, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0538, "grad_norm": 0.6186309456825256, "kl": 0.019571573473513126, "learning_rate": 4.976109818943863e-06, "loss": 0.002, "num_tokens": 1755704.0, "reward": 0.8206787109375, "reward_std": 0.008112533017992973, "rewards//mean": 0.8206787109375, "rewards//std": 0.021176017820835114, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.054, "grad_norm": 0.611027181148529, "kl": 0.016425883513875306, "learning_rate": 4.975890494043092e-06, "loss": 0.0016, "num_tokens": 1762216.0, "reward": 0.811767578125, "reward_std": 0.011278307065367699, "rewards//mean": 0.811767578125, "rewards//std": 0.017328334972262383, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0542, "grad_norm": 0.5591591596603394, "kl": 0.015827651717700064, "learning_rate": 4.975670171853926e-06, "loss": 0.0016, "num_tokens": 1768744.0, "reward": 0.85272216796875, "reward_std": 0.011784248054027557, "rewards//mean": 0.85272216796875, "rewards//std": 0.01991415023803711, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0544, "grad_norm": 0.5942398905754089, "kl": 0.016672777361236513, "learning_rate": 4.975448852465111e-06, "loss": 0.0017, "num_tokens": 1775304.0, "reward": 0.845703125, "reward_std": 0.010333438403904438, "rewards//mean": 0.845703125, "rewards//std": 0.02365351840853691, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0546, "grad_norm": 0.5608808994293213, "kl": 0.021653135307133198, "learning_rate": 4.975226535965795e-06, "loss": 0.0022, "num_tokens": 1781808.0, "reward": 0.84881591796875, "reward_std": 0.018758047372102737, "rewards//mean": 0.84881591796875, "rewards//std": 0.03514370694756508, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0548, "grad_norm": 0.6057482361793518, "kl": 0.019426355720497668, "learning_rate": 4.975003222445525e-06, "loss": 0.0019, "num_tokens": 1788288.0, "reward": 0.773193359375, "reward_std": 0.010500762611627579, "rewards//mean": 0.773193359375, "rewards//std": 0.020420897752046585, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.055, "grad_norm": 0.7404553890228271, "kl": 0.017519006971269846, "learning_rate": 4.974778911994254e-06, "loss": 0.0018, "num_tokens": 1794840.0, "reward": 0.85693359375, "reward_std": 0.011788511648774147, "rewards//mean": 0.85693359375, "rewards//std": 0.029548974707722664, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0552, "grad_norm": 0.586951732635498, "kl": 0.017076579038985074, "learning_rate": 4.974553604702332e-06, "loss": 0.0017, "num_tokens": 1801368.0, "reward": 0.84478759765625, "reward_std": 0.011476319283246994, "rewards//mean": 0.84478759765625, "rewards//std": 0.022121649235486984, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0554, "grad_norm": 0.6556093692779541, "kl": 0.016207139007747173, "learning_rate": 4.974327300660515e-06, "loss": 0.0016, "num_tokens": 1807880.0, "reward": 0.81658935546875, "reward_std": 0.010874106548726559, "rewards//mean": 0.81658935546875, "rewards//std": 0.02728992886841297, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0556, "grad_norm": 0.6637612581253052, "kl": 0.02063511044252664, "learning_rate": 4.974099999959957e-06, "loss": 0.0021, "num_tokens": 1814448.0, "reward": 0.85821533203125, "reward_std": 0.008319644257426262, "rewards//mean": 0.85821533203125, "rewards//std": 0.01655350811779499, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0558, "grad_norm": 0.5973981618881226, "kl": 0.020477684447541833, "learning_rate": 4.973871702692215e-06, "loss": 0.002, "num_tokens": 1820936.0, "reward": 0.849609375, "reward_std": 0.010854586958885193, "rewards//mean": 0.849609375, "rewards//std": 0.03117240034043789, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.056, "grad_norm": 0.5967226624488831, "kl": 0.023090009344741702, "learning_rate": 4.973642408949247e-06, "loss": 0.0023, "num_tokens": 1827440.0, "reward": 0.804931640625, "reward_std": 0.009756384417414665, "rewards//mean": 0.804931640625, "rewards//std": 0.015545355156064034, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0562, "grad_norm": 0.6917018890380859, "kl": 0.021913145435974002, "learning_rate": 4.9734121188234115e-06, "loss": 0.0022, "num_tokens": 1833984.0, "reward": 0.84295654296875, "reward_std": 0.01435273140668869, "rewards//mean": 0.84295654296875, "rewards//std": 0.024912068620324135, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0564, "grad_norm": 0.6409370303153992, "kl": 0.020682931412011385, "learning_rate": 4.973180832407471e-06, "loss": 0.0021, "num_tokens": 1840440.0, "reward": 0.7938232421875, "reward_std": 0.008664321154356003, "rewards//mean": 0.7938232421875, "rewards//std": 0.015801778063178062, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0566, "grad_norm": 0.5642397999763489, "kl": 0.016882274649105966, "learning_rate": 4.972948549794587e-06, "loss": 0.0017, "num_tokens": 1846928.0, "reward": 0.8372802734375, "reward_std": 0.00897817499935627, "rewards//mean": 0.8372802734375, "rewards//std": 0.020448677241802216, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0568, "grad_norm": 0.5508286952972412, "kl": 0.018819021992385387, "learning_rate": 4.972715271078323e-06, "loss": 0.0019, "num_tokens": 1853400.0, "reward": 0.8465576171875, "reward_std": 0.012325981631875038, "rewards//mean": 0.8465576171875, "rewards//std": 0.026073908433318138, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.057, "grad_norm": 0.5768721103668213, "kl": 0.020360214170068502, "learning_rate": 4.972480996352644e-06, "loss": 0.002, "num_tokens": 1859832.0, "reward": 0.83929443359375, "reward_std": 0.011263377964496613, "rewards//mean": 0.83929443359375, "rewards//std": 0.026592640206217766, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0572, "grad_norm": 0.5812131762504578, "kl": 0.016725558903999627, "learning_rate": 4.9722457257119144e-06, "loss": 0.0017, "num_tokens": 1866384.0, "reward": 0.822265625, "reward_std": 0.008006171323359013, "rewards//mean": 0.822265625, "rewards//std": 0.01790745183825493, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0574, "grad_norm": 0.6562964916229248, "kl": 0.022804646170698106, "learning_rate": 4.972009459250903e-06, "loss": 0.0023, "num_tokens": 1872928.0, "reward": 0.82635498046875, "reward_std": 0.011693461798131466, "rewards//mean": 0.82635498046875, "rewards//std": 0.03444236144423485, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0576, "grad_norm": 0.7030390501022339, "kl": 0.021410147426649928, "learning_rate": 4.971772197064776e-06, "loss": 0.0021, "num_tokens": 1879496.0, "reward": 0.8388671875, "reward_std": 0.012064478360116482, "rewards//mean": 0.8388671875, "rewards//std": 0.020611261948943138, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0578, "grad_norm": 0.6291245222091675, "kl": 0.017788728117011487, "learning_rate": 4.971533939249105e-06, "loss": 0.0018, "num_tokens": 1886096.0, "reward": 0.847412109375, "reward_std": 0.00863015465438366, "rewards//mean": 0.847412109375, "rewards//std": 0.022807708010077477, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.058, "grad_norm": 0.5937075614929199, "kl": 0.019959610304795206, "learning_rate": 4.9712946858998576e-06, "loss": 0.002, "num_tokens": 1892696.0, "reward": 0.86376953125, "reward_std": 0.00976363942027092, "rewards//mean": 0.86376953125, "rewards//std": 0.020237719640135765, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0582, "grad_norm": 0.7244158983230591, "kl": 0.02064157126005739, "learning_rate": 4.971054437113406e-06, "loss": 0.0021, "num_tokens": 1899240.0, "reward": 0.85516357421875, "reward_std": 0.012879934161901474, "rewards//mean": 0.85516357421875, "rewards//std": 0.036751486361026764, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0584, "grad_norm": 0.6756367087364197, "kl": 0.02059971122071147, "learning_rate": 4.9708131929865235e-06, "loss": 0.0021, "num_tokens": 1905696.0, "reward": 0.8631591796875, "reward_std": 0.019185233861207962, "rewards//mean": 0.8631591796875, "rewards//std": 0.03356654942035675, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0586, "grad_norm": 0.6094867587089539, "kl": 0.02083066711202264, "learning_rate": 4.970570953616383e-06, "loss": 0.0021, "num_tokens": 1912248.0, "reward": 0.87664794921875, "reward_std": 0.012814180925488472, "rewards//mean": 0.87664794921875, "rewards//std": 0.023896027356386185, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0588, "grad_norm": 0.673071563243866, "kl": 0.017373336595483124, "learning_rate": 4.970327719100556e-06, "loss": 0.0017, "num_tokens": 1918816.0, "reward": 0.8592529296875, "reward_std": 0.011386911384761333, "rewards//mean": 0.8592529296875, "rewards//std": 0.029031902551651, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.059, "grad_norm": 0.6086812615394592, "kl": 0.019332718802616, "learning_rate": 4.970083489537021e-06, "loss": 0.0019, "num_tokens": 1925240.0, "reward": 0.86419677734375, "reward_std": 0.017809508368372917, "rewards//mean": 0.86419677734375, "rewards//std": 0.02093472145497799, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0592, "grad_norm": 0.6028139591217041, "kl": 0.015487979399040341, "learning_rate": 4.96983826502415e-06, "loss": 0.0015, "num_tokens": 1931848.0, "reward": 0.84808349609375, "reward_std": 0.01572677493095398, "rewards//mean": 0.84808349609375, "rewards//std": 0.03593721240758896, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0594, "grad_norm": 0.6186932921409607, "kl": 0.014006961951963603, "learning_rate": 4.969592045660723e-06, "loss": 0.0014, "num_tokens": 1938352.0, "reward": 0.87347412109375, "reward_std": 0.016976596787571907, "rewards//mean": 0.87347412109375, "rewards//std": 0.03520524874329567, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0596, "grad_norm": 0.5595616698265076, "kl": 0.019682875834405422, "learning_rate": 4.969344831545914e-06, "loss": 0.002, "num_tokens": 1944976.0, "reward": 0.85174560546875, "reward_std": 0.013488125056028366, "rewards//mean": 0.85174560546875, "rewards//std": 0.023379866033792496, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0598, "grad_norm": 0.6204686760902405, "kl": 0.020846938248723745, "learning_rate": 4.969096622779303e-06, "loss": 0.0021, "num_tokens": 1951480.0, "reward": 0.85546875, "reward_std": 0.017438407987356186, "rewards//mean": 0.85546875, "rewards//std": 0.05409525707364082, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.06, "grad_norm": 0.6783480644226074, "kl": 0.021353234420530498, "learning_rate": 4.968847419460867e-06, "loss": 0.0021, "num_tokens": 1957992.0, "reward": 0.80029296875, "reward_std": 0.009782904759049416, "rewards//mean": 0.80029296875, "rewards//std": 0.024602141231298447, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0602, "grad_norm": 0.6517021656036377, "kl": 0.02009395882487297, "learning_rate": 4.968597221690986e-06, "loss": 0.002, "num_tokens": 1964584.0, "reward": 0.842041015625, "reward_std": 0.011664390563964844, "rewards//mean": 0.842041015625, "rewards//std": 0.023218169808387756, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0604, "grad_norm": 0.6198382377624512, "kl": 0.019521123496815562, "learning_rate": 4.96834602957044e-06, "loss": 0.002, "num_tokens": 1971112.0, "reward": 0.8695068359375, "reward_std": 0.02505212277173996, "rewards//mean": 0.8695068359375, "rewards//std": 0.037069171667099, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0606, "grad_norm": 0.5796791315078735, "kl": 0.017055419622920454, "learning_rate": 4.968093843200407e-06, "loss": 0.0017, "num_tokens": 1977768.0, "reward": 0.84686279296875, "reward_std": 0.011925576254725456, "rewards//mean": 0.84686279296875, "rewards//std": 0.04024247080087662, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0608, "grad_norm": 0.6452294588088989, "kl": 0.02542811818420887, "learning_rate": 4.96784066268247e-06, "loss": 0.0025, "num_tokens": 1984296.0, "reward": 0.8172607421875, "reward_std": 0.0118525680154562, "rewards//mean": 0.8172607421875, "rewards//std": 0.019493624567985535, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.061, "grad_norm": 0.5470280647277832, "kl": 0.017710189567878842, "learning_rate": 4.967586488118609e-06, "loss": 0.0018, "num_tokens": 1990848.0, "reward": 0.8353271484375, "reward_std": 0.012310231104493141, "rewards//mean": 0.8353271484375, "rewards//std": 0.019290665164589882, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0612, "grad_norm": 0.5871063470840454, "kl": 0.018388585303910077, "learning_rate": 4.967331319611206e-06, "loss": 0.0018, "num_tokens": 1997344.0, "reward": 0.829345703125, "reward_std": 0.009855730459094048, "rewards//mean": 0.829345703125, "rewards//std": 0.0323907844722271, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0614, "grad_norm": 0.5350654721260071, "kl": 0.020239924429915845, "learning_rate": 4.9670751572630425e-06, "loss": 0.002, "num_tokens": 2003848.0, "reward": 0.82427978515625, "reward_std": 0.011770764365792274, "rewards//mean": 0.82427978515625, "rewards//std": 0.027529064565896988, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0616, "grad_norm": 0.597829282283783, "kl": 0.022617788519710302, "learning_rate": 4.9668180011773e-06, "loss": 0.0023, "num_tokens": 2010360.0, "reward": 0.80657958984375, "reward_std": 0.013200776651501656, "rewards//mean": 0.80657958984375, "rewards//std": 0.03527655452489853, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0618, "grad_norm": 0.6784936785697937, "kl": 0.02636838029138744, "learning_rate": 4.966559851457562e-06, "loss": 0.0026, "num_tokens": 2016808.0, "reward": 0.742431640625, "reward_std": 0.013185866177082062, "rewards//mean": 0.742431640625, "rewards//std": 0.02122350223362446, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.062, "grad_norm": 0.6267093420028687, "kl": 0.02070951892528683, "learning_rate": 4.966300708207811e-06, "loss": 0.0021, "num_tokens": 2023344.0, "reward": 0.82403564453125, "reward_std": 0.01098247803747654, "rewards//mean": 0.82403564453125, "rewards//std": 0.030690282583236694, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0622, "grad_norm": 0.570456862449646, "kl": 0.022752946242690086, "learning_rate": 4.96604057153243e-06, "loss": 0.0023, "num_tokens": 2029784.0, "reward": 0.858154296875, "reward_std": 0.012154900468885899, "rewards//mean": 0.858154296875, "rewards//std": 0.042659733444452286, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0624, "grad_norm": 0.608584463596344, "kl": 0.02454413822852075, "learning_rate": 4.965779441536202e-06, "loss": 0.0025, "num_tokens": 2036360.0, "reward": 0.825439453125, "reward_std": 0.013766555115580559, "rewards//mean": 0.825439453125, "rewards//std": 0.04080842435359955, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0626, "grad_norm": 0.5525429844856262, "kl": 0.019763967022299767, "learning_rate": 4.965517318324308e-06, "loss": 0.002, "num_tokens": 2042848.0, "reward": 0.707763671875, "reward_std": 0.012647854164242744, "rewards//mean": 0.707763671875, "rewards//std": 0.031870659440755844, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0628, "grad_norm": 0.6055047512054443, "kl": 0.021872437093406916, "learning_rate": 4.965254202002334e-06, "loss": 0.0022, "num_tokens": 2049312.0, "reward": 0.880126953125, "reward_std": 0.01076760608702898, "rewards//mean": 0.880126953125, "rewards//std": 0.024188879877328873, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.063, "grad_norm": 0.6491442322731018, "kl": 0.018391093239188194, "learning_rate": 4.964990092676263e-06, "loss": 0.0018, "num_tokens": 2055776.0, "reward": 0.83758544921875, "reward_std": 0.014810843393206596, "rewards//mean": 0.83758544921875, "rewards//std": 0.028338147327303886, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0632, "grad_norm": 0.5824354887008667, "kl": 0.024069178150966763, "learning_rate": 4.964724990452476e-06, "loss": 0.0024, "num_tokens": 2062288.0, "reward": 0.8619384765625, "reward_std": 0.011154308915138245, "rewards//mean": 0.8619384765625, "rewards//std": 0.025384364649653435, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0634, "grad_norm": 0.6070606708526611, "kl": 0.02114152815192938, "learning_rate": 4.9644588954377595e-06, "loss": 0.0021, "num_tokens": 2068864.0, "reward": 0.806640625, "reward_std": 0.006708196830004454, "rewards//mean": 0.806640625, "rewards//std": 0.01836155168712139, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0636, "grad_norm": 0.6098837852478027, "kl": 0.019263736554421484, "learning_rate": 4.964191807739293e-06, "loss": 0.0019, "num_tokens": 2075320.0, "reward": 0.85296630859375, "reward_std": 0.012227406725287437, "rewards//mean": 0.85296630859375, "rewards//std": 0.020224157720804214, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0638, "grad_norm": 0.6126257181167603, "kl": 0.02082336728926748, "learning_rate": 4.963923727464661e-06, "loss": 0.0021, "num_tokens": 2081848.0, "reward": 0.84228515625, "reward_std": 0.015138156712055206, "rewards//mean": 0.84228515625, "rewards//std": 0.02339095063507557, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.064, "grad_norm": 0.6214805245399475, "kl": 0.018937196349725127, "learning_rate": 4.963654654721848e-06, "loss": 0.0019, "num_tokens": 2088320.0, "reward": 0.86175537109375, "reward_std": 0.011250907555222511, "rewards//mean": 0.86175537109375, "rewards//std": 0.025321299210190773, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0642, "grad_norm": 0.6301703453063965, "kl": 0.021705118124373257, "learning_rate": 4.963384589619233e-06, "loss": 0.0022, "num_tokens": 2094792.0, "reward": 0.861328125, "reward_std": 0.015380768105387688, "rewards//mean": 0.861328125, "rewards//std": 0.022184601053595543, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0644, "grad_norm": 0.5786391496658325, "kl": 0.022070253267884254, "learning_rate": 4.9631135322656e-06, "loss": 0.0022, "num_tokens": 2101216.0, "reward": 0.80401611328125, "reward_std": 0.01128392294049263, "rewards//mean": 0.80401611328125, "rewards//std": 0.01851334050297737, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0646, "grad_norm": 0.6821287274360657, "kl": 0.021000708919018507, "learning_rate": 4.962841482770131e-06, "loss": 0.0021, "num_tokens": 2107696.0, "reward": 0.8448486328125, "reward_std": 0.014750917442142963, "rewards//mean": 0.8448486328125, "rewards//std": 0.02973664551973343, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0648, "grad_norm": 0.6906484961509705, "kl": 0.020665901945903897, "learning_rate": 4.962568441242408e-06, "loss": 0.0021, "num_tokens": 2114184.0, "reward": 0.85546875, "reward_std": 0.014176122844219208, "rewards//mean": 0.85546875, "rewards//std": 0.0196856502443552, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.065, "grad_norm": 0.6005605459213257, "kl": 0.019298722269013524, "learning_rate": 4.962294407792411e-06, "loss": 0.0019, "num_tokens": 2120744.0, "reward": 0.81097412109375, "reward_std": 0.008400456979870796, "rewards//mean": 0.81097412109375, "rewards//std": 0.01867048256099224, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0652, "grad_norm": 0.6110001802444458, "kl": 0.021721608005464077, "learning_rate": 4.962019382530521e-06, "loss": 0.0022, "num_tokens": 2127184.0, "reward": 0.854248046875, "reward_std": 0.009410550817847252, "rewards//mean": 0.854248046875, "rewards//std": 0.041572827845811844, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0654, "grad_norm": 0.6329718828201294, "kl": 0.02375357341952622, "learning_rate": 4.961743365567517e-06, "loss": 0.0024, "num_tokens": 2133728.0, "reward": 0.8387451171875, "reward_std": 0.015492910519242287, "rewards//mean": 0.8387451171875, "rewards//std": 0.03223032131791115, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0656, "grad_norm": 0.5807815194129944, "kl": 0.02455346193164587, "learning_rate": 4.961466357014581e-06, "loss": 0.0025, "num_tokens": 2140152.0, "reward": 0.86456298828125, "reward_std": 0.013275086879730225, "rewards//mean": 0.86456298828125, "rewards//std": 0.021598830819129944, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0658, "grad_norm": 0.6281384825706482, "kl": 0.020192489260807633, "learning_rate": 4.961188356983291e-06, "loss": 0.002, "num_tokens": 2146656.0, "reward": 0.86334228515625, "reward_std": 0.010247575119137764, "rewards//mean": 0.86334228515625, "rewards//std": 0.017792008817195892, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.066, "grad_norm": 0.6021639108657837, "kl": 0.019850074197165668, "learning_rate": 4.960909365585624e-06, "loss": 0.002, "num_tokens": 2153152.0, "reward": 0.83782958984375, "reward_std": 0.010318451561033726, "rewards//mean": 0.83782958984375, "rewards//std": 0.019722430035471916, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0662, "grad_norm": 0.6681183576583862, "kl": 0.01899259549099952, "learning_rate": 4.960629382933959e-06, "loss": 0.0019, "num_tokens": 2159784.0, "reward": 0.830322265625, "reward_std": 0.011032642796635628, "rewards//mean": 0.830322265625, "rewards//std": 0.02663821168243885, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0664, "grad_norm": 0.6400523781776428, "kl": 0.01971001864876598, "learning_rate": 4.960348409141074e-06, "loss": 0.002, "num_tokens": 2166248.0, "reward": 0.82122802734375, "reward_std": 0.011383827775716782, "rewards//mean": 0.82122802734375, "rewards//std": 0.017576295882463455, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0666, "grad_norm": 0.7005535960197449, "kl": 0.021976487361826003, "learning_rate": 4.960066444320143e-06, "loss": 0.0022, "num_tokens": 2172712.0, "reward": 0.84197998046875, "reward_std": 0.017401963472366333, "rewards//mean": 0.84197998046875, "rewards//std": 0.023714158684015274, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0668, "grad_norm": 0.5731592178344727, "kl": 0.018848864710889757, "learning_rate": 4.959783488584743e-06, "loss": 0.0019, "num_tokens": 2179336.0, "reward": 0.83154296875, "reward_std": 0.011851711198687553, "rewards//mean": 0.83154296875, "rewards//std": 0.02426510863006115, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.067, "grad_norm": 0.6444560289382935, "kl": 0.022729619406163692, "learning_rate": 4.9594995420488475e-06, "loss": 0.0023, "num_tokens": 2185896.0, "reward": 0.85693359375, "reward_std": 0.009836044162511826, "rewards//mean": 0.85693359375, "rewards//std": 0.019704097881913185, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0672, "grad_norm": 0.823472261428833, "kl": 0.024720686255022883, "learning_rate": 4.959214604826831e-06, "loss": 0.0025, "num_tokens": 2192408.0, "reward": 0.78509521484375, "reward_std": 0.011101828888058662, "rewards//mean": 0.78509521484375, "rewards//std": 0.020558221265673637, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0674, "grad_norm": 0.616445004940033, "kl": 0.020393103593960404, "learning_rate": 4.958928677033465e-06, "loss": 0.002, "num_tokens": 2199016.0, "reward": 0.84246826171875, "reward_std": 0.014257056638598442, "rewards//mean": 0.84246826171875, "rewards//std": 0.028340283781290054, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0676, "grad_norm": 0.6037515997886658, "kl": 0.020665846299380064, "learning_rate": 4.9586417587839225e-06, "loss": 0.0021, "num_tokens": 2205544.0, "reward": 0.80145263671875, "reward_std": 0.013119444251060486, "rewards//mean": 0.80145263671875, "rewards//std": 0.020688142627477646, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0678, "grad_norm": 0.7130603194236755, "kl": 0.022849680623039603, "learning_rate": 4.958353850193773e-06, "loss": 0.0023, "num_tokens": 2212120.0, "reward": 0.8408203125, "reward_std": 0.016479603946208954, "rewards//mean": 0.8408203125, "rewards//std": 0.02777874656021595, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.068, "grad_norm": 0.6459859013557434, "kl": 0.018527620006352663, "learning_rate": 4.958064951378988e-06, "loss": 0.0019, "num_tokens": 2218704.0, "reward": 0.84002685546875, "reward_std": 0.015253793448209763, "rewards//mean": 0.84002685546875, "rewards//std": 0.020368104800581932, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0682, "grad_norm": 0.6155900955200195, "kl": 0.019558688392862678, "learning_rate": 4.957775062455933e-06, "loss": 0.002, "num_tokens": 2225408.0, "reward": 0.83038330078125, "reward_std": 0.011874992400407791, "rewards//mean": 0.83038330078125, "rewards//std": 0.025683971121907234, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0684, "grad_norm": 0.6561264395713806, "kl": 0.022024541860446334, "learning_rate": 4.957484183541378e-06, "loss": 0.0022, "num_tokens": 2231888.0, "reward": 0.80804443359375, "reward_std": 0.013313915580511093, "rewards//mean": 0.80804443359375, "rewards//std": 0.02254057675600052, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0686, "grad_norm": 0.5750051736831665, "kl": 0.020300572272390127, "learning_rate": 4.957192314752487e-06, "loss": 0.002, "num_tokens": 2238424.0, "reward": 0.8651123046875, "reward_std": 0.009746493771672249, "rewards//mean": 0.8651123046875, "rewards//std": 0.02353000082075596, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0688, "grad_norm": 0.6531442999839783, "kl": 0.023013052181340754, "learning_rate": 4.9568994562068265e-06, "loss": 0.0023, "num_tokens": 2244936.0, "reward": 0.8524169921875, "reward_std": 0.012329033575952053, "rewards//mean": 0.8524169921875, "rewards//std": 0.025233639404177666, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.069, "grad_norm": 0.6909974813461304, "kl": 0.019845292437821627, "learning_rate": 4.9566056080223576e-06, "loss": 0.002, "num_tokens": 2251424.0, "reward": 0.82861328125, "reward_std": 0.014280682429671288, "rewards//mean": 0.82861328125, "rewards//std": 0.023245535790920258, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0692, "grad_norm": 0.6558386087417603, "kl": 0.01972190651576966, "learning_rate": 4.9563107703174444e-06, "loss": 0.002, "num_tokens": 2258032.0, "reward": 0.8741455078125, "reward_std": 0.014798332937061787, "rewards//mean": 0.8741455078125, "rewards//std": 0.03160810470581055, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0694, "grad_norm": 0.6157135963439941, "kl": 0.024010731372982264, "learning_rate": 4.956014943210845e-06, "loss": 0.0024, "num_tokens": 2264480.0, "reward": 0.81573486328125, "reward_std": 0.00979662872850895, "rewards//mean": 0.81573486328125, "rewards//std": 0.02577575109899044, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0696, "grad_norm": 0.6779954433441162, "kl": 0.023366793291643262, "learning_rate": 4.9557181268217225e-06, "loss": 0.0023, "num_tokens": 2271016.0, "reward": 0.837890625, "reward_std": 0.016689497977495193, "rewards//mean": 0.837890625, "rewards//std": 0.01790745183825493, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0698, "grad_norm": 0.6324844360351562, "kl": 0.01984335097949952, "learning_rate": 4.9554203212696304e-06, "loss": 0.002, "num_tokens": 2277560.0, "reward": 0.83612060546875, "reward_std": 0.016164684668183327, "rewards//mean": 0.83612060546875, "rewards//std": 0.025262044742703438, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.07, "grad_norm": 0.6446642875671387, "kl": 0.01843974506482482, "learning_rate": 4.955121526674528e-06, "loss": 0.0018, "num_tokens": 2284056.0, "reward": 0.85296630859375, "reward_std": 0.01069987565279007, "rewards//mean": 0.85296630859375, "rewards//std": 0.01646088808774948, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0702, "grad_norm": 0.6699240803718567, "kl": 0.020488249836489558, "learning_rate": 4.9548217431567665e-06, "loss": 0.002, "num_tokens": 2290576.0, "reward": 0.818603515625, "reward_std": 0.015040895901620388, "rewards//mean": 0.818603515625, "rewards//std": 0.024278830736875534, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0704, "grad_norm": 0.6167379021644592, "kl": 0.020811604452319443, "learning_rate": 4.9545209708371025e-06, "loss": 0.0021, "num_tokens": 2297096.0, "reward": 0.828125, "reward_std": 0.012585700489580631, "rewards//mean": 0.828125, "rewards//std": 0.03919249400496483, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0706, "grad_norm": 0.6443126797676086, "kl": 0.021438468480482697, "learning_rate": 4.9542192098366835e-06, "loss": 0.0021, "num_tokens": 2303560.0, "reward": 0.82696533203125, "reward_std": 0.008636537939310074, "rewards//mean": 0.82696533203125, "rewards//std": 0.01631772518157959, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0708, "grad_norm": 0.7037899494171143, "kl": 0.02008171903435141, "learning_rate": 4.95391646027706e-06, "loss": 0.002, "num_tokens": 2310104.0, "reward": 0.826416015625, "reward_std": 0.010460684075951576, "rewards//mean": 0.826416015625, "rewards//std": 0.01779721863567829, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.071, "grad_norm": 0.6334845423698425, "kl": 0.022552308510057628, "learning_rate": 4.953612722280181e-06, "loss": 0.0023, "num_tokens": 2316616.0, "reward": 0.81109619140625, "reward_std": 0.013959594070911407, "rewards//mean": 0.81109619140625, "rewards//std": 0.022578153759241104, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0712, "grad_norm": 0.6384918093681335, "kl": 0.021024183952249587, "learning_rate": 4.953307995968391e-06, "loss": 0.0021, "num_tokens": 2323224.0, "reward": 0.8253173828125, "reward_std": 0.011187070980668068, "rewards//mean": 0.8253173828125, "rewards//std": 0.02631431631743908, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0714, "grad_norm": 0.6690114736557007, "kl": 0.021767844446003437, "learning_rate": 4.953002281464432e-06, "loss": 0.0022, "num_tokens": 2329728.0, "reward": 0.82196044921875, "reward_std": 0.010448087006807327, "rewards//mean": 0.82196044921875, "rewards//std": 0.016960028558969498, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0716, "grad_norm": 0.6867706775665283, "kl": 0.0235063168220222, "learning_rate": 4.952695578891449e-06, "loss": 0.0024, "num_tokens": 2336144.0, "reward": 0.861328125, "reward_std": 0.012647712603211403, "rewards//mean": 0.861328125, "rewards//std": 0.024220149964094162, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0718, "grad_norm": 0.6281720995903015, "kl": 0.01960526150651276, "learning_rate": 4.9523878883729794e-06, "loss": 0.002, "num_tokens": 2342616.0, "reward": 0.84552001953125, "reward_std": 0.019969893619418144, "rewards//mean": 0.84552001953125, "rewards//std": 0.031257204711437225, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.072, "grad_norm": 0.5789004564285278, "kl": 0.021749505307525396, "learning_rate": 4.952079210032962e-06, "loss": 0.0022, "num_tokens": 2349200.0, "reward": 0.84375, "reward_std": 0.012675793841481209, "rewards//mean": 0.84375, "rewards//std": 0.021281909197568893, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0722, "grad_norm": 0.591282069683075, "kl": 0.025245020631700754, "learning_rate": 4.951769543995731e-06, "loss": 0.0025, "num_tokens": 2355608.0, "reward": 0.8421630859375, "reward_std": 0.0136952493339777, "rewards//mean": 0.8421630859375, "rewards//std": 0.019546357914805412, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0724, "grad_norm": 0.5938491821289062, "kl": 0.019946447922848165, "learning_rate": 4.951458890386021e-06, "loss": 0.002, "num_tokens": 2362096.0, "reward": 0.84320068359375, "reward_std": 0.013788199983537197, "rewards//mean": 0.84320068359375, "rewards//std": 0.026068320497870445, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0726, "grad_norm": 0.626299262046814, "kl": 0.022490440169349313, "learning_rate": 4.951147249328964e-06, "loss": 0.0022, "num_tokens": 2368736.0, "reward": 0.85345458984375, "reward_std": 0.013921165838837624, "rewards//mean": 0.85345458984375, "rewards//std": 0.027107935398817062, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0728, "grad_norm": 0.6578425765037537, "kl": 0.022625273559242487, "learning_rate": 4.950834620950089e-06, "loss": 0.0023, "num_tokens": 2375176.0, "reward": 0.8509521484375, "reward_std": 0.014235584065318108, "rewards//mean": 0.8509521484375, "rewards//std": 0.03371234983205795, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.073, "grad_norm": 0.6105694770812988, "kl": 0.023182813776656985, "learning_rate": 4.9505210053753204e-06, "loss": 0.0023, "num_tokens": 2381712.0, "reward": 0.85064697265625, "reward_std": 0.010071905329823494, "rewards//mean": 0.85064697265625, "rewards//std": 0.03252411261200905, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0732, "grad_norm": 0.6019152402877808, "kl": 0.023740790085867047, "learning_rate": 4.950206402730984e-06, "loss": 0.0024, "num_tokens": 2388352.0, "reward": 0.8419189453125, "reward_std": 0.01870671659708023, "rewards//mean": 0.8419189453125, "rewards//std": 0.031086571514606476, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0734, "grad_norm": 0.6212697625160217, "kl": 0.023675142554566264, "learning_rate": 4.949890813143802e-06, "loss": 0.0024, "num_tokens": 2395024.0, "reward": 0.8397216796875, "reward_std": 0.015749718993902206, "rewards//mean": 0.8397216796875, "rewards//std": 0.037660665810108185, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0736, "grad_norm": 0.598435640335083, "kl": 0.025552398525178432, "learning_rate": 4.949574236740893e-06, "loss": 0.0026, "num_tokens": 2401528.0, "reward": 0.87725830078125, "reward_std": 0.01766745001077652, "rewards//mean": 0.87725830078125, "rewards//std": 0.03177117556333542, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0738, "grad_norm": 0.5873764157295227, "kl": 0.024055095855146646, "learning_rate": 4.949256673649774e-06, "loss": 0.0024, "num_tokens": 2408024.0, "reward": 0.8677978515625, "reward_std": 0.008664744906127453, "rewards//mean": 0.8677978515625, "rewards//std": 0.02137523889541626, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.074, "grad_norm": 0.5763117671012878, "kl": 0.023328717099502683, "learning_rate": 4.94893812399836e-06, "loss": 0.0023, "num_tokens": 2414536.0, "reward": 0.86468505859375, "reward_std": 0.012787047773599625, "rewards//mean": 0.86468505859375, "rewards//std": 0.03016442246735096, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0742, "grad_norm": 0.5695638060569763, "kl": 0.026884031016379595, "learning_rate": 4.948618587914963e-06, "loss": 0.0027, "num_tokens": 2420984.0, "reward": 0.843017578125, "reward_std": 0.013181064277887344, "rewards//mean": 0.843017578125, "rewards//std": 0.020994020625948906, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0744, "grad_norm": 0.7202668786048889, "kl": 0.02846267749555409, "learning_rate": 4.948298065528292e-06, "loss": 0.0028, "num_tokens": 2427496.0, "reward": 0.83990478515625, "reward_std": 0.012518524192273617, "rewards//mean": 0.83990478515625, "rewards//std": 0.017900580540299416, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0746, "grad_norm": 0.5586526393890381, "kl": 0.022693981183692813, "learning_rate": 4.947976556967452e-06, "loss": 0.0023, "num_tokens": 2434000.0, "reward": 0.83734130859375, "reward_std": 0.011747884564101696, "rewards//mean": 0.83734130859375, "rewards//std": 0.03377801179885864, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0748, "grad_norm": 0.6451295614242554, "kl": 0.02354696928523481, "learning_rate": 4.947654062361949e-06, "loss": 0.0024, "num_tokens": 2440632.0, "reward": 0.84344482421875, "reward_std": 0.01550312340259552, "rewards//mean": 0.84344482421875, "rewards//std": 0.027972545474767685, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.075, "grad_norm": 0.693091869354248, "kl": 0.030426556011661887, "learning_rate": 4.9473305818416805e-06, "loss": 0.003, "num_tokens": 2447120.0, "reward": 0.85211181640625, "reward_std": 0.013688227161765099, "rewards//mean": 0.85211181640625, "rewards//std": 0.026409853249788284, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0752, "grad_norm": 0.5874215960502625, "kl": 0.024489378090947866, "learning_rate": 4.947006115536947e-06, "loss": 0.0024, "num_tokens": 2453656.0, "reward": 0.81732177734375, "reward_std": 0.011487660929560661, "rewards//mean": 0.81732177734375, "rewards//std": 0.02587130106985569, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0754, "grad_norm": 0.6001226305961609, "kl": 0.026121094590052962, "learning_rate": 4.946680663578443e-06, "loss": 0.0026, "num_tokens": 2460248.0, "reward": 0.86529541015625, "reward_std": 0.011276507750153542, "rewards//mean": 0.86529541015625, "rewards//std": 0.02111041732132435, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0756, "grad_norm": 0.5803017020225525, "kl": 0.023147993255406618, "learning_rate": 4.946354226097261e-06, "loss": 0.0023, "num_tokens": 2466752.0, "reward": 0.86126708984375, "reward_std": 0.01178767066448927, "rewards//mean": 0.86126708984375, "rewards//std": 0.022498900070786476, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0758, "grad_norm": 0.6185136437416077, "kl": 0.023552838247269392, "learning_rate": 4.946026803224888e-06, "loss": 0.0024, "num_tokens": 2473256.0, "reward": 0.8831787109375, "reward_std": 0.012100385501980782, "rewards//mean": 0.8831787109375, "rewards//std": 0.020312009379267693, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.076, "grad_norm": 0.5861929059028625, "kl": 0.023264746530912817, "learning_rate": 4.945698395093212e-06, "loss": 0.0023, "num_tokens": 2479752.0, "reward": 0.86248779296875, "reward_std": 0.011219041422009468, "rewards//mean": 0.86248779296875, "rewards//std": 0.02088186889886856, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0762, "grad_norm": 0.619530439376831, "kl": 0.026569161796942353, "learning_rate": 4.9453690018345144e-06, "loss": 0.0027, "num_tokens": 2486256.0, "reward": 0.84765625, "reward_std": 0.012358471751213074, "rewards//mean": 0.84765625, "rewards//std": 0.02428007684648037, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0764, "grad_norm": 0.5811346769332886, "kl": 0.02320175990462303, "learning_rate": 4.9450386235814755e-06, "loss": 0.0023, "num_tokens": 2492904.0, "reward": 0.84619140625, "reward_std": 0.010218563489615917, "rewards//mean": 0.84619140625, "rewards//std": 0.02535841055214405, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0766, "grad_norm": 0.6582991480827332, "kl": 0.0316473871935159, "learning_rate": 4.944707260467172e-06, "loss": 0.0032, "num_tokens": 2499432.0, "reward": 0.83831787109375, "reward_std": 0.018780037760734558, "rewards//mean": 0.83831787109375, "rewards//std": 0.022376112639904022, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0768, "grad_norm": 0.634960412979126, "kl": 0.02587113855406642, "learning_rate": 4.944374912625076e-06, "loss": 0.0026, "num_tokens": 2505944.0, "reward": 0.85772705078125, "reward_std": 0.012217249721288681, "rewards//mean": 0.85772705078125, "rewards//std": 0.029163716360926628, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.077, "grad_norm": 0.6517542004585266, "kl": 0.026097522117197514, "learning_rate": 4.944041580189057e-06, "loss": 0.0026, "num_tokens": 2512432.0, "reward": 0.820556640625, "reward_std": 0.012241235002875328, "rewards//mean": 0.820556640625, "rewards//std": 0.024977076798677444, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0772, "grad_norm": 0.7040196061134338, "kl": 0.024413017323240638, "learning_rate": 4.943707263293382e-06, "loss": 0.0024, "num_tokens": 2518928.0, "reward": 0.8297119140625, "reward_std": 0.00819058995693922, "rewards//mean": 0.8297119140625, "rewards//std": 0.022083040326833725, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0774, "grad_norm": 0.6075165867805481, "kl": 0.028073451248928905, "learning_rate": 4.943371962072714e-06, "loss": 0.0028, "num_tokens": 2525416.0, "reward": 0.85955810546875, "reward_std": 0.015763863921165466, "rewards//mean": 0.85955810546875, "rewards//std": 0.02799093909561634, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0776, "grad_norm": 0.6392875909805298, "kl": 0.026283514220267534, "learning_rate": 4.9430356766621114e-06, "loss": 0.0026, "num_tokens": 2531888.0, "reward": 0.8511962890625, "reward_std": 0.011728147976100445, "rewards//mean": 0.8511962890625, "rewards//std": 0.017740566283464432, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0778, "grad_norm": 0.6550029516220093, "kl": 0.028789987321943045, "learning_rate": 4.942698407197031e-06, "loss": 0.0029, "num_tokens": 2538400.0, "reward": 0.818115234375, "reward_std": 0.011933239176869392, "rewards//mean": 0.818115234375, "rewards//std": 0.03468024730682373, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.078, "grad_norm": 0.6924291253089905, "kl": 0.029282074654474854, "learning_rate": 4.942360153813324e-06, "loss": 0.0029, "num_tokens": 2544920.0, "reward": 0.82562255859375, "reward_std": 0.010113751515746117, "rewards//mean": 0.82562255859375, "rewards//std": 0.031042898073792458, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0782, "grad_norm": 0.6393311619758606, "kl": 0.027023433009162545, "learning_rate": 4.9420209166472386e-06, "loss": 0.0027, "num_tokens": 2551416.0, "reward": 0.88250732421875, "reward_std": 0.013438566587865353, "rewards//mean": 0.88250732421875, "rewards//std": 0.032676879316568375, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0784, "grad_norm": 0.6631886959075928, "kl": 0.028520054882392287, "learning_rate": 4.9416806958354206e-06, "loss": 0.0029, "num_tokens": 2557888.0, "reward": 0.780517578125, "reward_std": 0.011763013899326324, "rewards//mean": 0.780517578125, "rewards//std": 0.030392462387681007, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0786, "grad_norm": 0.5338965058326721, "kl": 0.02854826208204031, "learning_rate": 4.9413394915149094e-06, "loss": 0.0029, "num_tokens": 2564472.0, "reward": 0.83892822265625, "reward_std": 0.01127648912370205, "rewards//mean": 0.83892822265625, "rewards//std": 0.025852570310235023, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0788, "grad_norm": 0.6340290904045105, "kl": 0.024304196937009692, "learning_rate": 4.940997303823144e-06, "loss": 0.0024, "num_tokens": 2570952.0, "reward": 0.78216552734375, "reward_std": 0.012750035151839256, "rewards//mean": 0.78216552734375, "rewards//std": 0.026763997972011566, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.079, "grad_norm": 0.6295396685600281, "kl": 0.02592435828410089, "learning_rate": 4.940654132897957e-06, "loss": 0.0026, "num_tokens": 2577448.0, "reward": 0.852294921875, "reward_std": 0.012507164850831032, "rewards//mean": 0.852294921875, "rewards//std": 0.027267245575785637, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0792, "grad_norm": 0.5991517305374146, "kl": 0.03304093307815492, "learning_rate": 4.940309978877576e-06, "loss": 0.0033, "num_tokens": 2584056.0, "reward": 0.822021484375, "reward_std": 0.008339623920619488, "rewards//mean": 0.822021484375, "rewards//std": 0.02647404372692108, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0794, "grad_norm": 0.5874148607254028, "kl": 0.027525671990588307, "learning_rate": 4.939964841900627e-06, "loss": 0.0028, "num_tokens": 2590496.0, "reward": 0.872314453125, "reward_std": 0.014325520023703575, "rewards//mean": 0.872314453125, "rewards//std": 0.03774357587099075, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0796, "grad_norm": 0.6355072855949402, "kl": 0.0354489772580564, "learning_rate": 4.9396187221061324e-06, "loss": 0.0035, "num_tokens": 2597056.0, "reward": 0.82666015625, "reward_std": 0.01194741204380989, "rewards//mean": 0.82666015625, "rewards//std": 0.015893952921032906, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0798, "grad_norm": 0.6167977452278137, "kl": 0.028267529793083668, "learning_rate": 4.939271619633508e-06, "loss": 0.0028, "num_tokens": 2603568.0, "reward": 0.82171630859375, "reward_std": 0.013992267660796642, "rewards//mean": 0.82171630859375, "rewards//std": 0.03480915725231171, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.08, "grad_norm": 0.6546652317047119, "kl": 0.026222907239571214, "learning_rate": 4.938923534622567e-06, "loss": 0.0026, "num_tokens": 2610160.0, "reward": 0.84625244140625, "reward_std": 0.011356605216860771, "rewards//mean": 0.84625244140625, "rewards//std": 0.029670584946870804, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0802, "grad_norm": 0.618540346622467, "kl": 0.02860379498451948, "learning_rate": 4.938574467213519e-06, "loss": 0.0029, "num_tokens": 2616672.0, "reward": 0.83953857421875, "reward_std": 0.011920612305402756, "rewards//mean": 0.83953857421875, "rewards//std": 0.01723625510931015, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0804, "grad_norm": 0.6380559206008911, "kl": 0.032505206996575, "learning_rate": 4.938224417546965e-06, "loss": 0.0033, "num_tokens": 2623240.0, "reward": 0.8426513671875, "reward_std": 0.012489142827689648, "rewards//mean": 0.8426513671875, "rewards//std": 0.02962239645421505, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0806, "grad_norm": 0.6414657235145569, "kl": 0.026258823927491903, "learning_rate": 4.937873385763909e-06, "loss": 0.0026, "num_tokens": 2629728.0, "reward": 0.82769775390625, "reward_std": 0.011077295988798141, "rewards//mean": 0.82769775390625, "rewards//std": 0.027260513976216316, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0808, "grad_norm": 0.672893762588501, "kl": 0.03017253940925002, "learning_rate": 4.9375213720057435e-06, "loss": 0.003, "num_tokens": 2636176.0, "reward": 0.82470703125, "reward_std": 0.007876119576394558, "rewards//mean": 0.82470703125, "rewards//std": 0.012955565936863422, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.081, "grad_norm": 0.6322163939476013, "kl": 0.030439491849392653, "learning_rate": 4.937168376414261e-06, "loss": 0.003, "num_tokens": 2642640.0, "reward": 0.84759521484375, "reward_std": 0.008020121604204178, "rewards//mean": 0.84759521484375, "rewards//std": 0.02140098437666893, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0812, "grad_norm": 0.6374755501747131, "kl": 0.030731099424883723, "learning_rate": 4.9368143991316485e-06, "loss": 0.0031, "num_tokens": 2649184.0, "reward": 0.86627197265625, "reward_std": 0.014455043710768223, "rewards//mean": 0.86627197265625, "rewards//std": 0.038125813007354736, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0814, "grad_norm": 0.712587833404541, "kl": 0.026140809408389032, "learning_rate": 4.936459440300487e-06, "loss": 0.0026, "num_tokens": 2655808.0, "reward": 0.85528564453125, "reward_std": 0.010381786152720451, "rewards//mean": 0.85528564453125, "rewards//std": 0.027791209518909454, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0816, "grad_norm": 0.6231846213340759, "kl": 0.02688520005904138, "learning_rate": 4.936103500063755e-06, "loss": 0.0027, "num_tokens": 2662344.0, "reward": 0.851806640625, "reward_std": 0.017744846642017365, "rewards//mean": 0.851806640625, "rewards//std": 0.04603494331240654, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0818, "grad_norm": 0.6654735803604126, "kl": 0.030552038457244635, "learning_rate": 4.935746578564825e-06, "loss": 0.0031, "num_tokens": 2668896.0, "reward": 0.85076904296875, "reward_std": 0.016120346263051033, "rewards//mean": 0.85076904296875, "rewards//std": 0.02107812464237213, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.082, "grad_norm": 0.6213189363479614, "kl": 0.029119898565113544, "learning_rate": 4.935388675947463e-06, "loss": 0.0029, "num_tokens": 2675448.0, "reward": 0.86163330078125, "reward_std": 0.014135929755866528, "rewards//mean": 0.86163330078125, "rewards//std": 0.029747523367404938, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0822, "grad_norm": 0.6235265135765076, "kl": 0.03541518189013004, "learning_rate": 4.935029792355834e-06, "loss": 0.0035, "num_tokens": 2681992.0, "reward": 0.84326171875, "reward_std": 0.01086876168847084, "rewards//mean": 0.84326171875, "rewards//std": 0.024592293426394463, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0824, "grad_norm": 0.6536543965339661, "kl": 0.030560293700546026, "learning_rate": 4.934669927934496e-06, "loss": 0.0031, "num_tokens": 2688488.0, "reward": 0.866455078125, "reward_std": 0.014581255614757538, "rewards//mean": 0.866455078125, "rewards//std": 0.02263716049492359, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0826, "grad_norm": 0.6777145862579346, "kl": 0.032293920405209064, "learning_rate": 4.9343090828284025e-06, "loss": 0.0032, "num_tokens": 2695024.0, "reward": 0.82843017578125, "reward_std": 0.009871330112218857, "rewards//mean": 0.82843017578125, "rewards//std": 0.016012445092201233, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0828, "grad_norm": 0.5964574813842773, "kl": 0.03125216974876821, "learning_rate": 4.933947257182901e-06, "loss": 0.0031, "num_tokens": 2701456.0, "reward": 0.821533203125, "reward_std": 0.007897812873125076, "rewards//mean": 0.821533203125, "rewards//std": 0.023611243814229965, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.083, "grad_norm": 0.6268133521080017, "kl": 0.03192015551030636, "learning_rate": 4.933584451143736e-06, "loss": 0.0032, "num_tokens": 2708000.0, "reward": 0.82464599609375, "reward_std": 0.01356738805770874, "rewards//mean": 0.82464599609375, "rewards//std": 0.029166312888264656, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0832, "grad_norm": 0.6093042492866516, "kl": 0.028882501646876335, "learning_rate": 4.933220664857045e-06, "loss": 0.0029, "num_tokens": 2714520.0, "reward": 0.86712646484375, "reward_std": 0.013428254052996635, "rewards//mean": 0.86712646484375, "rewards//std": 0.023730112239718437, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0834, "grad_norm": 0.6019577980041504, "kl": 0.031143209664151073, "learning_rate": 4.93285589846936e-06, "loss": 0.0031, "num_tokens": 2721088.0, "reward": 0.85479736328125, "reward_std": 0.01068512536585331, "rewards//mean": 0.85479736328125, "rewards//std": 0.019912630319595337, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0836, "grad_norm": 0.6004226207733154, "kl": 0.031462118960916996, "learning_rate": 4.932490152127611e-06, "loss": 0.0031, "num_tokens": 2727568.0, "reward": 0.8572998046875, "reward_std": 0.008544829674065113, "rewards//mean": 0.8572998046875, "rewards//std": 0.022390736266970634, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0838, "grad_norm": 0.6271358132362366, "kl": 0.0408686890732497, "learning_rate": 4.93212342597912e-06, "loss": 0.0041, "num_tokens": 2734056.0, "reward": 0.83489990234375, "reward_std": 0.011716969311237335, "rewards//mean": 0.83489990234375, "rewards//std": 0.023686693981289864, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.084, "grad_norm": 0.6660474538803101, "kl": 0.035757099045440555, "learning_rate": 4.931755720171603e-06, "loss": 0.0036, "num_tokens": 2740504.0, "reward": 0.87750244140625, "reward_std": 0.013047153130173683, "rewards//mean": 0.87750244140625, "rewards//std": 0.03766965866088867, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0842, "grad_norm": 0.6816895008087158, "kl": 0.03769821021705866, "learning_rate": 4.931387034853173e-06, "loss": 0.0038, "num_tokens": 2747024.0, "reward": 0.843505859375, "reward_std": 0.01912887953221798, "rewards//mean": 0.843505859375, "rewards//std": 0.035172607749700546, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0844, "grad_norm": 0.6255840063095093, "kl": 0.03659572545439005, "learning_rate": 4.9310173701723365e-06, "loss": 0.0037, "num_tokens": 2753560.0, "reward": 0.8350830078125, "reward_std": 0.012399287894368172, "rewards//mean": 0.8350830078125, "rewards//std": 0.018668556585907936, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0846, "grad_norm": 0.6702026724815369, "kl": 0.038369332905858755, "learning_rate": 4.930646726277994e-06, "loss": 0.0038, "num_tokens": 2759984.0, "reward": 0.85687255859375, "reward_std": 0.017981387674808502, "rewards//mean": 0.85687255859375, "rewards//std": 0.04027368128299713, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0848, "grad_norm": 0.6983957886695862, "kl": 0.03270720690488815, "learning_rate": 4.930275103319441e-06, "loss": 0.0033, "num_tokens": 2766472.0, "reward": 0.8499755859375, "reward_std": 0.01107704732567072, "rewards//mean": 0.8499755859375, "rewards//std": 0.024177299812436104, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.085, "grad_norm": 0.6446329951286316, "kl": 0.034597059013321996, "learning_rate": 4.9299025014463665e-06, "loss": 0.0035, "num_tokens": 2772920.0, "reward": 0.84576416015625, "reward_std": 0.014222029596567154, "rewards//mean": 0.84576416015625, "rewards//std": 0.024685604497790337, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0852, "grad_norm": 0.6605253219604492, "kl": 0.02986510982736945, "learning_rate": 4.9295289208088545e-06, "loss": 0.003, "num_tokens": 2779360.0, "reward": 0.81646728515625, "reward_std": 0.010702652856707573, "rewards//mean": 0.81646728515625, "rewards//std": 0.027094529941678047, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0854, "grad_norm": 0.6196510791778564, "kl": 0.031927338568493724, "learning_rate": 4.929154361557384e-06, "loss": 0.0032, "num_tokens": 2785904.0, "reward": 0.83935546875, "reward_std": 0.01270909421145916, "rewards//mean": 0.83935546875, "rewards//std": 0.04159976541996002, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0856, "grad_norm": 0.6354705691337585, "kl": 0.032114935806021094, "learning_rate": 4.928778823842828e-06, "loss": 0.0032, "num_tokens": 2792512.0, "reward": 0.8443603515625, "reward_std": 0.015137514099478722, "rewards//mean": 0.8443603515625, "rewards//std": 0.030932310968637466, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0858, "grad_norm": 0.618382453918457, "kl": 0.0317255153786391, "learning_rate": 4.928402307816452e-06, "loss": 0.0032, "num_tokens": 2799152.0, "reward": 0.86468505859375, "reward_std": 0.01585196889936924, "rewards//mean": 0.86468505859375, "rewards//std": 0.031058497726917267, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.086, "grad_norm": 0.6240487694740295, "kl": 0.034124003956094384, "learning_rate": 4.928024813629917e-06, "loss": 0.0034, "num_tokens": 2805640.0, "reward": 0.83660888671875, "reward_std": 0.015030371025204659, "rewards//mean": 0.83660888671875, "rewards//std": 0.03876720368862152, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0862, "grad_norm": 0.6973584890365601, "kl": 0.04224087065085769, "learning_rate": 4.927646341435276e-06, "loss": 0.0042, "num_tokens": 2812120.0, "reward": 0.7916259765625, "reward_std": 0.009834514930844307, "rewards//mean": 0.7916259765625, "rewards//std": 0.03112744726240635, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0864, "grad_norm": 0.6077401638031006, "kl": 0.03255357057787478, "learning_rate": 4.92726689138498e-06, "loss": 0.0033, "num_tokens": 2818576.0, "reward": 0.840087890625, "reward_std": 0.01285285409539938, "rewards//mean": 0.840087890625, "rewards//std": 0.02187538892030716, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0866, "grad_norm": 0.6180047988891602, "kl": 0.032373772002756596, "learning_rate": 4.92688646363187e-06, "loss": 0.0032, "num_tokens": 2825176.0, "reward": 0.8251953125, "reward_std": 0.01345391571521759, "rewards//mean": 0.8251953125, "rewards//std": 0.03164280578494072, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0868, "grad_norm": 0.6412854194641113, "kl": 0.0392610477283597, "learning_rate": 4.926505058329184e-06, "loss": 0.0039, "num_tokens": 2831768.0, "reward": 0.8336181640625, "reward_std": 0.013466738164424896, "rewards//mean": 0.8336181640625, "rewards//std": 0.018826814368367195, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.087, "grad_norm": 0.7546082139015198, "kl": 0.03666896419599652, "learning_rate": 4.9261226756305495e-06, "loss": 0.0037, "num_tokens": 2838224.0, "reward": 0.85064697265625, "reward_std": 0.011050796136260033, "rewards//mean": 0.85064697265625, "rewards//std": 0.024377064779400826, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0872, "grad_norm": 0.6662473678588867, "kl": 0.02848008507862687, "learning_rate": 4.925739315689991e-06, "loss": 0.0028, "num_tokens": 2844688.0, "reward": 0.8526611328125, "reward_std": 0.014631968922913074, "rewards//mean": 0.8526611328125, "rewards//std": 0.018128827214241028, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0874, "grad_norm": 0.7872958779335022, "kl": 0.03451111959293485, "learning_rate": 4.925354978661928e-06, "loss": 0.0035, "num_tokens": 2851200.0, "reward": 0.8326416015625, "reward_std": 0.01695983111858368, "rewards//mean": 0.8326416015625, "rewards//std": 0.03947708383202553, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0876, "grad_norm": 0.6215630173683167, "kl": 0.0340423216111958, "learning_rate": 4.924969664701168e-06, "loss": 0.0034, "num_tokens": 2857664.0, "reward": 0.82806396484375, "reward_std": 0.009415829554200172, "rewards//mean": 0.82806396484375, "rewards//std": 0.02834402211010456, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0878, "grad_norm": 0.6829410791397095, "kl": 0.03426765673793852, "learning_rate": 4.924583373962918e-06, "loss": 0.0034, "num_tokens": 2864208.0, "reward": 0.83428955078125, "reward_std": 0.008964203298091888, "rewards//mean": 0.83428955078125, "rewards//std": 0.021393202245235443, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.088, "grad_norm": 0.6143079996109009, "kl": 0.03419288503937423, "learning_rate": 4.924196106602774e-06, "loss": 0.0034, "num_tokens": 2870664.0, "reward": 0.85797119140625, "reward_std": 0.011148895137012005, "rewards//mean": 0.85797119140625, "rewards//std": 0.037456877529621124, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0882, "grad_norm": 0.6812248229980469, "kl": 0.034670352237299085, "learning_rate": 4.9238078627767285e-06, "loss": 0.0035, "num_tokens": 2877192.0, "reward": 0.8590087890625, "reward_std": 0.01278429850935936, "rewards//mean": 0.8590087890625, "rewards//std": 0.018051842227578163, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0884, "grad_norm": 0.6505360007286072, "kl": 0.03824305930174887, "learning_rate": 4.923418642641166e-06, "loss": 0.0038, "num_tokens": 2883736.0, "reward": 0.87548828125, "reward_std": 0.01399047952145338, "rewards//mean": 0.87548828125, "rewards//std": 0.02312016673386097, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0886, "grad_norm": 0.6440812349319458, "kl": 0.03093017195351422, "learning_rate": 4.923028446352864e-06, "loss": 0.0031, "num_tokens": 2890376.0, "reward": 0.8350830078125, "reward_std": 0.012048540636897087, "rewards//mean": 0.8350830078125, "rewards//std": 0.021468516439199448, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0888, "grad_norm": 0.63178551197052, "kl": 0.031103559769690037, "learning_rate": 4.922637274068993e-06, "loss": 0.0031, "num_tokens": 2896840.0, "reward": 0.84912109375, "reward_std": 0.013408930972218513, "rewards//mean": 0.84912109375, "rewards//std": 0.024602141231298447, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.089, "grad_norm": 0.5702874660491943, "kl": 0.035526285879313946, "learning_rate": 4.9222451259471185e-06, "loss": 0.0036, "num_tokens": 2903384.0, "reward": 0.8553466796875, "reward_std": 0.010418427176773548, "rewards//mean": 0.8553466796875, "rewards//std": 0.018688006326556206, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0892, "grad_norm": 0.6411604285240173, "kl": 0.035621481481939554, "learning_rate": 4.921852002145196e-06, "loss": 0.0036, "num_tokens": 2910000.0, "reward": 0.8338623046875, "reward_std": 0.013247603550553322, "rewards//mean": 0.8338623046875, "rewards//std": 0.03268921375274658, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0894, "grad_norm": 0.6727077960968018, "kl": 0.03247371851466596, "learning_rate": 4.921457902821578e-06, "loss": 0.0032, "num_tokens": 2916512.0, "reward": 0.81097412109375, "reward_std": 0.01571122370660305, "rewards//mean": 0.81097412109375, "rewards//std": 0.03608853369951248, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0896, "grad_norm": 0.7633801698684692, "kl": 0.03594936756417155, "learning_rate": 4.921062828135006e-06, "loss": 0.0036, "num_tokens": 2923072.0, "reward": 0.8360595703125, "reward_std": 0.016459252685308456, "rewards//mean": 0.8360595703125, "rewards//std": 0.032998934388160706, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0898, "grad_norm": 0.6687192916870117, "kl": 0.03433335409499705, "learning_rate": 4.920666778244616e-06, "loss": 0.0034, "num_tokens": 2929680.0, "reward": 0.78240966796875, "reward_std": 0.009573189541697502, "rewards//mean": 0.78240966796875, "rewards//std": 0.021728798747062683, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.09, "grad_norm": 0.6165691614151001, "kl": 0.03592061810195446, "learning_rate": 4.920269753309937e-06, "loss": 0.0036, "num_tokens": 2936152.0, "reward": 0.81842041015625, "reward_std": 0.014015309512615204, "rewards//mean": 0.81842041015625, "rewards//std": 0.026593778282403946, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0902, "grad_norm": 0.6066876649856567, "kl": 0.036984790582209826, "learning_rate": 4.919871753490892e-06, "loss": 0.0037, "num_tokens": 2942736.0, "reward": 0.84619140625, "reward_std": 0.008205235004425049, "rewards//mean": 0.84619140625, "rewards//std": 0.03186020627617836, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0904, "grad_norm": 0.6752570867538452, "kl": 0.03590616839937866, "learning_rate": 4.919472778947793e-06, "loss": 0.0036, "num_tokens": 2949344.0, "reward": 0.85333251953125, "reward_std": 0.013651330024003983, "rewards//mean": 0.85333251953125, "rewards//std": 0.019186310470104218, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0906, "grad_norm": 0.6786683201789856, "kl": 0.03594211395829916, "learning_rate": 4.919072829841347e-06, "loss": 0.0036, "num_tokens": 2955960.0, "reward": 0.82818603515625, "reward_std": 0.007302135229110718, "rewards//mean": 0.82818603515625, "rewards//std": 0.01555496733635664, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0908, "grad_norm": 0.6827971935272217, "kl": 0.030247324146330357, "learning_rate": 4.918671906332656e-06, "loss": 0.003, "num_tokens": 2962352.0, "reward": 0.84320068359375, "reward_std": 0.008547368459403515, "rewards//mean": 0.84320068359375, "rewards//std": 0.020061831921339035, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.091, "grad_norm": 0.7300602197647095, "kl": 0.03687928104773164, "learning_rate": 4.91827000858321e-06, "loss": 0.0037, "num_tokens": 2968912.0, "reward": 0.81256103515625, "reward_std": 0.0169554203748703, "rewards//mean": 0.81256103515625, "rewards//std": 0.04146559163928032, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0912, "grad_norm": 0.609417200088501, "kl": 0.028801521519199014, "learning_rate": 4.917867136754894e-06, "loss": 0.0029, "num_tokens": 2975400.0, "reward": 0.842529296875, "reward_std": 0.012160791084170341, "rewards//mean": 0.842529296875, "rewards//std": 0.025352440774440765, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0914, "grad_norm": 0.648038387298584, "kl": 0.03594379290007055, "learning_rate": 4.917463291009984e-06, "loss": 0.0036, "num_tokens": 2981952.0, "reward": 0.82843017578125, "reward_std": 0.010313676670193672, "rewards//mean": 0.82843017578125, "rewards//std": 0.03293528035283089, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0916, "grad_norm": 0.6578435897827148, "kl": 0.03743579494766891, "learning_rate": 4.917058471511149e-06, "loss": 0.0037, "num_tokens": 2988512.0, "reward": 0.851318359375, "reward_std": 0.013669105246663094, "rewards//mean": 0.851318359375, "rewards//std": 0.02833874709904194, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0918, "grad_norm": 0.6159623861312866, "kl": 0.03537259087897837, "learning_rate": 4.916652678421451e-06, "loss": 0.0035, "num_tokens": 2995008.0, "reward": 0.8701171875, "reward_std": 0.009295953437685966, "rewards//mean": 0.8701171875, "rewards//std": 0.033160243183374405, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.092, "grad_norm": 0.6083399653434753, "kl": 0.03422114229761064, "learning_rate": 4.916245911904344e-06, "loss": 0.0034, "num_tokens": 3001648.0, "reward": 0.8624267578125, "reward_std": 0.011924168094992638, "rewards//mean": 0.8624267578125, "rewards//std": 0.0302133746445179, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0922, "grad_norm": 0.6568530797958374, "kl": 0.03064131084829569, "learning_rate": 4.9158381721236715e-06, "loss": 0.0031, "num_tokens": 3008168.0, "reward": 0.85479736328125, "reward_std": 0.01007093396037817, "rewards//mean": 0.85479736328125, "rewards//std": 0.025291988626122475, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0924, "grad_norm": 0.6399670839309692, "kl": 0.030397456837818027, "learning_rate": 4.915429459243673e-06, "loss": 0.003, "num_tokens": 3014872.0, "reward": 0.82806396484375, "reward_std": 0.012727971188724041, "rewards//mean": 0.82806396484375, "rewards//std": 0.02784399501979351, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0926, "grad_norm": 0.6363940834999084, "kl": 0.032815776066854596, "learning_rate": 4.9150197734289764e-06, "loss": 0.0033, "num_tokens": 3021392.0, "reward": 0.85394287109375, "reward_std": 0.011889531277120113, "rewards//mean": 0.85394287109375, "rewards//std": 0.03702928498387337, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0928, "grad_norm": 0.6081061959266663, "kl": 0.03635578881949186, "learning_rate": 4.9146091148446055e-06, "loss": 0.0036, "num_tokens": 3027848.0, "reward": 0.84417724609375, "reward_std": 0.012306313961744308, "rewards//mean": 0.84417724609375, "rewards//std": 0.02853669971227646, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.093, "grad_norm": 0.6227095127105713, "kl": 0.0410658591426909, "learning_rate": 4.91419748365597e-06, "loss": 0.0041, "num_tokens": 3034400.0, "reward": 0.83636474609375, "reward_std": 0.017500348389148712, "rewards//mean": 0.83636474609375, "rewards//std": 0.04229619726538658, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0932, "grad_norm": 0.693902313709259, "kl": 0.03862925851717591, "learning_rate": 4.9137848800288775e-06, "loss": 0.0039, "num_tokens": 3040912.0, "reward": 0.8209228515625, "reward_std": 0.013860415667295456, "rewards//mean": 0.8209228515625, "rewards//std": 0.024061819538474083, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0934, "grad_norm": 0.7567556500434875, "kl": 0.036113617941737175, "learning_rate": 4.9133713041295235e-06, "loss": 0.0036, "num_tokens": 3047472.0, "reward": 0.85614013671875, "reward_std": 0.00819423608481884, "rewards//mean": 0.85614013671875, "rewards//std": 0.016769740730524063, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0936, "grad_norm": 0.661746084690094, "kl": 0.0400458665098995, "learning_rate": 4.912956756124498e-06, "loss": 0.004, "num_tokens": 3053976.0, "reward": 0.79193115234375, "reward_std": 0.012701844796538353, "rewards//mean": 0.79193115234375, "rewards//std": 0.020963624119758606, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0938, "grad_norm": 0.6104310750961304, "kl": 0.03691074647940695, "learning_rate": 4.912541236180779e-06, "loss": 0.0037, "num_tokens": 3060544.0, "reward": 0.84771728515625, "reward_std": 0.010005248710513115, "rewards//mean": 0.84771728515625, "rewards//std": 0.032499901950359344, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.094, "grad_norm": 0.6204822063446045, "kl": 0.029529433464631438, "learning_rate": 4.9121247444657384e-06, "loss": 0.003, "num_tokens": 3067056.0, "reward": 0.8560791015625, "reward_std": 0.015111008659005165, "rewards//mean": 0.8560791015625, "rewards//std": 0.036882489919662476, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0942, "grad_norm": 0.6523507237434387, "kl": 0.037716952385380864, "learning_rate": 4.91170728114714e-06, "loss": 0.0038, "num_tokens": 3073576.0, "reward": 0.83990478515625, "reward_std": 0.019746724516153336, "rewards//mean": 0.83990478515625, "rewards//std": 0.02970423921942711, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0944, "grad_norm": 0.5663513541221619, "kl": 0.03594962228089571, "learning_rate": 4.911288846393136e-06, "loss": 0.0036, "num_tokens": 3080128.0, "reward": 0.850341796875, "reward_std": 0.01110504474490881, "rewards//mean": 0.850341796875, "rewards//std": 0.028526155278086662, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0946, "grad_norm": 0.6797471046447754, "kl": 0.03497613500803709, "learning_rate": 4.910869440372274e-06, "loss": 0.0035, "num_tokens": 3086584.0, "reward": 0.8629150390625, "reward_std": 0.014508411288261414, "rewards//mean": 0.8629150390625, "rewards//std": 0.02641766145825386, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0948, "grad_norm": 0.6318498849868774, "kl": 0.039045897545292974, "learning_rate": 4.910449063253489e-06, "loss": 0.0039, "num_tokens": 3093040.0, "reward": 0.7581787109375, "reward_std": 0.011910462751984596, "rewards//mean": 0.7581787109375, "rewards//std": 0.026150431483983994, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.095, "grad_norm": 0.6022250652313232, "kl": 0.03229112015105784, "learning_rate": 4.9100277152061105e-06, "loss": 0.0032, "num_tokens": 3099536.0, "reward": 0.8621826171875, "reward_std": 0.016534287482500076, "rewards//mean": 0.8621826171875, "rewards//std": 0.02098933421075344, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0952, "grad_norm": 0.5952948927879333, "kl": 0.026717626955360174, "learning_rate": 4.9096053963998555e-06, "loss": 0.0027, "num_tokens": 3106128.0, "reward": 0.84246826171875, "reward_std": 0.009889265522360802, "rewards//mean": 0.84246826171875, "rewards//std": 0.02417064644396305, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0954, "grad_norm": 0.7570531964302063, "kl": 0.03380241570994258, "learning_rate": 4.909182107004835e-06, "loss": 0.0034, "num_tokens": 3112680.0, "reward": 0.811279296875, "reward_std": 0.008888293989002705, "rewards//mean": 0.811279296875, "rewards//std": 0.02740016020834446, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0956, "grad_norm": 0.6230834126472473, "kl": 0.04082751553505659, "learning_rate": 4.908757847191551e-06, "loss": 0.0041, "num_tokens": 3119192.0, "reward": 0.86517333984375, "reward_std": 0.012800133787095547, "rewards//mean": 0.86517333984375, "rewards//std": 0.030899198725819588, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0958, "grad_norm": 0.5696311593055725, "kl": 0.0354614038951695, "learning_rate": 4.908332617130893e-06, "loss": 0.0035, "num_tokens": 3125672.0, "reward": 0.79400634765625, "reward_std": 0.00883110798895359, "rewards//mean": 0.79400634765625, "rewards//std": 0.0182057972997427, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.096, "grad_norm": 0.594983696937561, "kl": 0.03111358336172998, "learning_rate": 4.907906416994146e-06, "loss": 0.0031, "num_tokens": 3132080.0, "reward": 0.879638671875, "reward_std": 0.015366640873253345, "rewards//mean": 0.879638671875, "rewards//std": 0.028038017451763153, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0962, "grad_norm": 0.6047464609146118, "kl": 0.030363738536834717, "learning_rate": 4.907479246952981e-06, "loss": 0.003, "num_tokens": 3138576.0, "reward": 0.85302734375, "reward_std": 0.01101978775113821, "rewards//mean": 0.85302734375, "rewards//std": 0.020297471433877945, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0964, "grad_norm": 0.6047098636627197, "kl": 0.03049930650740862, "learning_rate": 4.907051107179464e-06, "loss": 0.003, "num_tokens": 3145080.0, "reward": 0.83258056640625, "reward_std": 0.011874470859766006, "rewards//mean": 0.83258056640625, "rewards//std": 0.03233833983540535, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0966, "grad_norm": 0.6885430216789246, "kl": 0.033972281496971846, "learning_rate": 4.9066219978460485e-06, "loss": 0.0034, "num_tokens": 3151552.0, "reward": 0.86956787109375, "reward_std": 0.01559597160667181, "rewards//mean": 0.86956787109375, "rewards//std": 0.033491771668195724, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0968, "grad_norm": 0.6130304932594299, "kl": 0.03379618399776518, "learning_rate": 4.90619191912558e-06, "loss": 0.0034, "num_tokens": 3158000.0, "reward": 0.8402099609375, "reward_std": 0.013679608702659607, "rewards//mean": 0.8402099609375, "rewards//std": 0.019246671348810196, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.097, "grad_norm": 0.6161892414093018, "kl": 0.03453512443229556, "learning_rate": 4.905760871191295e-06, "loss": 0.0035, "num_tokens": 3164560.0, "reward": 0.86322021484375, "reward_std": 0.008620163425803185, "rewards//mean": 0.86322021484375, "rewards//std": 0.020652256906032562, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0972, "grad_norm": 0.6238561868667603, "kl": 0.03610549960285425, "learning_rate": 4.9053288542168185e-06, "loss": 0.0036, "num_tokens": 3171048.0, "reward": 0.8316650390625, "reward_std": 0.012315354309976101, "rewards//mean": 0.8316650390625, "rewards//std": 0.0226247850805521, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0974, "grad_norm": 0.6723244786262512, "kl": 0.0330163084436208, "learning_rate": 4.904895868376167e-06, "loss": 0.0033, "num_tokens": 3177536.0, "reward": 0.81072998046875, "reward_std": 0.011163340881466866, "rewards//mean": 0.81072998046875, "rewards//std": 0.01871340349316597, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0976, "grad_norm": 0.6025989055633545, "kl": 0.027839731890708208, "learning_rate": 4.904461913843747e-06, "loss": 0.0028, "num_tokens": 3183992.0, "reward": 0.84307861328125, "reward_std": 0.00990958884358406, "rewards//mean": 0.84307861328125, "rewards//std": 0.020511779934167862, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0978, "grad_norm": 0.6140331029891968, "kl": 0.035078632878139615, "learning_rate": 4.904026990794356e-06, "loss": 0.0035, "num_tokens": 3190488.0, "reward": 0.830810546875, "reward_std": 0.012093999423086643, "rewards//mean": 0.830810546875, "rewards//std": 0.02350844070315361, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.098, "grad_norm": 0.590908408164978, "kl": 0.03199684526771307, "learning_rate": 4.903591099403181e-06, "loss": 0.0032, "num_tokens": 3197032.0, "reward": 0.8131103515625, "reward_std": 0.008650442585349083, "rewards//mean": 0.8131103515625, "rewards//std": 0.014419297687709332, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0982, "grad_norm": 0.604034423828125, "kl": 0.029838560381904244, "learning_rate": 4.903154239845798e-06, "loss": 0.003, "num_tokens": 3203568.0, "reward": 0.85394287109375, "reward_std": 0.01665610447525978, "rewards//mean": 0.85394287109375, "rewards//std": 0.02005881257355213, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0984, "grad_norm": 0.5806700587272644, "kl": 0.03282779362052679, "learning_rate": 4.902716412298174e-06, "loss": 0.0033, "num_tokens": 3210000.0, "reward": 0.85321044921875, "reward_std": 0.016802972182631493, "rewards//mean": 0.85321044921875, "rewards//std": 0.04070698097348213, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0986, "grad_norm": 0.6078839898109436, "kl": 0.03290758584626019, "learning_rate": 4.902277616936667e-06, "loss": 0.0033, "num_tokens": 3216488.0, "reward": 0.82843017578125, "reward_std": 0.009662855416536331, "rewards//mean": 0.82843017578125, "rewards//std": 0.022695180028676987, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0988, "grad_norm": 0.7613587379455566, "kl": 0.03131350572220981, "learning_rate": 4.901837853938024e-06, "loss": 0.0031, "num_tokens": 3222912.0, "reward": 0.78887939453125, "reward_std": 0.012067398056387901, "rewards//mean": 0.78887939453125, "rewards//std": 0.030761227011680603, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.099, "grad_norm": 0.6558058857917786, "kl": 0.03489048779010773, "learning_rate": 4.90139712347938e-06, "loss": 0.0035, "num_tokens": 3229480.0, "reward": 0.84124755859375, "reward_std": 0.01762406900525093, "rewards//mean": 0.84124755859375, "rewards//std": 0.035579923540353775, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0992, "grad_norm": 0.6649083495140076, "kl": 0.036183219868689775, "learning_rate": 4.900955425738262e-06, "loss": 0.0036, "num_tokens": 3235984.0, "reward": 0.8514404296875, "reward_std": 0.014244592748582363, "rewards//mean": 0.8514404296875, "rewards//std": 0.036238234490156174, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0994, "grad_norm": 0.7367611527442932, "kl": 0.03572784992866218, "learning_rate": 4.900512760892585e-06, "loss": 0.0036, "num_tokens": 3242488.0, "reward": 0.855712890625, "reward_std": 0.011523640714585781, "rewards//mean": 0.855712890625, "rewards//std": 0.01867382600903511, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0996, "grad_norm": 0.6988459229469299, "kl": 0.03196686413139105, "learning_rate": 4.900069129120656e-06, "loss": 0.0032, "num_tokens": 3249040.0, "reward": 0.8355712890625, "reward_std": 0.017917927354574203, "rewards//mean": 0.8355712890625, "rewards//std": 0.0311624426394701, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0998, "grad_norm": 0.5815377235412598, "kl": 0.031875348184257746, "learning_rate": 4.899624530601168e-06, "loss": 0.0032, "num_tokens": 3255536.0, "reward": 0.84539794921875, "reward_std": 0.012960272841155529, "rewards//mean": 0.84539794921875, "rewards//std": 0.03601589426398277, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1, "grad_norm": 0.6033855080604553, "kl": 0.03386277239769697, "learning_rate": 4.899178965513206e-06, "loss": 0.0034, "num_tokens": 3262064.0, "reward": 0.82635498046875, "reward_std": 0.011881604790687561, "rewards//mean": 0.82635498046875, "rewards//std": 0.02400852181017399, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1002, "grad_norm": 0.6250687837600708, "kl": 0.03108464856632054, "learning_rate": 4.8987324340362445e-06, "loss": 0.0031, "num_tokens": 3268632.0, "reward": 0.8099365234375, "reward_std": 0.009064503014087677, "rewards//mean": 0.8099365234375, "rewards//std": 0.022906716912984848, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1004, "grad_norm": 0.6700149774551392, "kl": 0.03416571137495339, "learning_rate": 4.898284936350144e-06, "loss": 0.0034, "num_tokens": 3275104.0, "reward": 0.83502197265625, "reward_std": 0.012307664379477501, "rewards//mean": 0.83502197265625, "rewards//std": 0.026518534868955612, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1006, "grad_norm": 0.6595599055290222, "kl": 0.03373102657496929, "learning_rate": 4.897836472635159e-06, "loss": 0.0034, "num_tokens": 3281712.0, "reward": 0.865478515625, "reward_std": 0.010280190035700798, "rewards//mean": 0.865478515625, "rewards//std": 0.03414534777402878, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1008, "grad_norm": 0.6997821927070618, "kl": 0.02997818193398416, "learning_rate": 4.89738704307193e-06, "loss": 0.003, "num_tokens": 3288280.0, "reward": 0.82611083984375, "reward_std": 0.0106052216142416, "rewards//mean": 0.82611083984375, "rewards//std": 0.0170197244733572, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.101, "grad_norm": 0.5888683795928955, "kl": 0.03280960535630584, "learning_rate": 4.896936647841485e-06, "loss": 0.0033, "num_tokens": 3294792.0, "reward": 0.8609619140625, "reward_std": 0.013485745526850224, "rewards//mean": 0.8609619140625, "rewards//std": 0.02458459883928299, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1012, "grad_norm": 0.6109232902526855, "kl": 0.03339107520878315, "learning_rate": 4.896485287125247e-06, "loss": 0.0033, "num_tokens": 3301264.0, "reward": 0.84881591796875, "reward_std": 0.011974513530731201, "rewards//mean": 0.84881591796875, "rewards//std": 0.038548704236745834, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1014, "grad_norm": 0.5779264569282532, "kl": 0.027007358381524682, "learning_rate": 4.896032961105021e-06, "loss": 0.0027, "num_tokens": 3307824.0, "reward": 0.8441162109375, "reward_std": 0.010420424863696098, "rewards//mean": 0.8441162109375, "rewards//std": 0.028803668916225433, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1016, "grad_norm": 0.6584574580192566, "kl": 0.03429147810675204, "learning_rate": 4.8955796699630045e-06, "loss": 0.0034, "num_tokens": 3314424.0, "reward": 0.7806396484375, "reward_std": 0.009217744693160057, "rewards//mean": 0.7806396484375, "rewards//std": 0.015859151259064674, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1018, "grad_norm": 0.6022382974624634, "kl": 0.03081224230118096, "learning_rate": 4.895125413881783e-06, "loss": 0.0031, "num_tokens": 3320928.0, "reward": 0.8909912109375, "reward_std": 0.009043923579156399, "rewards//mean": 0.8909912109375, "rewards//std": 0.0231329295784235, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.102, "grad_norm": 0.672275960445404, "kl": 0.03528377669863403, "learning_rate": 4.894670193044332e-06, "loss": 0.0035, "num_tokens": 3327376.0, "reward": 0.86248779296875, "reward_std": 0.011328982189297676, "rewards//mean": 0.86248779296875, "rewards//std": 0.035140261054039, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1022, "grad_norm": 0.8039358258247375, "kl": 0.036853160709142685, "learning_rate": 4.894214007634014e-06, "loss": 0.0037, "num_tokens": 3333928.0, "reward": 0.79327392578125, "reward_std": 0.01666302978992462, "rewards//mean": 0.79327392578125, "rewards//std": 0.030523112043738365, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1024, "grad_norm": 0.6324937343597412, "kl": 0.030147293815389276, "learning_rate": 4.893756857834579e-06, "loss": 0.003, "num_tokens": 3340520.0, "reward": 0.800048828125, "reward_std": 0.010150490328669548, "rewards//mean": 0.800048828125, "rewards//std": 0.026810409501194954, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1026, "grad_norm": 0.6281384229660034, "kl": 0.0276860895100981, "learning_rate": 4.893298743830168e-06, "loss": 0.0028, "num_tokens": 3346992.0, "reward": 0.8341064453125, "reward_std": 0.009020951576530933, "rewards//mean": 0.8341064453125, "rewards//std": 0.02563832886517048, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1028, "grad_norm": 0.6529840230941772, "kl": 0.027598173590376973, "learning_rate": 4.89283966580531e-06, "loss": 0.0028, "num_tokens": 3353528.0, "reward": 0.84344482421875, "reward_std": 0.012123117223381996, "rewards//mean": 0.84344482421875, "rewards//std": 0.03667231649160385, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.103, "grad_norm": 0.6482062339782715, "kl": 0.03003290994092822, "learning_rate": 4.8923796239449206e-06, "loss": 0.003, "num_tokens": 3360064.0, "reward": 0.83209228515625, "reward_std": 0.010229753330349922, "rewards//mean": 0.83209228515625, "rewards//std": 0.019155515357851982, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1032, "grad_norm": 0.6167376041412354, "kl": 0.03031931398436427, "learning_rate": 4.891918618434305e-06, "loss": 0.003, "num_tokens": 3366520.0, "reward": 0.834228515625, "reward_std": 0.010733341798186302, "rewards//mean": 0.834228515625, "rewards//std": 0.014762187376618385, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1034, "grad_norm": 0.6656414270401001, "kl": 0.03264021617360413, "learning_rate": 4.891456649459156e-06, "loss": 0.0033, "num_tokens": 3373016.0, "reward": 0.84588623046875, "reward_std": 0.012665395624935627, "rewards//mean": 0.84588623046875, "rewards//std": 0.02881384827196598, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1036, "grad_norm": 0.6099600791931152, "kl": 0.030192186124622822, "learning_rate": 4.890993717205553e-06, "loss": 0.003, "num_tokens": 3379576.0, "reward": 0.8043212890625, "reward_std": 0.010902250185608864, "rewards//mean": 0.8043212890625, "rewards//std": 0.027416452765464783, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1038, "grad_norm": 0.599140465259552, "kl": 0.027934797341004014, "learning_rate": 4.8905298218599685e-06, "loss": 0.0028, "num_tokens": 3386096.0, "reward": 0.83251953125, "reward_std": 0.01088377833366394, "rewards//mean": 0.83251953125, "rewards//std": 0.020978152751922607, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.104, "grad_norm": 0.5696952939033508, "kl": 0.03250581002794206, "learning_rate": 4.8900649636092565e-06, "loss": 0.0033, "num_tokens": 3392512.0, "reward": 0.8446044921875, "reward_std": 0.01649361103773117, "rewards//mean": 0.8446044921875, "rewards//std": 0.04427339881658554, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1042, "grad_norm": 0.6459939479827881, "kl": 0.03033285029232502, "learning_rate": 4.889599142640663e-06, "loss": 0.003, "num_tokens": 3399056.0, "reward": 0.83172607421875, "reward_std": 0.011939212679862976, "rewards//mean": 0.83172607421875, "rewards//std": 0.020801976323127747, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1044, "grad_norm": 0.5709954500198364, "kl": 0.0319932468701154, "learning_rate": 4.889132359141822e-06, "loss": 0.0032, "num_tokens": 3405552.0, "reward": 0.858642578125, "reward_std": 0.017098452895879745, "rewards//mean": 0.858642578125, "rewards//std": 0.02903946116566658, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1046, "grad_norm": 0.8689175844192505, "kl": 0.028929896419867873, "learning_rate": 4.888664613300751e-06, "loss": 0.0029, "num_tokens": 3411992.0, "reward": 0.85455322265625, "reward_std": 0.020008470863103867, "rewards//mean": 0.85455322265625, "rewards//std": 0.02635362185537815, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1048, "grad_norm": 0.6963226795196533, "kl": 0.031829854240641, "learning_rate": 4.888195905305859e-06, "loss": 0.0032, "num_tokens": 3418448.0, "reward": 0.788818359375, "reward_std": 0.014885769225656986, "rewards//mean": 0.788818359375, "rewards//std": 0.0243386123329401, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.105, "grad_norm": 0.6378881931304932, "kl": 0.028337573632597923, "learning_rate": 4.887726235345943e-06, "loss": 0.0028, "num_tokens": 3424928.0, "reward": 0.81781005859375, "reward_std": 0.010992229916155338, "rewards//mean": 0.81781005859375, "rewards//std": 0.016839103773236275, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1052, "grad_norm": 0.6507261395454407, "kl": 0.033656949177384377, "learning_rate": 4.8872556036101845e-06, "loss": 0.0034, "num_tokens": 3431400.0, "reward": 0.85791015625, "reward_std": 0.011110810562968254, "rewards//mean": 0.85791015625, "rewards//std": 0.03369999676942825, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1054, "grad_norm": 0.6233684420585632, "kl": 0.02477980637922883, "learning_rate": 4.886784010288155e-06, "loss": 0.0025, "num_tokens": 3438000.0, "reward": 0.82098388671875, "reward_std": 0.016229748725891113, "rewards//mean": 0.82098388671875, "rewards//std": 0.021916091442108154, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1056, "grad_norm": 0.7725349068641663, "kl": 0.030107759404927492, "learning_rate": 4.886311455569811e-06, "loss": 0.003, "num_tokens": 3444520.0, "reward": 0.8575439453125, "reward_std": 0.013236935250461102, "rewards//mean": 0.8575439453125, "rewards//std": 0.024675559252500534, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1058, "grad_norm": 0.6618022322654724, "kl": 0.030748266261070967, "learning_rate": 4.885837939645499e-06, "loss": 0.0031, "num_tokens": 3451088.0, "reward": 0.84832763671875, "reward_std": 0.012683728709816933, "rewards//mean": 0.84832763671875, "rewards//std": 0.023923883214592934, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.106, "grad_norm": 0.6690743565559387, "kl": 0.030612023547291756, "learning_rate": 4.885363462705949e-06, "loss": 0.0031, "num_tokens": 3457648.0, "reward": 0.855224609375, "reward_std": 0.015518763102591038, "rewards//mean": 0.855224609375, "rewards//std": 0.03478484973311424, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1062, "grad_norm": 0.6613344550132751, "kl": 0.03167608566582203, "learning_rate": 4.884888024942282e-06, "loss": 0.0032, "num_tokens": 3464160.0, "reward": 0.8558349609375, "reward_std": 0.018330905586481094, "rewards//mean": 0.8558349609375, "rewards//std": 0.035486891865730286, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1064, "grad_norm": 0.7026782035827637, "kl": 0.030375287402421236, "learning_rate": 4.884411626546004e-06, "loss": 0.003, "num_tokens": 3470768.0, "reward": 0.822998046875, "reward_std": 0.012220524251461029, "rewards//mean": 0.822998046875, "rewards//std": 0.024654999375343323, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1066, "grad_norm": 0.6098203063011169, "kl": 0.0286163913551718, "learning_rate": 4.883934267709007e-06, "loss": 0.0029, "num_tokens": 3477336.0, "reward": 0.84466552734375, "reward_std": 0.012859920039772987, "rewards//mean": 0.84466552734375, "rewards//std": 0.0268813855946064, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1068, "grad_norm": 0.6562339663505554, "kl": 0.0278691683197394, "learning_rate": 4.883455948623574e-06, "loss": 0.0028, "num_tokens": 3483840.0, "reward": 0.85479736328125, "reward_std": 0.010976341553032398, "rewards//mean": 0.85479736328125, "rewards//std": 0.022364608943462372, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.107, "grad_norm": 0.6684576272964478, "kl": 0.03230980271473527, "learning_rate": 4.882976669482368e-06, "loss": 0.0032, "num_tokens": 3490368.0, "reward": 0.8468017578125, "reward_std": 0.010632255114614964, "rewards//mean": 0.8468017578125, "rewards//std": 0.017778070643544197, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1072, "grad_norm": 0.6153666377067566, "kl": 0.03179552615620196, "learning_rate": 4.882496430478445e-06, "loss": 0.0032, "num_tokens": 3496880.0, "reward": 0.857177734375, "reward_std": 0.01358007825911045, "rewards//mean": 0.857177734375, "rewards//std": 0.02453683316707611, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1074, "grad_norm": 0.6012992262840271, "kl": 0.03250846825540066, "learning_rate": 4.882015231805245e-06, "loss": 0.0033, "num_tokens": 3503432.0, "reward": 0.87005615234375, "reward_std": 0.010263869538903236, "rewards//mean": 0.87005615234375, "rewards//std": 0.01699480228126049, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1076, "grad_norm": 0.6483962535858154, "kl": 0.03316758247092366, "learning_rate": 4.881533073656594e-06, "loss": 0.0033, "num_tokens": 3509960.0, "reward": 0.8270263671875, "reward_std": 0.009003392420709133, "rewards//mean": 0.8270263671875, "rewards//std": 0.01653578132390976, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1078, "grad_norm": 0.6129351258277893, "kl": 0.03461948991753161, "learning_rate": 4.8810499562267066e-06, "loss": 0.0035, "num_tokens": 3516416.0, "reward": 0.84686279296875, "reward_std": 0.008496642112731934, "rewards//mean": 0.84686279296875, "rewards//std": 0.02763170190155506, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.108, "grad_norm": 0.5928858518600464, "kl": 0.03543858043849468, "learning_rate": 4.88056587971018e-06, "loss": 0.0035, "num_tokens": 3522880.0, "reward": 0.80194091796875, "reward_std": 0.011357907205820084, "rewards//mean": 0.80194091796875, "rewards//std": 0.020311543717980385, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1082, "grad_norm": 0.7632496356964111, "kl": 0.034994550282135606, "learning_rate": 4.880080844302004e-06, "loss": 0.0035, "num_tokens": 3529352.0, "reward": 0.78594970703125, "reward_std": 0.009396329522132874, "rewards//mean": 0.78594970703125, "rewards//std": 0.017719540745019913, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1084, "grad_norm": 0.6258948445320129, "kl": 0.027121696388348937, "learning_rate": 4.879594850197548e-06, "loss": 0.0027, "num_tokens": 3535864.0, "reward": 0.82269287109375, "reward_std": 0.010259194299578667, "rewards//mean": 0.82269287109375, "rewards//std": 0.03360007703304291, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1086, "grad_norm": 0.6535525321960449, "kl": 0.029613361693918705, "learning_rate": 4.87910789759257e-06, "loss": 0.003, "num_tokens": 3542448.0, "reward": 0.84735107421875, "reward_std": 0.012180662713944912, "rewards//mean": 0.84735107421875, "rewards//std": 0.029580144211649895, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1088, "grad_norm": 0.654188871383667, "kl": 0.02947809756733477, "learning_rate": 4.878619986683215e-06, "loss": 0.0029, "num_tokens": 3548968.0, "reward": 0.83056640625, "reward_std": 0.013154493644833565, "rewards//mean": 0.83056640625, "rewards//std": 0.036301881074905396, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.109, "grad_norm": 0.6415969729423523, "kl": 0.029625217197462916, "learning_rate": 4.8781311176660144e-06, "loss": 0.003, "num_tokens": 3555472.0, "reward": 0.82647705078125, "reward_std": 0.010771572589874268, "rewards//mean": 0.82647705078125, "rewards//std": 0.024693572893738747, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1092, "grad_norm": 0.5805480480194092, "kl": 0.029833517502993345, "learning_rate": 4.8776412907378845e-06, "loss": 0.003, "num_tokens": 3561944.0, "reward": 0.83563232421875, "reward_std": 0.014117766171693802, "rewards//mean": 0.83563232421875, "rewards//std": 0.029275618493556976, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1094, "grad_norm": 0.6056393384933472, "kl": 0.029833688866347075, "learning_rate": 4.877150506096127e-06, "loss": 0.003, "num_tokens": 3568408.0, "reward": 0.84912109375, "reward_std": 0.013056284748017788, "rewards//mean": 0.84912109375, "rewards//std": 0.022200971841812134, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1096, "grad_norm": 0.7053980827331543, "kl": 0.03147451486438513, "learning_rate": 4.8766587639384285e-06, "loss": 0.0031, "num_tokens": 3574896.0, "reward": 0.85784912109375, "reward_std": 0.014083274640142918, "rewards//mean": 0.85784912109375, "rewards//std": 0.025321299210190773, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1098, "grad_norm": 0.5739861130714417, "kl": 0.03014273289591074, "learning_rate": 4.876166064462866e-06, "loss": 0.003, "num_tokens": 3581352.0, "reward": 0.82379150390625, "reward_std": 0.012559041380882263, "rewards//mean": 0.82379150390625, "rewards//std": 0.029798876494169235, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.11, "grad_norm": 0.5770258903503418, "kl": 0.030987678095698357, "learning_rate": 4.8756724078678955e-06, "loss": 0.0031, "num_tokens": 3587776.0, "reward": 0.8641357421875, "reward_std": 0.015060827136039734, "rewards//mean": 0.8641357421875, "rewards//std": 0.02080388553440571, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1102, "grad_norm": 0.6917813420295715, "kl": 0.031157562509179115, "learning_rate": 4.875177794352364e-06, "loss": 0.0031, "num_tokens": 3594472.0, "reward": 0.82940673828125, "reward_std": 0.010959278792142868, "rewards//mean": 0.82940673828125, "rewards//std": 0.017772428691387177, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1104, "grad_norm": 0.6400755643844604, "kl": 0.03357220510952175, "learning_rate": 4.8746822241155006e-06, "loss": 0.0034, "num_tokens": 3601016.0, "reward": 0.80755615234375, "reward_std": 0.010078158229589462, "rewards//mean": 0.80755615234375, "rewards//std": 0.021544797345995903, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1106, "grad_norm": 0.5885239839553833, "kl": 0.031021602218970656, "learning_rate": 4.874185697356921e-06, "loss": 0.0031, "num_tokens": 3607544.0, "reward": 0.84344482421875, "reward_std": 0.014324812218546867, "rewards//mean": 0.84344482421875, "rewards//std": 0.03417675569653511, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1108, "grad_norm": 0.5927717089653015, "kl": 0.03164471639320254, "learning_rate": 4.873688214276628e-06, "loss": 0.0032, "num_tokens": 3614016.0, "reward": 0.84332275390625, "reward_std": 0.008280541747808456, "rewards//mean": 0.84332275390625, "rewards//std": 0.016515973955392838, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.111, "grad_norm": 0.6338167786598206, "kl": 0.035400526598095894, "learning_rate": 4.873189775075005e-06, "loss": 0.0035, "num_tokens": 3620536.0, "reward": 0.86553955078125, "reward_std": 0.011882786639034748, "rewards//mean": 0.86553955078125, "rewards//std": 0.022378141060471535, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1112, "grad_norm": 0.6624349355697632, "kl": 0.03116373484954238, "learning_rate": 4.872690379952824e-06, "loss": 0.0031, "num_tokens": 3627152.0, "reward": 0.8284912109375, "reward_std": 0.013359297066926956, "rewards//mean": 0.8284912109375, "rewards//std": 0.028820481151342392, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1114, "grad_norm": 0.587210476398468, "kl": 0.03523128619417548, "learning_rate": 4.8721900291112415e-06, "loss": 0.0035, "num_tokens": 3633672.0, "reward": 0.86175537109375, "reward_std": 0.013639332726597786, "rewards//mean": 0.86175537109375, "rewards//std": 0.03241875395178795, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1116, "grad_norm": 0.6118485331535339, "kl": 0.03052513301372528, "learning_rate": 4.871688722751799e-06, "loss": 0.0031, "num_tokens": 3640240.0, "reward": 0.7965087890625, "reward_std": 0.012003393843770027, "rewards//mean": 0.7965087890625, "rewards//std": 0.018145518377423286, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1118, "grad_norm": 0.6017597317695618, "kl": 0.031704419292509556, "learning_rate": 4.8711864610764235e-06, "loss": 0.0032, "num_tokens": 3646784.0, "reward": 0.86309814453125, "reward_std": 0.013073796406388283, "rewards//mean": 0.86309814453125, "rewards//std": 0.028868434950709343, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.112, "grad_norm": 0.6723352074623108, "kl": 0.036269206553697586, "learning_rate": 4.870683244287425e-06, "loss": 0.0036, "num_tokens": 3653280.0, "reward": 0.85546875, "reward_std": 0.01556839793920517, "rewards//mean": 0.85546875, "rewards//std": 0.03975088149309158, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1122, "grad_norm": 0.6495388746261597, "kl": 0.03623637277632952, "learning_rate": 4.870179072587499e-06, "loss": 0.0036, "num_tokens": 3659856.0, "reward": 0.8348388671875, "reward_std": 0.010795267298817635, "rewards//mean": 0.8348388671875, "rewards//std": 0.021253081038594246, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1124, "grad_norm": 0.6097123622894287, "kl": 0.03436880186200142, "learning_rate": 4.869673946179726e-06, "loss": 0.0034, "num_tokens": 3666440.0, "reward": 0.84228515625, "reward_std": 0.012537036091089249, "rewards//mean": 0.84228515625, "rewards//std": 0.027555515989661217, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1126, "grad_norm": 0.6926887631416321, "kl": 0.02803546912036836, "learning_rate": 4.8691678652675715e-06, "loss": 0.0028, "num_tokens": 3673000.0, "reward": 0.86138916015625, "reward_std": 0.016193002462387085, "rewards//mean": 0.86138916015625, "rewards//std": 0.03052063286304474, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1128, "grad_norm": 0.592189371585846, "kl": 0.03593447944149375, "learning_rate": 4.8686608300548836e-06, "loss": 0.0036, "num_tokens": 3679384.0, "reward": 0.853515625, "reward_std": 0.012158144265413284, "rewards//mean": 0.853515625, "rewards//std": 0.017947981134057045, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.113, "grad_norm": 0.6793131232261658, "kl": 0.03625408164225519, "learning_rate": 4.868152840745896e-06, "loss": 0.0036, "num_tokens": 3685888.0, "reward": 0.8232421875, "reward_std": 0.011547568254172802, "rewards//mean": 0.8232421875, "rewards//std": 0.020063435658812523, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1132, "grad_norm": 0.629365861415863, "kl": 0.029668693197891116, "learning_rate": 4.8676438975452276e-06, "loss": 0.003, "num_tokens": 3692424.0, "reward": 0.8212890625, "reward_std": 0.010893348604440689, "rewards//mean": 0.8212890625, "rewards//std": 0.027330461889505386, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1134, "grad_norm": 0.6305131316184998, "kl": 0.03516363073140383, "learning_rate": 4.86713400065788e-06, "loss": 0.0035, "num_tokens": 3698872.0, "reward": 0.8587646484375, "reward_std": 0.011101635172963142, "rewards//mean": 0.8587646484375, "rewards//std": 0.029472799971699715, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1136, "grad_norm": 0.5981056690216064, "kl": 0.03191525675356388, "learning_rate": 4.866623150289241e-06, "loss": 0.0032, "num_tokens": 3705320.0, "reward": 0.88812255859375, "reward_std": 0.010125132277607918, "rewards//mean": 0.88812255859375, "rewards//std": 0.02194715104997158, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1138, "grad_norm": 0.5857009887695312, "kl": 0.03170208935625851, "learning_rate": 4.86611134664508e-06, "loss": 0.0032, "num_tokens": 3711736.0, "reward": 0.854248046875, "reward_std": 0.012967083603143692, "rewards//mean": 0.854248046875, "rewards//std": 0.017837999388575554, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.114, "grad_norm": 0.6245691180229187, "kl": 0.03494541137479246, "learning_rate": 4.865598589931552e-06, "loss": 0.0035, "num_tokens": 3718328.0, "reward": 0.79693603515625, "reward_std": 0.011958854272961617, "rewards//mean": 0.79693603515625, "rewards//std": 0.022363930940628052, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1142, "grad_norm": 0.6825640797615051, "kl": 0.03346801269799471, "learning_rate": 4.865084880355193e-06, "loss": 0.0033, "num_tokens": 3724896.0, "reward": 0.8687744140625, "reward_std": 0.01131730992347002, "rewards//mean": 0.8687744140625, "rewards//std": 0.015449140220880508, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1144, "grad_norm": 0.6870326399803162, "kl": 0.03804417513310909, "learning_rate": 4.864570218122928e-06, "loss": 0.0038, "num_tokens": 3731392.0, "reward": 0.84197998046875, "reward_std": 0.012165633030235767, "rewards//mean": 0.84197998046875, "rewards//std": 0.020326443016529083, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1146, "grad_norm": 0.6390773057937622, "kl": 0.034224626841023564, "learning_rate": 4.864054603442063e-06, "loss": 0.0034, "num_tokens": 3737888.0, "reward": 0.84515380859375, "reward_std": 0.01106143370270729, "rewards//mean": 0.84515380859375, "rewards//std": 0.02885112538933754, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1148, "grad_norm": 0.6096854209899902, "kl": 0.03863173257559538, "learning_rate": 4.863538036520285e-06, "loss": 0.0039, "num_tokens": 3744464.0, "reward": 0.8778076171875, "reward_std": 0.012170696631073952, "rewards//mean": 0.8778076171875, "rewards//std": 0.021704141050577164, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.115, "grad_norm": 0.6443453431129456, "kl": 0.041011053370311856, "learning_rate": 4.863020517565669e-06, "loss": 0.0041, "num_tokens": 3750944.0, "reward": 0.81353759765625, "reward_std": 0.009858833625912666, "rewards//mean": 0.81353759765625, "rewards//std": 0.024500945582985878, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1152, "grad_norm": 0.6102349758148193, "kl": 0.033064100658521056, "learning_rate": 4.862502046786671e-06, "loss": 0.0033, "num_tokens": 3757576.0, "reward": 0.82891845703125, "reward_std": 0.010357474908232689, "rewards//mean": 0.82891845703125, "rewards//std": 0.016131121665239334, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1154, "grad_norm": 0.6194673180580139, "kl": 0.030226377304643393, "learning_rate": 4.861982624392132e-06, "loss": 0.003, "num_tokens": 3764128.0, "reward": 0.86114501953125, "reward_std": 0.01061889249831438, "rewards//mean": 0.86114501953125, "rewards//std": 0.02764703705906868, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1156, "grad_norm": 0.6460333466529846, "kl": 0.03713248623535037, "learning_rate": 4.861462250591273e-06, "loss": 0.0037, "num_tokens": 3770640.0, "reward": 0.841064453125, "reward_std": 0.013898089528083801, "rewards//mean": 0.841064453125, "rewards//std": 0.033529773354530334, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1158, "grad_norm": 0.6622205376625061, "kl": 0.03561116009950638, "learning_rate": 4.860940925593703e-06, "loss": 0.0036, "num_tokens": 3777336.0, "reward": 0.834716796875, "reward_std": 0.01101248525083065, "rewards//mean": 0.834716796875, "rewards//std": 0.030344609171152115, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.116, "grad_norm": 0.6115918159484863, "kl": 0.041194694116711617, "learning_rate": 4.86041864960941e-06, "loss": 0.0041, "num_tokens": 3783792.0, "reward": 0.85552978515625, "reward_std": 0.011757418513298035, "rewards//mean": 0.85552978515625, "rewards//std": 0.01923123002052307, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1162, "grad_norm": 0.5668520927429199, "kl": 0.03501286217942834, "learning_rate": 4.859895422848767e-06, "loss": 0.0035, "num_tokens": 3790240.0, "reward": 0.81658935546875, "reward_std": 0.010440420359373093, "rewards//mean": 0.81658935546875, "rewards//std": 0.028868434950709343, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1164, "grad_norm": 0.6220555901527405, "kl": 0.041811997536569834, "learning_rate": 4.859371245522531e-06, "loss": 0.0042, "num_tokens": 3796760.0, "reward": 0.826904296875, "reward_std": 0.013789907097816467, "rewards//mean": 0.826904296875, "rewards//std": 0.025323763489723206, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1166, "grad_norm": 0.6384966969490051, "kl": 0.035850258776918054, "learning_rate": 4.8588461178418375e-06, "loss": 0.0036, "num_tokens": 3803336.0, "reward": 0.85491943359375, "reward_std": 0.016050763428211212, "rewards//mean": 0.85491943359375, "rewards//std": 0.03363789618015289, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1168, "grad_norm": 0.681158185005188, "kl": 0.04152256716042757, "learning_rate": 4.858320040018212e-06, "loss": 0.0042, "num_tokens": 3809832.0, "reward": 0.821044921875, "reward_std": 0.014268225058913231, "rewards//mean": 0.821044921875, "rewards//std": 0.023734018206596375, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.117, "grad_norm": 0.6424878835678101, "kl": 0.035631838254630566, "learning_rate": 4.857793012263555e-06, "loss": 0.0036, "num_tokens": 3816272.0, "reward": 0.88055419921875, "reward_std": 0.010589011013507843, "rewards//mean": 0.88055419921875, "rewards//std": 0.02607703022658825, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1172, "grad_norm": 0.6001079082489014, "kl": 0.03453458775766194, "learning_rate": 4.857265034790155e-06, "loss": 0.0035, "num_tokens": 3822744.0, "reward": 0.84454345703125, "reward_std": 0.01057126373052597, "rewards//mean": 0.84454345703125, "rewards//std": 0.02217564359307289, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1174, "grad_norm": 0.6222141981124878, "kl": 0.041371190221980214, "learning_rate": 4.85673610781068e-06, "loss": 0.0041, "num_tokens": 3829272.0, "reward": 0.8468017578125, "reward_std": 0.014979726634919643, "rewards//mean": 0.8468017578125, "rewards//std": 0.018115462735295296, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1176, "grad_norm": 0.6488871574401855, "kl": 0.037690630881115794, "learning_rate": 4.856206231538184e-06, "loss": 0.0038, "num_tokens": 3835808.0, "reward": 0.87548828125, "reward_std": 0.013796349987387657, "rewards//mean": 0.87548828125, "rewards//std": 0.024155063554644585, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1178, "grad_norm": 0.6412578225135803, "kl": 0.0396907776594162, "learning_rate": 4.855675406186099e-06, "loss": 0.004, "num_tokens": 3842312.0, "reward": 0.82733154296875, "reward_std": 0.014147238805890083, "rewards//mean": 0.82733154296875, "rewards//std": 0.022736497223377228, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.118, "grad_norm": 0.6577656865119934, "kl": 0.03925878135487437, "learning_rate": 4.855143631968242e-06, "loss": 0.0039, "num_tokens": 3848880.0, "reward": 0.85369873046875, "reward_std": 0.012300359085202217, "rewards//mean": 0.85369873046875, "rewards//std": 0.0363980270922184, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1182, "grad_norm": 0.6258107423782349, "kl": 0.04022790654562414, "learning_rate": 4.854610909098813e-06, "loss": 0.004, "num_tokens": 3855376.0, "reward": 0.83575439453125, "reward_std": 0.012947885319590569, "rewards//mean": 0.83575439453125, "rewards//std": 0.033850088715553284, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1184, "grad_norm": 0.6062402129173279, "kl": 0.04229812603443861, "learning_rate": 4.854077237792389e-06, "loss": 0.0042, "num_tokens": 3861856.0, "reward": 0.854736328125, "reward_std": 0.007470754906535149, "rewards//mean": 0.854736328125, "rewards//std": 0.018227506428956985, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1186, "grad_norm": 0.5723037719726562, "kl": 0.03844059142284095, "learning_rate": 4.853542618263937e-06, "loss": 0.0038, "num_tokens": 3868264.0, "reward": 0.8463134765625, "reward_std": 0.00796434748917818, "rewards//mean": 0.8463134765625, "rewards//std": 0.023263435810804367, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1188, "grad_norm": 0.6231993436813354, "kl": 0.03992640180513263, "learning_rate": 4.8530070507288e-06, "loss": 0.004, "num_tokens": 3874784.0, "reward": 0.8321533203125, "reward_std": 0.014231465756893158, "rewards//mean": 0.8321533203125, "rewards//std": 0.03230912983417511, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.119, "grad_norm": 0.6749900579452515, "kl": 0.0327031088527292, "learning_rate": 4.852470535402703e-06, "loss": 0.0033, "num_tokens": 3881304.0, "reward": 0.84979248046875, "reward_std": 0.010251807048916817, "rewards//mean": 0.84979248046875, "rewards//std": 0.02176499553024769, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1192, "grad_norm": 0.6172786951065063, "kl": 0.03722618380561471, "learning_rate": 4.851933072501756e-06, "loss": 0.0037, "num_tokens": 3887856.0, "reward": 0.83587646484375, "reward_std": 0.00847033690661192, "rewards//mean": 0.83587646484375, "rewards//std": 0.021150536835193634, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1194, "grad_norm": 0.6346173286437988, "kl": 0.042119944002479315, "learning_rate": 4.851394662242449e-06, "loss": 0.0042, "num_tokens": 3894440.0, "reward": 0.8529052734375, "reward_std": 0.014024918898940086, "rewards//mean": 0.8529052734375, "rewards//std": 0.0261920765042305, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1196, "grad_norm": 0.6278045773506165, "kl": 0.03757873619906604, "learning_rate": 4.850855304841653e-06, "loss": 0.0038, "num_tokens": 3900944.0, "reward": 0.786376953125, "reward_std": 0.010384895838797092, "rewards//mean": 0.786376953125, "rewards//std": 0.0285516157746315, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1198, "grad_norm": 0.6595480442047119, "kl": 0.04688437725417316, "learning_rate": 4.8503150005166225e-06, "loss": 0.0047, "num_tokens": 3907464.0, "reward": 0.802734375, "reward_std": 0.012809094972908497, "rewards//mean": 0.802734375, "rewards//std": 0.026502618566155434, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.12, "grad_norm": 0.5957270860671997, "kl": 0.03809724980965257, "learning_rate": 4.849773749484989e-06, "loss": 0.0038, "num_tokens": 3914072.0, "reward": 0.82818603515625, "reward_std": 0.012442629784345627, "rewards//mean": 0.82818603515625, "rewards//std": 0.023576516658067703, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1202, "grad_norm": 0.7704198360443115, "kl": 0.03094646194949746, "learning_rate": 4.849231551964771e-06, "loss": 0.0031, "num_tokens": 3920608.0, "reward": 0.7781982421875, "reward_std": 0.012034199200570583, "rewards//mean": 0.7781982421875, "rewards//std": 0.02012934535741806, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1204, "grad_norm": 0.6139811277389526, "kl": 0.040575713850557804, "learning_rate": 4.848688408174366e-06, "loss": 0.0041, "num_tokens": 3927152.0, "reward": 0.82891845703125, "reward_std": 0.012762252241373062, "rewards//mean": 0.82891845703125, "rewards//std": 0.0212725717574358, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1206, "grad_norm": 0.6253176331520081, "kl": 0.03625836200080812, "learning_rate": 4.84814431833255e-06, "loss": 0.0036, "num_tokens": 3933664.0, "reward": 0.828369140625, "reward_std": 0.00994873046875, "rewards//mean": 0.828369140625, "rewards//std": 0.03292476385831833, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1208, "grad_norm": 0.6379534006118774, "kl": 0.03596004471182823, "learning_rate": 4.847599282658483e-06, "loss": 0.0036, "num_tokens": 3940112.0, "reward": 0.83465576171875, "reward_std": 0.012594557367265224, "rewards//mean": 0.83465576171875, "rewards//std": 0.021179860457777977, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.121, "grad_norm": 0.6428269147872925, "kl": 0.04226218955591321, "learning_rate": 4.847053301371706e-06, "loss": 0.0042, "num_tokens": 3946704.0, "reward": 0.83428955078125, "reward_std": 0.010459795594215393, "rewards//mean": 0.83428955078125, "rewards//std": 0.022356484085321426, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1212, "grad_norm": 0.550739586353302, "kl": 0.03827002551406622, "learning_rate": 4.84650637469214e-06, "loss": 0.0038, "num_tokens": 3953160.0, "reward": 0.87030029296875, "reward_std": 0.011451773345470428, "rewards//mean": 0.87030029296875, "rewards//std": 0.0321001335978508, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1214, "grad_norm": 0.6382452249526978, "kl": 0.0332846415694803, "learning_rate": 4.845958502840087e-06, "loss": 0.0033, "num_tokens": 3959576.0, "reward": 0.86163330078125, "reward_std": 0.01746852695941925, "rewards//mean": 0.86163330078125, "rewards//std": 0.02861880511045456, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1216, "grad_norm": 0.7068121433258057, "kl": 0.0464365491643548, "learning_rate": 4.8454096860362284e-06, "loss": 0.0046, "num_tokens": 3966064.0, "reward": 0.79376220703125, "reward_std": 0.010808397084474564, "rewards//mean": 0.79376220703125, "rewards//std": 0.023053860291838646, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1218, "grad_norm": 0.6986582279205322, "kl": 0.040672212140634656, "learning_rate": 4.8448599245016306e-06, "loss": 0.0041, "num_tokens": 3972592.0, "reward": 0.8546142578125, "reward_std": 0.01843278855085373, "rewards//mean": 0.8546142578125, "rewards//std": 0.027085380628705025, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.122, "grad_norm": 0.7402421832084656, "kl": 0.04260013531893492, "learning_rate": 4.844309218457735e-06, "loss": 0.0043, "num_tokens": 3979104.0, "reward": 0.8359375, "reward_std": 0.011350443586707115, "rewards//mean": 0.8359375, "rewards//std": 0.027294989675283432, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1222, "grad_norm": 0.5928390622138977, "kl": 0.04137154296040535, "learning_rate": 4.843757568126366e-06, "loss": 0.0041, "num_tokens": 3985616.0, "reward": 0.8240966796875, "reward_std": 0.01190880499780178, "rewards//mean": 0.8240966796875, "rewards//std": 0.02292521297931671, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1224, "grad_norm": 0.681674599647522, "kl": 0.039917706744745374, "learning_rate": 4.84320497372973e-06, "loss": 0.004, "num_tokens": 3992328.0, "reward": 0.86822509765625, "reward_std": 0.014179814606904984, "rewards//mean": 0.86822509765625, "rewards//std": 0.02591339498758316, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1226, "grad_norm": 0.6957495808601379, "kl": 0.0385527154430747, "learning_rate": 4.8426514354904096e-06, "loss": 0.0039, "num_tokens": 3998832.0, "reward": 0.88836669921875, "reward_std": 0.014827560633420944, "rewards//mean": 0.88836669921875, "rewards//std": 0.03412356227636337, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1228, "grad_norm": 0.6795756220817566, "kl": 0.04442503722384572, "learning_rate": 4.842096953631371e-06, "loss": 0.0044, "num_tokens": 4005336.0, "reward": 0.8365478515625, "reward_std": 0.01342087984085083, "rewards//mean": 0.8365478515625, "rewards//std": 0.026831572875380516, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.123, "grad_norm": 0.6525948643684387, "kl": 0.04063747264444828, "learning_rate": 4.841541528375961e-06, "loss": 0.0041, "num_tokens": 4011760.0, "reward": 0.87091064453125, "reward_std": 0.016083110123872757, "rewards//mean": 0.87091064453125, "rewards//std": 0.02499214932322502, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1232, "grad_norm": 0.721349835395813, "kl": 0.036919957026839256, "learning_rate": 4.840985159947902e-06, "loss": 0.0037, "num_tokens": 4018352.0, "reward": 0.797607421875, "reward_std": 0.010739820078015327, "rewards//mean": 0.797607421875, "rewards//std": 0.021109074354171753, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1234, "grad_norm": 0.614031970500946, "kl": 0.041909544728696346, "learning_rate": 4.8404278485713005e-06, "loss": 0.0042, "num_tokens": 4024936.0, "reward": 0.86962890625, "reward_std": 0.015312773175537586, "rewards//mean": 0.86962890625, "rewards//std": 0.04199381172657013, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1236, "grad_norm": 0.7404701113700867, "kl": 0.037076502572745085, "learning_rate": 4.839869594470642e-06, "loss": 0.0037, "num_tokens": 4031464.0, "reward": 0.85089111328125, "reward_std": 0.013311058282852173, "rewards//mean": 0.85089111328125, "rewards//std": 0.02026229538023472, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1238, "grad_norm": 0.6280961036682129, "kl": 0.04641252523288131, "learning_rate": 4.839310397870791e-06, "loss": 0.0046, "num_tokens": 4038104.0, "reward": 0.82000732421875, "reward_std": 0.010196110233664513, "rewards//mean": 0.82000732421875, "rewards//std": 0.02073199860751629, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.124, "grad_norm": 0.6301640272140503, "kl": 0.03853551088832319, "learning_rate": 4.838750258996992e-06, "loss": 0.0039, "num_tokens": 4044672.0, "reward": 0.827392578125, "reward_std": 0.010251728817820549, "rewards//mean": 0.827392578125, "rewards//std": 0.032954175025224686, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1242, "grad_norm": 0.7439788579940796, "kl": 0.03770592529326677, "learning_rate": 4.838189178074867e-06, "loss": 0.0038, "num_tokens": 4051200.0, "reward": 0.833740234375, "reward_std": 0.01518142782151699, "rewards//mean": 0.833740234375, "rewards//std": 0.019160354509949684, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1244, "grad_norm": 0.6230072379112244, "kl": 0.043836275581270456, "learning_rate": 4.837627155330421e-06, "loss": 0.0044, "num_tokens": 4057696.0, "reward": 0.869873046875, "reward_std": 0.010131296701729298, "rewards//mean": 0.869873046875, "rewards//std": 0.026873571798205376, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1246, "grad_norm": 0.7052754759788513, "kl": 0.040967176435515285, "learning_rate": 4.837064190990036e-06, "loss": 0.0041, "num_tokens": 4064232.0, "reward": 0.79345703125, "reward_std": 0.00889979861676693, "rewards//mean": 0.79345703125, "rewards//std": 0.028590822592377663, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1248, "grad_norm": 0.6665441393852234, "kl": 0.04415145283564925, "learning_rate": 4.836500285280476e-06, "loss": 0.0044, "num_tokens": 4070696.0, "reward": 0.86328125, "reward_std": 0.01058972254395485, "rewards//mean": 0.86328125, "rewards//std": 0.02436969056725502, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.125, "grad_norm": 0.6481565237045288, "kl": 0.032439467730000615, "learning_rate": 4.83593543842888e-06, "loss": 0.0032, "num_tokens": 4077192.0, "reward": 0.88177490234375, "reward_std": 0.015364980325102806, "rewards//mean": 0.88177490234375, "rewards//std": 0.027341466397047043, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1252, "grad_norm": 0.6698442697525024, "kl": 0.04040778544731438, "learning_rate": 4.835369650662767e-06, "loss": 0.004, "num_tokens": 4083736.0, "reward": 0.8175048828125, "reward_std": 0.013933517970144749, "rewards//mean": 0.8175048828125, "rewards//std": 0.032722536474466324, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1254, "grad_norm": 0.6354655623435974, "kl": 0.03824352379888296, "learning_rate": 4.83480292221004e-06, "loss": 0.0038, "num_tokens": 4090264.0, "reward": 0.8193359375, "reward_std": 0.011118542402982712, "rewards//mean": 0.8193359375, "rewards//std": 0.0255344957113266, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1256, "grad_norm": 0.5788552165031433, "kl": 0.036963594146072865, "learning_rate": 4.834235253298973e-06, "loss": 0.0037, "num_tokens": 4096880.0, "reward": 0.8267822265625, "reward_std": 0.012452752329409122, "rewards//mean": 0.8267822265625, "rewards//std": 0.023012209683656693, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1258, "grad_norm": 0.6248599290847778, "kl": 0.04226681496948004, "learning_rate": 4.833666644158227e-06, "loss": 0.0042, "num_tokens": 4103368.0, "reward": 0.84954833984375, "reward_std": 0.008312016725540161, "rewards//mean": 0.84954833984375, "rewards//std": 0.021179145202040672, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.126, "grad_norm": 0.6156426072120667, "kl": 0.03705371776595712, "learning_rate": 4.833097095016835e-06, "loss": 0.0037, "num_tokens": 4109936.0, "reward": 0.83428955078125, "reward_std": 0.012196335941553116, "rewards//mean": 0.83428955078125, "rewards//std": 0.018437961116433144, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1262, "grad_norm": 0.6548982858657837, "kl": 0.038468639831990004, "learning_rate": 4.832526606104213e-06, "loss": 0.0038, "num_tokens": 4116472.0, "reward": 0.84368896484375, "reward_std": 0.013091469183564186, "rewards//mean": 0.84368896484375, "rewards//std": 0.029972108080983162, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1264, "grad_norm": 0.7338609099388123, "kl": 0.038367170840501785, "learning_rate": 4.831955177650153e-06, "loss": 0.0038, "num_tokens": 4122992.0, "reward": 0.820556640625, "reward_std": 0.01122078113257885, "rewards//mean": 0.820556640625, "rewards//std": 0.025542795658111572, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1266, "grad_norm": 0.7307354211807251, "kl": 0.04504147078841925, "learning_rate": 4.831382809884826e-06, "loss": 0.0045, "num_tokens": 4129504.0, "reward": 0.84442138671875, "reward_std": 0.01341638807207346, "rewards//mean": 0.84442138671875, "rewards//std": 0.03062807209789753, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1268, "grad_norm": 0.6589616537094116, "kl": 0.0420961850322783, "learning_rate": 4.830809503038781e-06, "loss": 0.0042, "num_tokens": 4136008.0, "reward": 0.84881591796875, "reward_std": 0.014670501463115215, "rewards//mean": 0.84881591796875, "rewards//std": 0.0334705226123333, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.127, "grad_norm": 0.5887529850006104, "kl": 0.036529581528156996, "learning_rate": 4.830235257342948e-06, "loss": 0.0037, "num_tokens": 4142480.0, "reward": 0.811279296875, "reward_std": 0.008088908158242702, "rewards//mean": 0.811279296875, "rewards//std": 0.022128576412796974, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1272, "grad_norm": 0.6737771034240723, "kl": 0.039779233280569315, "learning_rate": 4.829660073028631e-06, "loss": 0.004, "num_tokens": 4149000.0, "reward": 0.858642578125, "reward_std": 0.015279535204172134, "rewards//mean": 0.858642578125, "rewards//std": 0.03358030319213867, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1274, "grad_norm": 0.6876733303070068, "kl": 0.03895897837355733, "learning_rate": 4.829083950327516e-06, "loss": 0.0039, "num_tokens": 4155560.0, "reward": 0.82916259765625, "reward_std": 0.01598961651325226, "rewards//mean": 0.82916259765625, "rewards//std": 0.033176615834236145, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1276, "grad_norm": 0.6112537384033203, "kl": 0.03786856750957668, "learning_rate": 4.828506889471664e-06, "loss": 0.0038, "num_tokens": 4162040.0, "reward": 0.83795166015625, "reward_std": 0.00955367460846901, "rewards//mean": 0.83795166015625, "rewards//std": 0.0371297150850296, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1278, "grad_norm": 0.6563738584518433, "kl": 0.039538444718346, "learning_rate": 4.827928890693515e-06, "loss": 0.004, "num_tokens": 4168552.0, "reward": 0.8182373046875, "reward_std": 0.013406902551651001, "rewards//mean": 0.8182373046875, "rewards//std": 0.019978374242782593, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.128, "grad_norm": 0.6196538805961609, "kl": 0.04083075327798724, "learning_rate": 4.8273499542258885e-06, "loss": 0.0041, "num_tokens": 4175080.0, "reward": 0.82525634765625, "reward_std": 0.01369745098054409, "rewards//mean": 0.82525634765625, "rewards//std": 0.027859212830662727, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1282, "grad_norm": 0.6548612713813782, "kl": 0.04186346894130111, "learning_rate": 4.826770080301978e-06, "loss": 0.0042, "num_tokens": 4181592.0, "reward": 0.8125, "reward_std": 0.013666237704455853, "rewards//mean": 0.8125, "rewards//std": 0.02883964590728283, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1284, "grad_norm": 0.6164513826370239, "kl": 0.03509138128720224, "learning_rate": 4.826189269155357e-06, "loss": 0.0035, "num_tokens": 4188112.0, "reward": 0.81109619140625, "reward_std": 0.011790347285568714, "rewards//mean": 0.81109619140625, "rewards//std": 0.027990398928523064, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1286, "grad_norm": 0.5807366967201233, "kl": 0.03635508916340768, "learning_rate": 4.825607521019978e-06, "loss": 0.0036, "num_tokens": 4194632.0, "reward": 0.857666015625, "reward_std": 0.010109765455126762, "rewards//mean": 0.857666015625, "rewards//std": 0.021864313632249832, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1288, "grad_norm": 0.6619797945022583, "kl": 0.036573923425748944, "learning_rate": 4.825024836130166e-06, "loss": 0.0037, "num_tokens": 4201152.0, "reward": 0.88934326171875, "reward_std": 0.011922219768166542, "rewards//mean": 0.88934326171875, "rewards//std": 0.023908693343400955, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.129, "grad_norm": 0.7236711382865906, "kl": 0.031774683156982064, "learning_rate": 4.824441214720629e-06, "loss": 0.0032, "num_tokens": 4207632.0, "reward": 0.88836669921875, "reward_std": 0.010845713317394257, "rewards//mean": 0.88836669921875, "rewards//std": 0.02806331403553486, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1292, "grad_norm": 0.617940366268158, "kl": 0.035609227837994695, "learning_rate": 4.823856657026448e-06, "loss": 0.0036, "num_tokens": 4214288.0, "reward": 0.84954833984375, "reward_std": 0.011059116572141647, "rewards//mean": 0.84954833984375, "rewards//std": 0.019783737137913704, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1294, "grad_norm": 0.6680132150650024, "kl": 0.03969927388243377, "learning_rate": 4.823271163283084e-06, "loss": 0.004, "num_tokens": 4220816.0, "reward": 0.82989501953125, "reward_std": 0.011213782243430614, "rewards//mean": 0.82989501953125, "rewards//std": 0.02739180251955986, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1296, "grad_norm": 0.657684326171875, "kl": 0.03403781563974917, "learning_rate": 4.822684733726373e-06, "loss": 0.0034, "num_tokens": 4227408.0, "reward": 0.79302978515625, "reward_std": 0.01171116717159748, "rewards//mean": 0.79302978515625, "rewards//std": 0.02404443360865116, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1298, "grad_norm": 0.6134021282196045, "kl": 0.03312623081728816, "learning_rate": 4.822097368592529e-06, "loss": 0.0033, "num_tokens": 4233896.0, "reward": 0.87103271484375, "reward_std": 0.015468908473849297, "rewards//mean": 0.87103271484375, "rewards//std": 0.031067268922924995, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.13, "grad_norm": 0.6155107617378235, "kl": 0.03560430929064751, "learning_rate": 4.821509068118143e-06, "loss": 0.0036, "num_tokens": 4240480.0, "reward": 0.8189697265625, "reward_std": 0.010782364755868912, "rewards//mean": 0.8189697265625, "rewards//std": 0.027780653908848763, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1302, "grad_norm": 0.6770623922348022, "kl": 0.034555776976048946, "learning_rate": 4.8209198325401815e-06, "loss": 0.0035, "num_tokens": 4246936.0, "reward": 0.80694580078125, "reward_std": 0.00983287114650011, "rewards//mean": 0.80694580078125, "rewards//std": 0.015606461092829704, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1304, "grad_norm": 0.674925684928894, "kl": 0.03344253194518387, "learning_rate": 4.82032966209599e-06, "loss": 0.0033, "num_tokens": 4253448.0, "reward": 0.85491943359375, "reward_std": 0.011441394686698914, "rewards//mean": 0.85491943359375, "rewards//std": 0.0302005335688591, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1306, "grad_norm": 0.623838484287262, "kl": 0.04128697048872709, "learning_rate": 4.819738557023287e-06, "loss": 0.0041, "num_tokens": 4260000.0, "reward": 0.873779296875, "reward_std": 0.012213251553475857, "rewards//mean": 0.873779296875, "rewards//std": 0.02301911450922489, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1308, "grad_norm": 0.6719110608100891, "kl": 0.03923142282292247, "learning_rate": 4.819146517560171e-06, "loss": 0.0039, "num_tokens": 4266544.0, "reward": 0.84075927734375, "reward_std": 0.011564802378416061, "rewards//mean": 0.84075927734375, "rewards//std": 0.021539175882935524, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.131, "grad_norm": 0.6305099129676819, "kl": 0.03507873183116317, "learning_rate": 4.818553543945115e-06, "loss": 0.0035, "num_tokens": 4273048.0, "reward": 0.85064697265625, "reward_std": 0.01198691874742508, "rewards//mean": 0.85064697265625, "rewards//std": 0.02057294361293316, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1312, "grad_norm": 0.6202968955039978, "kl": 0.032768177799880505, "learning_rate": 4.817959636416969e-06, "loss": 0.0033, "num_tokens": 4279552.0, "reward": 0.857177734375, "reward_std": 0.017117980867624283, "rewards//mean": 0.857177734375, "rewards//std": 0.03547430410981178, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1314, "grad_norm": 0.6903461217880249, "kl": 0.04518736433237791, "learning_rate": 4.8173647952149584e-06, "loss": 0.0045, "num_tokens": 4286096.0, "reward": 0.775634765625, "reward_std": 0.009762182831764221, "rewards//mean": 0.775634765625, "rewards//std": 0.023197297006845474, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1316, "grad_norm": 0.6239075064659119, "kl": 0.03696441138163209, "learning_rate": 4.816769020578685e-06, "loss": 0.0037, "num_tokens": 4292704.0, "reward": 0.84454345703125, "reward_std": 0.010429391637444496, "rewards//mean": 0.84454345703125, "rewards//std": 0.03256458044052124, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1318, "grad_norm": 0.6943949460983276, "kl": 0.03952537663280964, "learning_rate": 4.816172312748128e-06, "loss": 0.004, "num_tokens": 4299208.0, "reward": 0.85125732421875, "reward_std": 0.00947652943432331, "rewards//mean": 0.85125732421875, "rewards//std": 0.01891135238111019, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.132, "grad_norm": 0.6111933588981628, "kl": 0.03215678269043565, "learning_rate": 4.81557467196364e-06, "loss": 0.0032, "num_tokens": 4305728.0, "reward": 0.841064453125, "reward_std": 0.010960794985294342, "rewards//mean": 0.841064453125, "rewards//std": 0.014530673623085022, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1322, "grad_norm": 0.6737866401672363, "kl": 0.036603975808247924, "learning_rate": 4.814976098465951e-06, "loss": 0.0037, "num_tokens": 4312296.0, "reward": 0.81103515625, "reward_std": 0.008650883100926876, "rewards//mean": 0.81103515625, "rewards//std": 0.015570652671158314, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1324, "grad_norm": 0.6496907472610474, "kl": 0.040806922828778625, "learning_rate": 4.814376592496167e-06, "loss": 0.0041, "num_tokens": 4318760.0, "reward": 0.8494873046875, "reward_std": 0.01330764964222908, "rewards//mean": 0.8494873046875, "rewards//std": 0.026180516928434372, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1326, "grad_norm": 0.6427296996116638, "kl": 0.034578942228108644, "learning_rate": 4.813776154295767e-06, "loss": 0.0035, "num_tokens": 4325256.0, "reward": 0.850341796875, "reward_std": 0.011855723336338997, "rewards//mean": 0.850341796875, "rewards//std": 0.028415564447641373, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1328, "grad_norm": 0.6374642252922058, "kl": 0.03211224218830466, "learning_rate": 4.81317478410661e-06, "loss": 0.0032, "num_tokens": 4331768.0, "reward": 0.80657958984375, "reward_std": 0.0074308766052126884, "rewards//mean": 0.80657958984375, "rewards//std": 0.012738611549139023, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.133, "grad_norm": 0.7321417927742004, "kl": 0.04336266568861902, "learning_rate": 4.812572482170926e-06, "loss": 0.0043, "num_tokens": 4338224.0, "reward": 0.84429931640625, "reward_std": 0.01558034960180521, "rewards//mean": 0.84429931640625, "rewards//std": 0.02651967667043209, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1332, "grad_norm": 0.6395056843757629, "kl": 0.03777423477731645, "learning_rate": 4.811969248731323e-06, "loss": 0.0038, "num_tokens": 4344768.0, "reward": 0.81304931640625, "reward_std": 0.01339129637926817, "rewards//mean": 0.81304931640625, "rewards//std": 0.02242206782102585, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1334, "grad_norm": 0.6424086689949036, "kl": 0.036166464909911156, "learning_rate": 4.811365084030784e-06, "loss": 0.0036, "num_tokens": 4351168.0, "reward": 0.84857177734375, "reward_std": 0.011064697988331318, "rewards//mean": 0.84857177734375, "rewards//std": 0.02043117955327034, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1336, "grad_norm": 0.654913604259491, "kl": 0.03382642939686775, "learning_rate": 4.8107599883126634e-06, "loss": 0.0034, "num_tokens": 4357736.0, "reward": 0.81500244140625, "reward_std": 0.0177701935172081, "rewards//mean": 0.81500244140625, "rewards//std": 0.03539564833045006, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1338, "grad_norm": 0.6352917551994324, "kl": 0.0319740588311106, "learning_rate": 4.810153961820697e-06, "loss": 0.0032, "num_tokens": 4364320.0, "reward": 0.824951171875, "reward_std": 0.012290971353650093, "rewards//mean": 0.824951171875, "rewards//std": 0.023291075602173805, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.134, "grad_norm": 0.6589383482933044, "kl": 0.032449120189994574, "learning_rate": 4.809547004798991e-06, "loss": 0.0032, "num_tokens": 4370872.0, "reward": 0.8184814453125, "reward_std": 0.013435855507850647, "rewards//mean": 0.8184814453125, "rewards//std": 0.018970992416143417, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1342, "grad_norm": 0.6411390900611877, "kl": 0.03467783285304904, "learning_rate": 4.808939117492028e-06, "loss": 0.0035, "num_tokens": 4377256.0, "reward": 0.8438720703125, "reward_std": 0.009257117286324501, "rewards//mean": 0.8438720703125, "rewards//std": 0.031696103513240814, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1344, "grad_norm": 0.666558027267456, "kl": 0.037281798431649804, "learning_rate": 4.808330300144664e-06, "loss": 0.0037, "num_tokens": 4383744.0, "reward": 0.82806396484375, "reward_std": 0.011676324531435966, "rewards//mean": 0.82806396484375, "rewards//std": 0.01860387995839119, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1346, "grad_norm": 0.7036078572273254, "kl": 0.041853292379528284, "learning_rate": 4.807720553002132e-06, "loss": 0.0042, "num_tokens": 4390200.0, "reward": 0.83892822265625, "reward_std": 0.009568301029503345, "rewards//mean": 0.83892822265625, "rewards//std": 0.03169437125325203, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1348, "grad_norm": 0.6020171046257019, "kl": 0.03041183133609593, "learning_rate": 4.807109876310037e-06, "loss": 0.003, "num_tokens": 4396848.0, "reward": 0.853759765625, "reward_std": 0.011211195029318333, "rewards//mean": 0.853759765625, "rewards//std": 0.02409859374165535, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.135, "grad_norm": 0.6017598509788513, "kl": 0.03304919390939176, "learning_rate": 4.806498270314359e-06, "loss": 0.0033, "num_tokens": 4403320.0, "reward": 0.83056640625, "reward_std": 0.013944298960268497, "rewards//mean": 0.83056640625, "rewards//std": 0.03543758764863014, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1352, "grad_norm": 0.6396259665489197, "kl": 0.035610402934253216, "learning_rate": 4.805885735261454e-06, "loss": 0.0036, "num_tokens": 4409832.0, "reward": 0.8157958984375, "reward_std": 0.014298718422651291, "rewards//mean": 0.8157958984375, "rewards//std": 0.025393905118107796, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1354, "grad_norm": 0.6395999789237976, "kl": 0.034702015575021505, "learning_rate": 4.805272271398051e-06, "loss": 0.0035, "num_tokens": 4416296.0, "reward": 0.8299560546875, "reward_std": 0.010690449737012386, "rewards//mean": 0.8299560546875, "rewards//std": 0.019512252882122993, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1356, "grad_norm": 0.6502525210380554, "kl": 0.03755046101287007, "learning_rate": 4.804657878971252e-06, "loss": 0.0038, "num_tokens": 4422784.0, "reward": 0.84521484375, "reward_std": 0.010795504786074162, "rewards//mean": 0.84521484375, "rewards//std": 0.022973036393523216, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1358, "grad_norm": 0.6069939136505127, "kl": 0.03229914209805429, "learning_rate": 4.804042558228535e-06, "loss": 0.0032, "num_tokens": 4429288.0, "reward": 0.86700439453125, "reward_std": 0.013926582410931587, "rewards//mean": 0.86700439453125, "rewards//std": 0.027930304408073425, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.136, "grad_norm": 1.218776822090149, "kl": 0.037880075396969914, "learning_rate": 4.803426309417752e-06, "loss": 0.0038, "num_tokens": 4435824.0, "reward": 0.86199951171875, "reward_std": 0.010651204735040665, "rewards//mean": 0.86199951171875, "rewards//std": 0.022628381848335266, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1362, "grad_norm": 0.6532304883003235, "kl": 0.035220161313191056, "learning_rate": 4.802809132787125e-06, "loss": 0.0035, "num_tokens": 4442240.0, "reward": 0.8717041015625, "reward_std": 0.012849440798163414, "rewards//mean": 0.8717041015625, "rewards//std": 0.02241506241261959, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1364, "grad_norm": 0.7135369181632996, "kl": 0.03639285918325186, "learning_rate": 4.802191028585257e-06, "loss": 0.0036, "num_tokens": 4448760.0, "reward": 0.83935546875, "reward_std": 0.009929399937391281, "rewards//mean": 0.83935546875, "rewards//std": 0.025614995509386063, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1366, "grad_norm": 0.7048020362854004, "kl": 0.04157875198870897, "learning_rate": 4.801571997061117e-06, "loss": 0.0042, "num_tokens": 4455248.0, "reward": 0.77557373046875, "reward_std": 0.01009085588157177, "rewards//mean": 0.77557373046875, "rewards//std": 0.016904599964618683, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1368, "grad_norm": 0.6199202537536621, "kl": 0.035192727111279964, "learning_rate": 4.800952038464051e-06, "loss": 0.0035, "num_tokens": 4461736.0, "reward": 0.80517578125, "reward_std": 0.00936543196439743, "rewards//mean": 0.80517578125, "rewards//std": 0.0161209125071764, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.137, "grad_norm": 0.5773229002952576, "kl": 0.030090483836829662, "learning_rate": 4.800331153043781e-06, "loss": 0.003, "num_tokens": 4468272.0, "reward": 0.850341796875, "reward_std": 0.009957228787243366, "rewards//mean": 0.850341796875, "rewards//std": 0.024318700656294823, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1372, "grad_norm": 0.5711527466773987, "kl": 0.034664512844756246, "learning_rate": 4.799709341050397e-06, "loss": 0.0035, "num_tokens": 4474696.0, "reward": 0.8583984375, "reward_std": 0.012437745928764343, "rewards//mean": 0.8583984375, "rewards//std": 0.02202022820711136, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1374, "grad_norm": 0.6654622554779053, "kl": 0.03352417494170368, "learning_rate": 4.799086602734364e-06, "loss": 0.0034, "num_tokens": 4481176.0, "reward": 0.869140625, "reward_std": 0.013702813535928726, "rewards//mean": 0.869140625, "rewards//std": 0.031673409044742584, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1376, "grad_norm": 0.7128596305847168, "kl": 0.038529411889612675, "learning_rate": 4.798462938346524e-06, "loss": 0.0039, "num_tokens": 4487768.0, "reward": 0.8201904296875, "reward_std": 0.015007439069449902, "rewards//mean": 0.8201904296875, "rewards//std": 0.021631482988595963, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1378, "grad_norm": 0.6258558034896851, "kl": 0.038319848012179136, "learning_rate": 4.7978383481380865e-06, "loss": 0.0038, "num_tokens": 4494224.0, "reward": 0.833984375, "reward_std": 0.018096666783094406, "rewards//mean": 0.833984375, "rewards//std": 0.0328446701169014, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.138, "grad_norm": 0.7074935436248779, "kl": 0.033116700826212764, "learning_rate": 4.797212832360637e-06, "loss": 0.0033, "num_tokens": 4500672.0, "reward": 0.86041259765625, "reward_std": 0.014863163232803345, "rewards//mean": 0.86041259765625, "rewards//std": 0.03233366087079048, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1382, "grad_norm": 0.6275817155838013, "kl": 0.036013098899275064, "learning_rate": 4.796586391266135e-06, "loss": 0.0036, "num_tokens": 4507152.0, "reward": 0.794921875, "reward_std": 0.009963938035070896, "rewards//mean": 0.794921875, "rewards//std": 0.03113352507352829, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1384, "grad_norm": 0.6234140992164612, "kl": 0.03075044380966574, "learning_rate": 4.795959025106907e-06, "loss": 0.0031, "num_tokens": 4513600.0, "reward": 0.80718994140625, "reward_std": 0.012309936806559563, "rewards//mean": 0.80718994140625, "rewards//std": 0.03157665953040123, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1386, "grad_norm": 0.6930115222930908, "kl": 0.03649276262149215, "learning_rate": 4.7953307341356595e-06, "loss": 0.0036, "num_tokens": 4520136.0, "reward": 0.83502197265625, "reward_std": 0.011502781882882118, "rewards//mean": 0.83502197265625, "rewards//std": 0.02733703702688217, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1388, "grad_norm": 0.7128375172615051, "kl": 0.03017679904587567, "learning_rate": 4.794701518605467e-06, "loss": 0.003, "num_tokens": 4526608.0, "reward": 0.83087158203125, "reward_std": 0.014898988418281078, "rewards//mean": 0.83087158203125, "rewards//std": 0.0271587036550045, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.139, "grad_norm": 0.6921178698539734, "kl": 0.03340840828604996, "learning_rate": 4.794071378769776e-06, "loss": 0.0033, "num_tokens": 4533112.0, "reward": 0.83184814453125, "reward_std": 0.009263399988412857, "rewards//mean": 0.83184814453125, "rewards//std": 0.02486584521830082, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1392, "grad_norm": 0.6168423295021057, "kl": 0.03176119248382747, "learning_rate": 4.7934403148824085e-06, "loss": 0.0032, "num_tokens": 4539592.0, "reward": 0.83331298828125, "reward_std": 0.011103109456598759, "rewards//mean": 0.83331298828125, "rewards//std": 0.028772840276360512, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1394, "grad_norm": 0.6904219388961792, "kl": 0.03471069340594113, "learning_rate": 4.792808327197556e-06, "loss": 0.0035, "num_tokens": 4546088.0, "reward": 0.855712890625, "reward_std": 0.013138792477548122, "rewards//mean": 0.855712890625, "rewards//std": 0.027080070227384567, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1396, "grad_norm": 0.6291631460189819, "kl": 0.035946730989962816, "learning_rate": 4.792175415969786e-06, "loss": 0.0036, "num_tokens": 4552576.0, "reward": 0.8463134765625, "reward_std": 0.01710490882396698, "rewards//mean": 0.8463134765625, "rewards//std": 0.03874010592699051, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1398, "grad_norm": 0.7094471454620361, "kl": 0.03311455622315407, "learning_rate": 4.79154158145403e-06, "loss": 0.0033, "num_tokens": 4559080.0, "reward": 0.83734130859375, "reward_std": 0.012111399322748184, "rewards//mean": 0.83734130859375, "rewards//std": 0.02404317446053028, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.14, "grad_norm": 0.5819594860076904, "kl": 0.030330684036016464, "learning_rate": 4.790906823905599e-06, "loss": 0.003, "num_tokens": 4565544.0, "reward": 0.8709716796875, "reward_std": 0.007809075061231852, "rewards//mean": 0.8709716796875, "rewards//std": 0.017934059724211693, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1402, "grad_norm": 0.6491738557815552, "kl": 0.03869724064134061, "learning_rate": 4.790271143580174e-06, "loss": 0.0039, "num_tokens": 4572000.0, "reward": 0.88092041015625, "reward_std": 0.008929949253797531, "rewards//mean": 0.88092041015625, "rewards//std": 0.018714213743805885, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1404, "grad_norm": 0.6010414958000183, "kl": 0.030946203973144293, "learning_rate": 4.789634540733807e-06, "loss": 0.0031, "num_tokens": 4578432.0, "reward": 0.8525390625, "reward_std": 0.01049017533659935, "rewards//mean": 0.8525390625, "rewards//std": 0.024150049313902855, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1406, "grad_norm": 0.6383006572723389, "kl": 0.03664790280163288, "learning_rate": 4.78899701562292e-06, "loss": 0.0037, "num_tokens": 4584928.0, "reward": 0.88543701171875, "reward_std": 0.0163101963698864, "rewards//mean": 0.88543701171875, "rewards//std": 0.04444585740566254, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1408, "grad_norm": 0.6771392226219177, "kl": 0.027444692328572273, "learning_rate": 4.788358568504308e-06, "loss": 0.0027, "num_tokens": 4591416.0, "reward": 0.81182861328125, "reward_std": 0.011187758296728134, "rewards//mean": 0.81182861328125, "rewards//std": 0.029450898990035057, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.141, "grad_norm": 0.6804143190383911, "kl": 0.030982508091256022, "learning_rate": 4.78771919963514e-06, "loss": 0.0031, "num_tokens": 4597880.0, "reward": 0.86578369140625, "reward_std": 0.016307076439261436, "rewards//mean": 0.86578369140625, "rewards//std": 0.03342481330037117, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1412, "grad_norm": 0.6565612554550171, "kl": 0.032242163084447384, "learning_rate": 4.787078909272951e-06, "loss": 0.0032, "num_tokens": 4604512.0, "reward": 0.85791015625, "reward_std": 0.01364915445446968, "rewards//mean": 0.85791015625, "rewards//std": 0.033476460725069046, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1414, "grad_norm": 0.6641219854354858, "kl": 0.032242062501609325, "learning_rate": 4.786437697675651e-06, "loss": 0.0032, "num_tokens": 4611120.0, "reward": 0.82781982421875, "reward_std": 0.009526567533612251, "rewards//mean": 0.82781982421875, "rewards//std": 0.02961287833750248, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1416, "grad_norm": 0.6178065538406372, "kl": 0.033733992371708155, "learning_rate": 4.78579556510152e-06, "loss": 0.0034, "num_tokens": 4617624.0, "reward": 0.8480224609375, "reward_std": 0.009033482521772385, "rewards//mean": 0.8480224609375, "rewards//std": 0.030951879918575287, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1418, "grad_norm": 0.6321604251861572, "kl": 0.034345758613198996, "learning_rate": 4.785152511809208e-06, "loss": 0.0034, "num_tokens": 4624160.0, "reward": 0.823486328125, "reward_std": 0.010395782068371773, "rewards//mean": 0.823486328125, "rewards//std": 0.02851766347885132, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.142, "grad_norm": 0.6739100813865662, "kl": 0.038603739347308874, "learning_rate": 4.784508538057738e-06, "loss": 0.0039, "num_tokens": 4630568.0, "reward": 0.8701171875, "reward_std": 0.012175248935818672, "rewards//mean": 0.8701171875, "rewards//std": 0.026666609570384026, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1422, "grad_norm": 0.6938429474830627, "kl": 0.0384972074534744, "learning_rate": 4.783863644106502e-06, "loss": 0.0038, "num_tokens": 4637120.0, "reward": 0.8455810546875, "reward_std": 0.020297762006521225, "rewards//mean": 0.8455810546875, "rewards//std": 0.03314540535211563, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1424, "grad_norm": 0.6242883205413818, "kl": 0.034390477230772376, "learning_rate": 4.783217830215264e-06, "loss": 0.0034, "num_tokens": 4643592.0, "reward": 0.82000732421875, "reward_std": 0.009873378090560436, "rewards//mean": 0.82000732421875, "rewards//std": 0.017734911292791367, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1426, "grad_norm": 0.635278046131134, "kl": 0.03100576065480709, "learning_rate": 4.782571096644157e-06, "loss": 0.0031, "num_tokens": 4650128.0, "reward": 0.85980224609375, "reward_std": 0.012011419981718063, "rewards//mean": 0.85980224609375, "rewards//std": 0.027970923110842705, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1428, "grad_norm": 0.6182490587234497, "kl": 0.03299332037568092, "learning_rate": 4.7819234436536845e-06, "loss": 0.0033, "num_tokens": 4656776.0, "reward": 0.84429931640625, "reward_std": 0.016069753095507622, "rewards//mean": 0.84429931640625, "rewards//std": 0.026299571618437767, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.143, "grad_norm": 0.7233075499534607, "kl": 0.03955763205885887, "learning_rate": 4.781274871504722e-06, "loss": 0.004, "num_tokens": 4663352.0, "reward": 0.86322021484375, "reward_std": 0.009645191952586174, "rewards//mean": 0.86322021484375, "rewards//std": 0.01891375333070755, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1432, "grad_norm": 0.6025941967964172, "kl": 0.03357991902157664, "learning_rate": 4.780625380458513e-06, "loss": 0.0034, "num_tokens": 4669752.0, "reward": 0.83551025390625, "reward_std": 0.009832639247179031, "rewards//mean": 0.83551025390625, "rewards//std": 0.022564740851521492, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1434, "grad_norm": 0.6996767520904541, "kl": 0.03547016205266118, "learning_rate": 4.7799749707766754e-06, "loss": 0.0035, "num_tokens": 4676280.0, "reward": 0.8021240234375, "reward_std": 0.013020787388086319, "rewards//mean": 0.8021240234375, "rewards//std": 0.03197190538048744, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1436, "grad_norm": 0.7296597361564636, "kl": 0.039803872583433986, "learning_rate": 4.779323642721191e-06, "loss": 0.004, "num_tokens": 4682800.0, "reward": 0.8380126953125, "reward_std": 0.014720786362886429, "rewards//mean": 0.8380126953125, "rewards//std": 0.03200786933302879, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1438, "grad_norm": 0.5901254415512085, "kl": 0.04078292986378074, "learning_rate": 4.778671396554417e-06, "loss": 0.0041, "num_tokens": 4689280.0, "reward": 0.83428955078125, "reward_std": 0.012603724375367165, "rewards//mean": 0.83428955078125, "rewards//std": 0.04113903269171715, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.144, "grad_norm": 0.6492975950241089, "kl": 0.04299718514084816, "learning_rate": 4.778018232539075e-06, "loss": 0.0043, "num_tokens": 4695808.0, "reward": 0.875244140625, "reward_std": 0.01032957248389721, "rewards//mean": 0.875244140625, "rewards//std": 0.022583600133657455, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1442, "grad_norm": 0.6899735927581787, "kl": 0.037773449905216694, "learning_rate": 4.777364150938263e-06, "loss": 0.0038, "num_tokens": 4702496.0, "reward": 0.81561279296875, "reward_std": 0.01541346125304699, "rewards//mean": 0.81561279296875, "rewards//std": 0.03828706964850426, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1444, "grad_norm": 0.6682742238044739, "kl": 0.03817844996228814, "learning_rate": 4.776709152015443e-06, "loss": 0.0038, "num_tokens": 4709128.0, "reward": 0.833251953125, "reward_std": 0.013756323605775833, "rewards//mean": 0.833251953125, "rewards//std": 0.027231693267822266, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1446, "grad_norm": 0.6066592931747437, "kl": 0.03351732715964317, "learning_rate": 4.776053236034449e-06, "loss": 0.0034, "num_tokens": 4715600.0, "reward": 0.83123779296875, "reward_std": 0.01098795235157013, "rewards//mean": 0.83123779296875, "rewards//std": 0.022230185568332672, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1448, "grad_norm": 0.5982468128204346, "kl": 0.031167512759566307, "learning_rate": 4.775396403259483e-06, "loss": 0.0031, "num_tokens": 4722040.0, "reward": 0.86956787109375, "reward_std": 0.012180990539491177, "rewards//mean": 0.86956787109375, "rewards//std": 0.022354451939463615, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.145, "grad_norm": 0.743056058883667, "kl": 0.041748383082449436, "learning_rate": 4.774738653955119e-06, "loss": 0.0042, "num_tokens": 4728568.0, "reward": 0.86041259765625, "reward_std": 0.020265260711312294, "rewards//mean": 0.86041259765625, "rewards//std": 0.036869507282972336, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1452, "grad_norm": 0.5705842971801758, "kl": 0.025937842903658748, "learning_rate": 4.7740799883862966e-06, "loss": 0.0026, "num_tokens": 4735136.0, "reward": 0.814453125, "reward_std": 0.008926140144467354, "rewards//mean": 0.814453125, "rewards//std": 0.018082424998283386, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1454, "grad_norm": 0.6330570578575134, "kl": 0.03641601069830358, "learning_rate": 4.773420406818327e-06, "loss": 0.0036, "num_tokens": 4741624.0, "reward": 0.85540771484375, "reward_std": 0.009088218212127686, "rewards//mean": 0.85540771484375, "rewards//std": 0.0307695921510458, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1456, "grad_norm": 0.7037932276725769, "kl": 0.03219994530081749, "learning_rate": 4.772759909516889e-06, "loss": 0.0032, "num_tokens": 4748176.0, "reward": 0.86865234375, "reward_std": 0.01569826528429985, "rewards//mean": 0.86865234375, "rewards//std": 0.026277761906385422, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1458, "grad_norm": 0.6403439044952393, "kl": 0.03100749058648944, "learning_rate": 4.772098496748031e-06, "loss": 0.0031, "num_tokens": 4754688.0, "reward": 0.82391357421875, "reward_std": 0.013501830399036407, "rewards//mean": 0.82391357421875, "rewards//std": 0.03286166116595268, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.146, "grad_norm": 0.7018864750862122, "kl": 0.03662884212099016, "learning_rate": 4.7714361687781705e-06, "loss": 0.0037, "num_tokens": 4761136.0, "reward": 0.8272705078125, "reward_std": 0.011517598293721676, "rewards//mean": 0.8272705078125, "rewards//std": 0.025793731212615967, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1462, "grad_norm": 0.7192935347557068, "kl": 0.03436392149887979, "learning_rate": 4.770772925874093e-06, "loss": 0.0034, "num_tokens": 4767784.0, "reward": 0.814208984375, "reward_std": 0.012513557448983192, "rewards//mean": 0.814208984375, "rewards//std": 0.031202493235468864, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1464, "grad_norm": 0.6513822078704834, "kl": 0.03598013357259333, "learning_rate": 4.770108768302953e-06, "loss": 0.0036, "num_tokens": 4774360.0, "reward": 0.8048095703125, "reward_std": 0.011015706695616245, "rewards//mean": 0.8048095703125, "rewards//std": 0.020267244428396225, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1466, "grad_norm": 0.6309319138526917, "kl": 0.03459942061454058, "learning_rate": 4.769443696332272e-06, "loss": 0.0035, "num_tokens": 4780872.0, "reward": 0.85797119140625, "reward_std": 0.013075457885861397, "rewards//mean": 0.85797119140625, "rewards//std": 0.03282524645328522, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1468, "grad_norm": 0.6636326313018799, "kl": 0.04208630672656, "learning_rate": 4.768777710229941e-06, "loss": 0.0042, "num_tokens": 4787488.0, "reward": 0.84857177734375, "reward_std": 0.011825092136859894, "rewards//mean": 0.84857177734375, "rewards//std": 0.03186964988708496, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.147, "grad_norm": 0.6215096712112427, "kl": 0.03159097209572792, "learning_rate": 4.768110810264221e-06, "loss": 0.0032, "num_tokens": 4794088.0, "reward": 0.86846923828125, "reward_std": 0.0126973120495677, "rewards//mean": 0.86846923828125, "rewards//std": 0.03158193454146385, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1472, "grad_norm": 0.8456329703330994, "kl": 0.03009815188124776, "learning_rate": 4.767442996703737e-06, "loss": 0.003, "num_tokens": 4800736.0, "reward": 0.8399658203125, "reward_std": 0.011654919944703579, "rewards//mean": 0.8399658203125, "rewards//std": 0.01840067096054554, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1474, "grad_norm": 0.6510376334190369, "kl": 0.03821468539535999, "learning_rate": 4.7667742698174855e-06, "loss": 0.0038, "num_tokens": 4807216.0, "reward": 0.84649658203125, "reward_std": 0.01183587871491909, "rewards//mean": 0.84649658203125, "rewards//std": 0.022038694471120834, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1476, "grad_norm": 0.7117304801940918, "kl": 0.03541584825143218, "learning_rate": 4.766104629874829e-06, "loss": 0.0035, "num_tokens": 4813776.0, "reward": 0.836669921875, "reward_std": 0.014451714232563972, "rewards//mean": 0.836669921875, "rewards//std": 0.03537174314260483, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1478, "grad_norm": 0.6581489443778992, "kl": 0.03779011429287493, "learning_rate": 4.765434077145499e-06, "loss": 0.0038, "num_tokens": 4820216.0, "reward": 0.79486083984375, "reward_std": 0.009967784397304058, "rewards//mean": 0.79486083984375, "rewards//std": 0.02294658124446869, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.148, "grad_norm": 0.6047603487968445, "kl": 0.03846903960220516, "learning_rate": 4.764762611899593e-06, "loss": 0.0038, "num_tokens": 4826760.0, "reward": 0.8321533203125, "reward_std": 0.014414850622415543, "rewards//mean": 0.8321533203125, "rewards//std": 0.033378418534994125, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1482, "grad_norm": 0.636673092842102, "kl": 0.03530361200682819, "learning_rate": 4.764090234407578e-06, "loss": 0.0035, "num_tokens": 4833336.0, "reward": 0.80633544921875, "reward_std": 0.011949518695473671, "rewards//mean": 0.80633544921875, "rewards//std": 0.033745281398296356, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1484, "grad_norm": 0.6843360066413879, "kl": 0.03777982760220766, "learning_rate": 4.763416944940287e-06, "loss": 0.0038, "num_tokens": 4839888.0, "reward": 0.8756103515625, "reward_std": 0.010489646345376968, "rewards//mean": 0.8756103515625, "rewards//std": 0.052274663001298904, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1486, "grad_norm": 0.6168850064277649, "kl": 0.03202090901322663, "learning_rate": 4.762742743768921e-06, "loss": 0.0032, "num_tokens": 4846600.0, "reward": 0.852294921875, "reward_std": 0.015699390321969986, "rewards//mean": 0.852294921875, "rewards//std": 0.024178866297006607, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1488, "grad_norm": 0.6348932981491089, "kl": 0.03048681584186852, "learning_rate": 4.762067631165049e-06, "loss": 0.003, "num_tokens": 4853128.0, "reward": 0.8388671875, "reward_std": 0.012227789498865604, "rewards//mean": 0.8388671875, "rewards//std": 0.026106689125299454, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.149, "grad_norm": 0.7314891815185547, "kl": 0.03466706443578005, "learning_rate": 4.761391607400606e-06, "loss": 0.0035, "num_tokens": 4859608.0, "reward": 0.79595947265625, "reward_std": 0.01198391243815422, "rewards//mean": 0.79595947265625, "rewards//std": 0.018891330808401108, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1492, "grad_norm": 0.633370041847229, "kl": 0.03609757055528462, "learning_rate": 4.7607146727478935e-06, "loss": 0.0036, "num_tokens": 4866080.0, "reward": 0.84503173828125, "reward_std": 0.008432768285274506, "rewards//mean": 0.84503173828125, "rewards//std": 0.011395055800676346, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1494, "grad_norm": 0.7546378374099731, "kl": 0.03983141761273146, "learning_rate": 4.760036827479582e-06, "loss": 0.004, "num_tokens": 4872552.0, "reward": 0.87335205078125, "reward_std": 0.010378586128354073, "rewards//mean": 0.87335205078125, "rewards//std": 0.023331904783844948, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1496, "grad_norm": 0.7977150678634644, "kl": 0.03384686843492091, "learning_rate": 4.759358071868705e-06, "loss": 0.0034, "num_tokens": 4879048.0, "reward": 0.848388671875, "reward_std": 0.012764126062393188, "rewards//mean": 0.848388671875, "rewards//std": 0.032263416796922684, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1498, "grad_norm": 0.7911922335624695, "kl": 0.03975024959072471, "learning_rate": 4.758678406188668e-06, "loss": 0.004, "num_tokens": 4885656.0, "reward": 0.87164306640625, "reward_std": 0.01325987372547388, "rewards//mean": 0.87164306640625, "rewards//std": 0.02184206061065197, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.15, "grad_norm": 0.5640290975570679, "kl": 0.03782695718109608, "learning_rate": 4.757997830713239e-06, "loss": 0.0038, "num_tokens": 4892056.0, "reward": 0.850830078125, "reward_std": 0.011497659608721733, "rewards//mean": 0.850830078125, "rewards//std": 0.019806774333119392, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1502, "grad_norm": 0.6675816774368286, "kl": 0.038821807596832514, "learning_rate": 4.757316345716554e-06, "loss": 0.0039, "num_tokens": 4898448.0, "reward": 0.8446044921875, "reward_std": 0.015330525115132332, "rewards//mean": 0.8446044921875, "rewards//std": 0.037256550043821335, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1504, "grad_norm": 0.7459909319877625, "kl": 0.03718794463202357, "learning_rate": 4.756633951473114e-06, "loss": 0.0037, "num_tokens": 4905000.0, "reward": 0.8050537109375, "reward_std": 0.011927587911486626, "rewards//mean": 0.8050537109375, "rewards//std": 0.021973086521029472, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1506, "grad_norm": 0.6190351247787476, "kl": 0.03919585421681404, "learning_rate": 4.755950648257789e-06, "loss": 0.0039, "num_tokens": 4911648.0, "reward": 0.8504638671875, "reward_std": 0.013286629691720009, "rewards//mean": 0.8504638671875, "rewards//std": 0.02287762239575386, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1508, "grad_norm": 0.6951397657394409, "kl": 0.03513274388387799, "learning_rate": 4.755266436345812e-06, "loss": 0.0035, "num_tokens": 4918208.0, "reward": 0.76495361328125, "reward_std": 0.007555033545941114, "rewards//mean": 0.76495361328125, "rewards//std": 0.017532316967844963, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.151, "grad_norm": 0.6622002124786377, "kl": 0.03432131186127663, "learning_rate": 4.754581316012785e-06, "loss": 0.0034, "num_tokens": 4924768.0, "reward": 0.7640380859375, "reward_std": 0.01022288203239441, "rewards//mean": 0.7640380859375, "rewards//std": 0.03617635741829872, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1512, "grad_norm": 0.6431364417076111, "kl": 0.04085060302168131, "learning_rate": 4.753895287534673e-06, "loss": 0.0041, "num_tokens": 4931248.0, "reward": 0.839111328125, "reward_std": 0.012506959959864616, "rewards//mean": 0.839111328125, "rewards//std": 0.023703385144472122, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1514, "grad_norm": 0.6833599209785461, "kl": 0.038734698202461004, "learning_rate": 4.753208351187809e-06, "loss": 0.0039, "num_tokens": 4937744.0, "reward": 0.82275390625, "reward_std": 0.018028873950242996, "rewards//mean": 0.82275390625, "rewards//std": 0.034114301204681396, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1516, "grad_norm": 0.731910228729248, "kl": 0.044021994806826115, "learning_rate": 4.75252050724889e-06, "loss": 0.0044, "num_tokens": 4944224.0, "reward": 0.84613037109375, "reward_std": 0.010319128632545471, "rewards//mean": 0.84613037109375, "rewards//std": 0.023381808772683144, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1518, "grad_norm": 0.7127944827079773, "kl": 0.030607926659286022, "learning_rate": 4.751831755994981e-06, "loss": 0.0031, "num_tokens": 4950800.0, "reward": 0.84515380859375, "reward_std": 0.01648303121328354, "rewards//mean": 0.84515380859375, "rewards//std": 0.03060285560786724, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.152, "grad_norm": 0.5836415886878967, "kl": 0.037573390174657106, "learning_rate": 4.75114209770351e-06, "loss": 0.0038, "num_tokens": 4957328.0, "reward": 0.8306884765625, "reward_std": 0.01186058297753334, "rewards//mean": 0.8306884765625, "rewards//std": 0.023043762892484665, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1522, "grad_norm": 0.727539598941803, "kl": 0.038794671185314655, "learning_rate": 4.75045153265227e-06, "loss": 0.0039, "num_tokens": 4963840.0, "reward": 0.8233642578125, "reward_std": 0.010286824777722359, "rewards//mean": 0.8233642578125, "rewards//std": 0.02826681174337864, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1524, "grad_norm": 0.6977169513702393, "kl": 0.04045017482712865, "learning_rate": 4.749760061119423e-06, "loss": 0.004, "num_tokens": 4970328.0, "reward": 0.83831787109375, "reward_std": 0.011999544687569141, "rewards//mean": 0.83831787109375, "rewards//std": 0.016227491199970245, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1526, "grad_norm": 0.7427241206169128, "kl": 0.03671771287918091, "learning_rate": 4.749067683383491e-06, "loss": 0.0037, "num_tokens": 4976840.0, "reward": 0.79437255859375, "reward_std": 0.012813002802431583, "rewards//mean": 0.79437255859375, "rewards//std": 0.020884044468402863, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1528, "grad_norm": 0.6940443515777588, "kl": 0.035541698802262545, "learning_rate": 4.748374399723366e-06, "loss": 0.0036, "num_tokens": 4983216.0, "reward": 0.7872314453125, "reward_std": 0.011262375861406326, "rewards//mean": 0.7872314453125, "rewards//std": 0.02338545024394989, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.153, "grad_norm": 0.6882827877998352, "kl": 0.041853323113173246, "learning_rate": 4.747680210418302e-06, "loss": 0.0042, "num_tokens": 4989792.0, "reward": 0.83917236328125, "reward_std": 0.014110451564192772, "rewards//mean": 0.83917236328125, "rewards//std": 0.03160110116004944, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1532, "grad_norm": 0.6096701622009277, "kl": 0.037586374673992395, "learning_rate": 4.746985115747918e-06, "loss": 0.0038, "num_tokens": 4996456.0, "reward": 0.8651123046875, "reward_std": 0.010422743856906891, "rewards//mean": 0.8651123046875, "rewards//std": 0.02676604874432087, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1534, "grad_norm": 0.6226118206977844, "kl": 0.03840547287836671, "learning_rate": 4.746289115992198e-06, "loss": 0.0038, "num_tokens": 5002968.0, "reward": 0.8330078125, "reward_std": 0.011964459903538227, "rewards//mean": 0.8330078125, "rewards//std": 0.02288324572145939, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1536, "grad_norm": 0.5931572318077087, "kl": 0.03918908000923693, "learning_rate": 4.74559221143149e-06, "loss": 0.0039, "num_tokens": 5009488.0, "reward": 0.886474609375, "reward_std": 0.013542715460062027, "rewards//mean": 0.886474609375, "rewards//std": 0.021775512024760246, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1538, "grad_norm": 0.6926736831665039, "kl": 0.0409997534006834, "learning_rate": 4.744894402346508e-06, "loss": 0.0041, "num_tokens": 5016080.0, "reward": 0.86126708984375, "reward_std": 0.01152787171304226, "rewards//mean": 0.86126708984375, "rewards//std": 0.017427530139684677, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.154, "grad_norm": 0.6112828254699707, "kl": 0.03526647645048797, "learning_rate": 4.744195689018331e-06, "loss": 0.0035, "num_tokens": 5022568.0, "reward": 0.8729248046875, "reward_std": 0.013185831718146801, "rewards//mean": 0.8729248046875, "rewards//std": 0.03000018745660782, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1542, "grad_norm": 0.7000649571418762, "kl": 0.03744476521387696, "learning_rate": 4.743496071728396e-06, "loss": 0.0037, "num_tokens": 5029120.0, "reward": 0.828369140625, "reward_std": 0.009715208783745766, "rewards//mean": 0.828369140625, "rewards//std": 0.021663999184966087, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1544, "grad_norm": 0.7032372951507568, "kl": 0.04153305198997259, "learning_rate": 4.742795550758514e-06, "loss": 0.0042, "num_tokens": 5035608.0, "reward": 0.83880615234375, "reward_std": 0.014286772347986698, "rewards//mean": 0.83880615234375, "rewards//std": 0.02569752372801304, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1546, "grad_norm": 0.6795111298561096, "kl": 0.04108638036996126, "learning_rate": 4.742094126390851e-06, "loss": 0.0041, "num_tokens": 5042016.0, "reward": 0.81842041015625, "reward_std": 0.010245722718536854, "rewards//mean": 0.81842041015625, "rewards//std": 0.018109088763594627, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1548, "grad_norm": 0.6258853077888489, "kl": 0.03198192222043872, "learning_rate": 4.7413917989079415e-06, "loss": 0.0032, "num_tokens": 5048688.0, "reward": 0.85015869140625, "reward_std": 0.014012346044182777, "rewards//mean": 0.85015869140625, "rewards//std": 0.02717375010251999, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.155, "grad_norm": 0.6283664703369141, "kl": 0.0385423768311739, "learning_rate": 4.740688568592685e-06, "loss": 0.0039, "num_tokens": 5055208.0, "reward": 0.83642578125, "reward_std": 0.007748948875814676, "rewards//mean": 0.83642578125, "rewards//std": 0.016955677419900894, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1552, "grad_norm": 0.7539633512496948, "kl": 0.044441851787269115, "learning_rate": 4.73998443572834e-06, "loss": 0.0044, "num_tokens": 5061736.0, "reward": 0.84857177734375, "reward_std": 0.013371797278523445, "rewards//mean": 0.84857177734375, "rewards//std": 0.02089998498558998, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1554, "grad_norm": 0.6789582967758179, "kl": 0.0363217368721962, "learning_rate": 4.7392794005985324e-06, "loss": 0.0036, "num_tokens": 5068240.0, "reward": 0.85369873046875, "reward_std": 0.01439887098968029, "rewards//mean": 0.85369873046875, "rewards//std": 0.03139637038111687, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1556, "grad_norm": 0.6625312566757202, "kl": 0.0394647023640573, "learning_rate": 4.7385734634872504e-06, "loss": 0.0039, "num_tokens": 5074752.0, "reward": 0.8798828125, "reward_std": 0.009248141199350357, "rewards//mean": 0.8798828125, "rewards//std": 0.019722526893019676, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1558, "grad_norm": 0.6864038109779358, "kl": 0.03931661881506443, "learning_rate": 4.7378666246788444e-06, "loss": 0.0039, "num_tokens": 5081280.0, "reward": 0.82672119140625, "reward_std": 0.010209892876446247, "rewards//mean": 0.82672119140625, "rewards//std": 0.021703006699681282, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.156, "grad_norm": 0.7165336608886719, "kl": 0.045301258796826005, "learning_rate": 4.73715888445803e-06, "loss": 0.0045, "num_tokens": 5087808.0, "reward": 0.78179931640625, "reward_std": 0.008207255974411964, "rewards//mean": 0.78179931640625, "rewards//std": 0.011643425561487675, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1562, "grad_norm": 0.6524578332901001, "kl": 0.032820957247167826, "learning_rate": 4.736450243109885e-06, "loss": 0.0033, "num_tokens": 5094328.0, "reward": 0.8231201171875, "reward_std": 0.011289829388260841, "rewards//mean": 0.8231201171875, "rewards//std": 0.02681577205657959, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1564, "grad_norm": 0.6669304370880127, "kl": 0.0449559367261827, "learning_rate": 4.735740700919848e-06, "loss": 0.0045, "num_tokens": 5100752.0, "reward": 0.8516845703125, "reward_std": 0.011253753677010536, "rewards//mean": 0.8516845703125, "rewards//std": 0.024655919522047043, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1566, "grad_norm": 0.6886339783668518, "kl": 0.045894182519987226, "learning_rate": 4.7350302581737255e-06, "loss": 0.0046, "num_tokens": 5107296.0, "reward": 0.84368896484375, "reward_std": 0.010614918544888496, "rewards//mean": 0.84368896484375, "rewards//std": 0.026406412944197655, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1568, "grad_norm": 0.6456897854804993, "kl": 0.04228213196620345, "learning_rate": 4.734318915157682e-06, "loss": 0.0042, "num_tokens": 5113960.0, "reward": 0.8406982421875, "reward_std": 0.014344006776809692, "rewards//mean": 0.8406982421875, "rewards//std": 0.03267809748649597, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.157, "grad_norm": 0.6308059692382812, "kl": 0.03701934986747801, "learning_rate": 4.7336066721582464e-06, "loss": 0.0037, "num_tokens": 5120456.0, "reward": 0.822265625, "reward_std": 0.013070710003376007, "rewards//mean": 0.822265625, "rewards//std": 0.02879762277007103, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1572, "grad_norm": 0.6404659152030945, "kl": 0.040786292403936386, "learning_rate": 4.73289352946231e-06, "loss": 0.0041, "num_tokens": 5126968.0, "reward": 0.85162353515625, "reward_std": 0.011420845054090023, "rewards//mean": 0.85162353515625, "rewards//std": 0.02786138653755188, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1574, "grad_norm": 0.6793121099472046, "kl": 0.046549074817448854, "learning_rate": 4.732179487357127e-06, "loss": 0.0047, "num_tokens": 5133368.0, "reward": 0.85272216796875, "reward_std": 0.014255034737288952, "rewards//mean": 0.85272216796875, "rewards//std": 0.018813444301486015, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1576, "grad_norm": 0.6696339249610901, "kl": 0.04117098404094577, "learning_rate": 4.731464546130315e-06, "loss": 0.0041, "num_tokens": 5139944.0, "reward": 0.78778076171875, "reward_std": 0.011665645986795425, "rewards//mean": 0.78778076171875, "rewards//std": 0.02138470858335495, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1578, "grad_norm": 0.6691408753395081, "kl": 0.036057604011148214, "learning_rate": 4.730748706069849e-06, "loss": 0.0036, "num_tokens": 5146472.0, "reward": 0.80499267578125, "reward_std": 0.011604068800807, "rewards//mean": 0.80499267578125, "rewards//std": 0.030689295381307602, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.158, "grad_norm": 0.6431038975715637, "kl": 0.03318591718561947, "learning_rate": 4.730031967464071e-06, "loss": 0.0033, "num_tokens": 5153088.0, "reward": 0.8427734375, "reward_std": 0.011495914310216904, "rewards//mean": 0.8427734375, "rewards//std": 0.014894897118210793, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1582, "grad_norm": 0.6348263621330261, "kl": 0.037593565415591, "learning_rate": 4.729314330601684e-06, "loss": 0.0038, "num_tokens": 5159672.0, "reward": 0.81695556640625, "reward_std": 0.01404502335935831, "rewards//mean": 0.81695556640625, "rewards//std": 0.01725819706916809, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1584, "grad_norm": 0.6973318457603455, "kl": 0.041428486816585064, "learning_rate": 4.72859579577175e-06, "loss": 0.0041, "num_tokens": 5166144.0, "reward": 0.7879638671875, "reward_std": 0.012339320033788681, "rewards//mean": 0.7879638671875, "rewards//std": 0.017778070643544197, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1586, "grad_norm": 0.6690841913223267, "kl": 0.043853968381881714, "learning_rate": 4.7278763632636974e-06, "loss": 0.0044, "num_tokens": 5172720.0, "reward": 0.8472900390625, "reward_std": 0.014657915569841862, "rewards//mean": 0.8472900390625, "rewards//std": 0.02887086011469364, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1588, "grad_norm": 0.6164650321006775, "kl": 0.04399008536711335, "learning_rate": 4.727156033367312e-06, "loss": 0.0044, "num_tokens": 5179160.0, "reward": 0.85150146484375, "reward_std": 0.012938274070620537, "rewards//mean": 0.85150146484375, "rewards//std": 0.01753145270049572, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.159, "grad_norm": 0.6124863624572754, "kl": 0.03883708012290299, "learning_rate": 4.7264348063727415e-06, "loss": 0.0039, "num_tokens": 5185632.0, "reward": 0.84783935546875, "reward_std": 0.008572231978178024, "rewards//mean": 0.84783935546875, "rewards//std": 0.01818166859447956, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1592, "grad_norm": 0.6611150503158569, "kl": 0.036757583264261484, "learning_rate": 4.725712682570498e-06, "loss": 0.0037, "num_tokens": 5192240.0, "reward": 0.7779541015625, "reward_std": 0.01289368700236082, "rewards//mean": 0.7779541015625, "rewards//std": 0.02997797727584839, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1594, "grad_norm": 0.5918968319892883, "kl": 0.04113738890737295, "learning_rate": 4.724989662251452e-06, "loss": 0.0041, "num_tokens": 5198688.0, "reward": 0.80126953125, "reward_std": 0.012578248977661133, "rewards//mean": 0.80126953125, "rewards//std": 0.03841540217399597, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1596, "grad_norm": 0.5858060717582703, "kl": 0.03528264374472201, "learning_rate": 4.724265745706837e-06, "loss": 0.0035, "num_tokens": 5205104.0, "reward": 0.85760498046875, "reward_std": 0.013114457949995995, "rewards//mean": 0.85760498046875, "rewards//std": 0.03279987350106239, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1598, "grad_norm": 0.6399039030075073, "kl": 0.03707644832320511, "learning_rate": 4.723540933228245e-06, "loss": 0.0037, "num_tokens": 5211616.0, "reward": 0.85491943359375, "reward_std": 0.011643383651971817, "rewards//mean": 0.85491943359375, "rewards//std": 0.02243286743760109, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.16, "grad_norm": 0.7004696726799011, "kl": 0.038236090214923024, "learning_rate": 4.7228152251076295e-06, "loss": 0.0038, "num_tokens": 5218080.0, "reward": 0.85076904296875, "reward_std": 0.01267247460782528, "rewards//mean": 0.85076904296875, "rewards//std": 0.018986446782946587, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1602, "grad_norm": 0.6958453059196472, "kl": 0.035821242490783334, "learning_rate": 4.7220886216373095e-06, "loss": 0.0036, "num_tokens": 5224624.0, "reward": 0.8458251953125, "reward_std": 0.012326447293162346, "rewards//mean": 0.8458251953125, "rewards//std": 0.0214402936398983, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1604, "grad_norm": 0.6648064851760864, "kl": 0.03554333420470357, "learning_rate": 4.7213611231099575e-06, "loss": 0.0036, "num_tokens": 5231176.0, "reward": 0.8375244140625, "reward_std": 0.015521214343607426, "rewards//mean": 0.8375244140625, "rewards//std": 0.0276188924908638, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1606, "grad_norm": 0.694705605506897, "kl": 0.04491404443979263, "learning_rate": 4.7206327298186105e-06, "loss": 0.0045, "num_tokens": 5237688.0, "reward": 0.85906982421875, "reward_std": 0.011747853830456734, "rewards//mean": 0.85906982421875, "rewards//std": 0.017038391903042793, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1608, "grad_norm": 0.700727641582489, "kl": 0.03822877514176071, "learning_rate": 4.7199034420566656e-06, "loss": 0.0038, "num_tokens": 5244160.0, "reward": 0.86480712890625, "reward_std": 0.012476685456931591, "rewards//mean": 0.86480712890625, "rewards//std": 0.028035791590809822, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.161, "grad_norm": 0.6538044214248657, "kl": 0.036812544567510486, "learning_rate": 4.7191732601178795e-06, "loss": 0.0037, "num_tokens": 5250784.0, "reward": 0.81585693359375, "reward_std": 0.013583207502961159, "rewards//mean": 0.81585693359375, "rewards//std": 0.023019032552838326, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1612, "grad_norm": 0.6818084716796875, "kl": 0.04158745566383004, "learning_rate": 4.71844218429637e-06, "loss": 0.0042, "num_tokens": 5257312.0, "reward": 0.86090087890625, "reward_std": 0.012027601711452007, "rewards//mean": 0.86090087890625, "rewards//std": 0.0386737696826458, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1614, "grad_norm": 0.6377874612808228, "kl": 0.03743350366130471, "learning_rate": 4.717710214886614e-06, "loss": 0.0037, "num_tokens": 5263848.0, "reward": 0.84771728515625, "reward_std": 0.01110083982348442, "rewards//mean": 0.84771728515625, "rewards//std": 0.03908414766192436, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1616, "grad_norm": 0.6489036679267883, "kl": 0.04202882153913379, "learning_rate": 4.716977352183449e-06, "loss": 0.0042, "num_tokens": 5270336.0, "reward": 0.87298583984375, "reward_std": 0.011964542791247368, "rewards//mean": 0.87298583984375, "rewards//std": 0.04070214927196503, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1618, "grad_norm": 0.6986774206161499, "kl": 0.04818084090948105, "learning_rate": 4.716243596482071e-06, "loss": 0.0048, "num_tokens": 5276920.0, "reward": 0.855712890625, "reward_std": 0.014097800478339195, "rewards//mean": 0.855712890625, "rewards//std": 0.04160777106881142, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.162, "grad_norm": 0.7664393782615662, "kl": 0.03928353264927864, "learning_rate": 4.715508948078037e-06, "loss": 0.0039, "num_tokens": 5283392.0, "reward": 0.865234375, "reward_std": 0.016030853614211082, "rewards//mean": 0.865234375, "rewards//std": 0.042117632925510406, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1622, "grad_norm": 0.6129316091537476, "kl": 0.04093162016943097, "learning_rate": 4.714773407267264e-06, "loss": 0.0041, "num_tokens": 5289944.0, "reward": 0.8447265625, "reward_std": 0.012979458086192608, "rewards//mean": 0.8447265625, "rewards//std": 0.024140017107129097, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1624, "grad_norm": 0.6467927694320679, "kl": 0.04427446564659476, "learning_rate": 4.714036974346028e-06, "loss": 0.0044, "num_tokens": 5296464.0, "reward": 0.82745361328125, "reward_std": 0.014607482589781284, "rewards//mean": 0.82745361328125, "rewards//std": 0.034782618284225464, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1626, "grad_norm": 0.7141029238700867, "kl": 0.03883550688624382, "learning_rate": 4.7132996496109625e-06, "loss": 0.0039, "num_tokens": 5303040.0, "reward": 0.80206298828125, "reward_std": 0.010535812005400658, "rewards//mean": 0.80206298828125, "rewards//std": 0.017134075984358788, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1628, "grad_norm": 0.6874210834503174, "kl": 0.03825618769042194, "learning_rate": 4.712561433359064e-06, "loss": 0.0038, "num_tokens": 5309568.0, "reward": 0.84271240234375, "reward_std": 0.011939950287342072, "rewards//mean": 0.84271240234375, "rewards//std": 0.024391964077949524, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.163, "grad_norm": 0.6792731285095215, "kl": 0.047371129505336285, "learning_rate": 4.7118223258876845e-06, "loss": 0.0047, "num_tokens": 5316040.0, "reward": 0.79132080078125, "reward_std": 0.011203013360500336, "rewards//mean": 0.79132080078125, "rewards//std": 0.01953502930700779, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1632, "grad_norm": 0.606471598148346, "kl": 0.03948652045801282, "learning_rate": 4.711082327494536e-06, "loss": 0.0039, "num_tokens": 5322584.0, "reward": 0.84771728515625, "reward_std": 0.014606360346078873, "rewards//mean": 0.84771728515625, "rewards//std": 0.02167089842259884, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1634, "grad_norm": 0.6929451823234558, "kl": 0.04543322464451194, "learning_rate": 4.710341438477691e-06, "loss": 0.0045, "num_tokens": 5329112.0, "reward": 0.82305908203125, "reward_std": 0.010059753432869911, "rewards//mean": 0.82305908203125, "rewards//std": 0.024692347273230553, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1636, "grad_norm": 0.6723214387893677, "kl": 0.04372272826731205, "learning_rate": 4.709599659135579e-06, "loss": 0.0044, "num_tokens": 5335608.0, "reward": 0.8341064453125, "reward_std": 0.013317547738552094, "rewards//mean": 0.8341064453125, "rewards//std": 0.024159763008356094, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1638, "grad_norm": 0.6974331140518188, "kl": 0.05139124137349427, "learning_rate": 4.708856989766988e-06, "loss": 0.0051, "num_tokens": 5342152.0, "reward": 0.82366943359375, "reward_std": 0.011363249272108078, "rewards//mean": 0.82366943359375, "rewards//std": 0.02652880735695362, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.164, "grad_norm": 0.7144467234611511, "kl": 0.04704140080139041, "learning_rate": 4.708113430671066e-06, "loss": 0.0047, "num_tokens": 5348688.0, "reward": 0.82305908203125, "reward_std": 0.011271877214312553, "rewards//mean": 0.82305908203125, "rewards//std": 0.01991415023803711, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1642, "grad_norm": 0.6751676797866821, "kl": 0.034764718497172, "learning_rate": 4.707368982147318e-06, "loss": 0.0035, "num_tokens": 5355232.0, "reward": 0.8125, "reward_std": 0.010452823713421822, "rewards//mean": 0.8125, "rewards//std": 0.022423502057790756, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1644, "grad_norm": 0.7800047397613525, "kl": 0.04225182137452066, "learning_rate": 4.706623644495608e-06, "loss": 0.0042, "num_tokens": 5361824.0, "reward": 0.86041259765625, "reward_std": 0.00938594713807106, "rewards//mean": 0.86041259765625, "rewards//std": 0.027074409648776054, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1646, "grad_norm": 0.6850138902664185, "kl": 0.042775253765285015, "learning_rate": 4.705877418016157e-06, "loss": 0.0043, "num_tokens": 5368328.0, "reward": 0.83941650390625, "reward_std": 0.011559306643903255, "rewards//mean": 0.83941650390625, "rewards//std": 0.021588314324617386, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1648, "grad_norm": 0.6577291488647461, "kl": 0.04705011798068881, "learning_rate": 4.705130303009547e-06, "loss": 0.0047, "num_tokens": 5374976.0, "reward": 0.843994140625, "reward_std": 0.015368317253887653, "rewards//mean": 0.843994140625, "rewards//std": 0.03274773806333542, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.165, "grad_norm": 0.6514129638671875, "kl": 0.034172447165474296, "learning_rate": 4.7043822997767145e-06, "loss": 0.0034, "num_tokens": 5381512.0, "reward": 0.85137939453125, "reward_std": 0.015748068690299988, "rewards//mean": 0.85137939453125, "rewards//std": 0.0229300819337368, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1652, "grad_norm": 0.7008969783782959, "kl": 0.0387201386038214, "learning_rate": 4.703633408618955e-06, "loss": 0.0039, "num_tokens": 5388152.0, "reward": 0.82940673828125, "reward_std": 0.010123923420906067, "rewards//mean": 0.82940673828125, "rewards//std": 0.024310529232025146, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1654, "grad_norm": 0.7166876792907715, "kl": 0.04297814145684242, "learning_rate": 4.702883629837922e-06, "loss": 0.0043, "num_tokens": 5394632.0, "reward": 0.8524169921875, "reward_std": 0.015698876231908798, "rewards//mean": 0.8524169921875, "rewards//std": 0.031592775136232376, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1656, "grad_norm": 0.7098883390426636, "kl": 0.04341789707541466, "learning_rate": 4.7021329637356274e-06, "loss": 0.0043, "num_tokens": 5401120.0, "reward": 0.8165283203125, "reward_std": 0.014469243586063385, "rewards//mean": 0.8165283203125, "rewards//std": 0.030440986156463623, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1658, "grad_norm": 0.677291750907898, "kl": 0.04045027890242636, "learning_rate": 4.701381410614437e-06, "loss": 0.004, "num_tokens": 5407688.0, "reward": 0.84991455078125, "reward_std": 0.01516575738787651, "rewards//mean": 0.84991455078125, "rewards//std": 0.031054111197590828, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.166, "grad_norm": 0.611182689666748, "kl": 0.03975608246400952, "learning_rate": 4.700628970777078e-06, "loss": 0.004, "num_tokens": 5414168.0, "reward": 0.87420654296875, "reward_std": 0.011042594909667969, "rewards//mean": 0.87420654296875, "rewards//std": 0.02186768874526024, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1662, "grad_norm": 0.6226016283035278, "kl": 0.03868942544795573, "learning_rate": 4.699875644526633e-06, "loss": 0.0039, "num_tokens": 5420696.0, "reward": 0.849853515625, "reward_std": 0.009386981837451458, "rewards//mean": 0.849853515625, "rewards//std": 0.018160944804549217, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1664, "grad_norm": 0.7501590847969055, "kl": 0.04054065514355898, "learning_rate": 4.699121432166542e-06, "loss": 0.0041, "num_tokens": 5427272.0, "reward": 0.86236572265625, "reward_std": 0.013934629037976265, "rewards//mean": 0.86236572265625, "rewards//std": 0.03134860470890999, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1666, "grad_norm": 0.6518362760543823, "kl": 0.03787135658785701, "learning_rate": 4.6983663340006e-06, "loss": 0.0038, "num_tokens": 5433880.0, "reward": 0.8211669921875, "reward_std": 0.010044453665614128, "rewards//mean": 0.8211669921875, "rewards//std": 0.024274777621030807, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1668, "grad_norm": 0.7006896734237671, "kl": 0.03864523209631443, "learning_rate": 4.697610350332962e-06, "loss": 0.0039, "num_tokens": 5440504.0, "reward": 0.87115478515625, "reward_std": 0.013994935899972916, "rewards//mean": 0.87115478515625, "rewards//std": 0.020862286910414696, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.167, "grad_norm": 0.7370618581771851, "kl": 0.043036503717303276, "learning_rate": 4.696853481468137e-06, "loss": 0.0043, "num_tokens": 5447056.0, "reward": 0.811279296875, "reward_std": 0.010254578664898872, "rewards//mean": 0.811279296875, "rewards//std": 0.016512421891093254, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1672, "grad_norm": 0.6868679523468018, "kl": 0.0437261825427413, "learning_rate": 4.6960957277109945e-06, "loss": 0.0044, "num_tokens": 5453480.0, "reward": 0.8250732421875, "reward_std": 0.01129057165235281, "rewards//mean": 0.8250732421875, "rewards//std": 0.018155528232455254, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1674, "grad_norm": 0.6761255860328674, "kl": 0.039087172131985426, "learning_rate": 4.695337089366754e-06, "loss": 0.0039, "num_tokens": 5460008.0, "reward": 0.85760498046875, "reward_std": 0.012311861850321293, "rewards//mean": 0.85760498046875, "rewards//std": 0.02614312246441841, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1676, "grad_norm": 0.6296420097351074, "kl": 0.04748300788924098, "learning_rate": 4.694577566740996e-06, "loss": 0.0047, "num_tokens": 5466456.0, "reward": 0.87274169921875, "reward_std": 0.014107296243309975, "rewards//mean": 0.87274169921875, "rewards//std": 0.03133362904191017, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1678, "grad_norm": 0.717560887336731, "kl": 0.04282214818522334, "learning_rate": 4.693817160139657e-06, "loss": 0.0043, "num_tokens": 5473096.0, "reward": 0.83001708984375, "reward_std": 0.012546509504318237, "rewards//mean": 0.83001708984375, "rewards//std": 0.025458427146077156, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.168, "grad_norm": 0.7495692372322083, "kl": 0.04018871276639402, "learning_rate": 4.693055869869029e-06, "loss": 0.004, "num_tokens": 5479584.0, "reward": 0.86773681640625, "reward_std": 0.015347221866250038, "rewards//mean": 0.86773681640625, "rewards//std": 0.03341394290328026, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1682, "grad_norm": 0.6433113813400269, "kl": 0.04005603678524494, "learning_rate": 4.692293696235758e-06, "loss": 0.004, "num_tokens": 5486064.0, "reward": 0.8238525390625, "reward_std": 0.012554389424622059, "rewards//mean": 0.8238525390625, "rewards//std": 0.022996416315436363, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1684, "grad_norm": 0.7169666886329651, "kl": 0.03929765592329204, "learning_rate": 4.6915306395468485e-06, "loss": 0.0039, "num_tokens": 5492592.0, "reward": 0.83978271484375, "reward_std": 0.0161487627774477, "rewards//mean": 0.83978271484375, "rewards//std": 0.026185357943177223, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1686, "grad_norm": 0.7075477838516235, "kl": 0.042687549255788326, "learning_rate": 4.690766700109659e-06, "loss": 0.0043, "num_tokens": 5499096.0, "reward": 0.85882568359375, "reward_std": 0.015180788934230804, "rewards//mean": 0.85882568359375, "rewards//std": 0.03275507688522339, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1688, "grad_norm": 0.6575949192047119, "kl": 0.04145486559718847, "learning_rate": 4.690001878231906e-06, "loss": 0.0041, "num_tokens": 5505616.0, "reward": 0.8748779296875, "reward_std": 0.014031785540282726, "rewards//mean": 0.8748779296875, "rewards//std": 0.019375229254364967, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.169, "grad_norm": 0.6833431124687195, "kl": 0.03976182406768203, "learning_rate": 4.689236174221658e-06, "loss": 0.004, "num_tokens": 5512120.0, "reward": 0.7947998046875, "reward_std": 0.01373874768614769, "rewards//mean": 0.7947998046875, "rewards//std": 0.020948907360434532, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1692, "grad_norm": 0.7554038166999817, "kl": 0.03467200044542551, "learning_rate": 4.688469588387339e-06, "loss": 0.0035, "num_tokens": 5518696.0, "reward": 0.87481689453125, "reward_std": 0.01231304369866848, "rewards//mean": 0.87481689453125, "rewards//std": 0.024154357612133026, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1694, "grad_norm": 0.7133338451385498, "kl": 0.044337095227092505, "learning_rate": 4.687702121037734e-06, "loss": 0.0044, "num_tokens": 5525248.0, "reward": 0.76873779296875, "reward_std": 0.010633913800120354, "rewards//mean": 0.76873779296875, "rewards//std": 0.03006640449166298, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1696, "grad_norm": 0.570580005645752, "kl": 0.039059948176145554, "learning_rate": 4.6869337724819745e-06, "loss": 0.0039, "num_tokens": 5531752.0, "reward": 0.8349609375, "reward_std": 0.008149969391524792, "rewards//mean": 0.8349609375, "rewards//std": 0.019212428480386734, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1698, "grad_norm": 0.7679130434989929, "kl": 0.045566384214907885, "learning_rate": 4.686164543029554e-06, "loss": 0.0046, "num_tokens": 5538288.0, "reward": 0.7852783203125, "reward_std": 0.011804823763668537, "rewards//mean": 0.7852783203125, "rewards//std": 0.020575610920786858, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.17, "grad_norm": 0.6468374729156494, "kl": 0.04972790856845677, "learning_rate": 4.685394432990316e-06, "loss": 0.005, "num_tokens": 5544752.0, "reward": 0.85662841796875, "reward_std": 0.010980328544974327, "rewards//mean": 0.85662841796875, "rewards//std": 0.03794073686003685, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1702, "grad_norm": 0.8263505101203918, "kl": 0.04581076675094664, "learning_rate": 4.684623442674463e-06, "loss": 0.0046, "num_tokens": 5551272.0, "reward": 0.8409423828125, "reward_std": 0.012282561510801315, "rewards//mean": 0.8409423828125, "rewards//std": 0.022003378719091415, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1704, "grad_norm": 0.6327822804450989, "kl": 0.04290678119286895, "learning_rate": 4.683851572392548e-06, "loss": 0.0043, "num_tokens": 5557872.0, "reward": 0.84039306640625, "reward_std": 0.010098406113684177, "rewards//mean": 0.84039306640625, "rewards//std": 0.01718788407742977, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1706, "grad_norm": 0.5984649062156677, "kl": 0.041823903331533074, "learning_rate": 4.68307882245548e-06, "loss": 0.0042, "num_tokens": 5564400.0, "reward": 0.8529052734375, "reward_std": 0.008947965689003468, "rewards//mean": 0.8529052734375, "rewards//std": 0.029349274933338165, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1708, "grad_norm": 0.6533376574516296, "kl": 0.04166531004011631, "learning_rate": 4.682305193174524e-06, "loss": 0.0042, "num_tokens": 5570936.0, "reward": 0.8236083984375, "reward_std": 0.00973258726298809, "rewards//mean": 0.8236083984375, "rewards//std": 0.011671505868434906, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.171, "grad_norm": 0.6973181366920471, "kl": 0.038289697375148535, "learning_rate": 4.681530684861298e-06, "loss": 0.0038, "num_tokens": 5577480.0, "reward": 0.79681396484375, "reward_std": 0.007634049281477928, "rewards//mean": 0.79681396484375, "rewards//std": 0.018681829795241356, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1712, "grad_norm": 0.6991540789604187, "kl": 0.04020833037793636, "learning_rate": 4.680755297827772e-06, "loss": 0.004, "num_tokens": 5583984.0, "reward": 0.82098388671875, "reward_std": 0.012616423889994621, "rewards//mean": 0.82098388671875, "rewards//std": 0.022867942228913307, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1714, "grad_norm": 0.6678091883659363, "kl": 0.03987215249799192, "learning_rate": 4.6799790323862735e-06, "loss": 0.004, "num_tokens": 5590536.0, "reward": 0.82659912109375, "reward_std": 0.014262435957789421, "rewards//mean": 0.82659912109375, "rewards//std": 0.033164750784635544, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1716, "grad_norm": 0.6583226919174194, "kl": 0.04954609926789999, "learning_rate": 4.679201888849481e-06, "loss": 0.005, "num_tokens": 5596984.0, "reward": 0.85125732421875, "reward_std": 0.018312999978661537, "rewards//mean": 0.85125732421875, "rewards//std": 0.034297019243240356, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1718, "grad_norm": 0.6026672720909119, "kl": 0.035488207591697574, "learning_rate": 4.678423867530428e-06, "loss": 0.0035, "num_tokens": 5603520.0, "reward": 0.85992431640625, "reward_std": 0.009927749633789062, "rewards//mean": 0.85992431640625, "rewards//std": 0.02918913960456848, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.172, "grad_norm": 0.779086709022522, "kl": 0.04589052964001894, "learning_rate": 4.677644968742503e-06, "loss": 0.0046, "num_tokens": 5610072.0, "reward": 0.84796142578125, "reward_std": 0.013327594846487045, "rewards//mean": 0.84796142578125, "rewards//std": 0.03296468406915665, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1722, "grad_norm": 0.7797392010688782, "kl": 0.03917522868141532, "learning_rate": 4.676865192799443e-06, "loss": 0.0039, "num_tokens": 5616520.0, "reward": 0.8319091796875, "reward_std": 0.015295634046196938, "rewards//mean": 0.8319091796875, "rewards//std": 0.031780049204826355, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1724, "grad_norm": 0.7025132179260254, "kl": 0.04026870639063418, "learning_rate": 4.676084540015345e-06, "loss": 0.004, "num_tokens": 5623024.0, "reward": 0.8492431640625, "reward_std": 0.011275040917098522, "rewards//mean": 0.8492431640625, "rewards//std": 0.015386302955448627, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1726, "grad_norm": 0.7985981106758118, "kl": 0.04054851154796779, "learning_rate": 4.675303010704654e-06, "loss": 0.0041, "num_tokens": 5629568.0, "reward": 0.8460693359375, "reward_std": 0.011221226304769516, "rewards//mean": 0.8460693359375, "rewards//std": 0.023945782333612442, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1728, "grad_norm": 0.7008482813835144, "kl": 0.04197323229163885, "learning_rate": 4.674520605182171e-06, "loss": 0.0042, "num_tokens": 5636168.0, "reward": 0.8560791015625, "reward_std": 0.01303877867758274, "rewards//mean": 0.8560791015625, "rewards//std": 0.03236343339085579, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.173, "grad_norm": 0.660435676574707, "kl": 0.038663000566884875, "learning_rate": 4.673737323763048e-06, "loss": 0.0039, "num_tokens": 5642648.0, "reward": 0.86883544921875, "reward_std": 0.009736208245158195, "rewards//mean": 0.86883544921875, "rewards//std": 0.020384449511766434, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1732, "grad_norm": 0.6624974608421326, "kl": 0.043648229679092765, "learning_rate": 4.672953166762791e-06, "loss": 0.0044, "num_tokens": 5649224.0, "reward": 0.8408203125, "reward_std": 0.013061913661658764, "rewards//mean": 0.8408203125, "rewards//std": 0.02628237009048462, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1734, "grad_norm": 0.6637141704559326, "kl": 0.04361239541321993, "learning_rate": 4.672168134497258e-06, "loss": 0.0044, "num_tokens": 5655744.0, "reward": 0.8326416015625, "reward_std": 0.01376126054674387, "rewards//mean": 0.8326416015625, "rewards//std": 0.02711889147758484, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1736, "grad_norm": 0.7084535956382751, "kl": 0.04428035207092762, "learning_rate": 4.671382227282661e-06, "loss": 0.0044, "num_tokens": 5662272.0, "reward": 0.88690185546875, "reward_std": 0.018743984401226044, "rewards//mean": 0.88690185546875, "rewards//std": 0.03211710602045059, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1738, "grad_norm": 0.6958263516426086, "kl": 0.038506853859871626, "learning_rate": 4.670595445435561e-06, "loss": 0.0039, "num_tokens": 5668808.0, "reward": 0.853515625, "reward_std": 0.014965204522013664, "rewards//mean": 0.853515625, "rewards//std": 0.03174978867173195, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.174, "grad_norm": 0.6819318532943726, "kl": 0.042381568579003215, "learning_rate": 4.669807789272877e-06, "loss": 0.0042, "num_tokens": 5675336.0, "reward": 0.839111328125, "reward_std": 0.014586934819817543, "rewards//mean": 0.839111328125, "rewards//std": 0.04526562616229057, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1742, "grad_norm": 0.618557333946228, "kl": 0.04537222860381007, "learning_rate": 4.669019259111873e-06, "loss": 0.0045, "num_tokens": 5681856.0, "reward": 0.83349609375, "reward_std": 0.011118524707853794, "rewards//mean": 0.83349609375, "rewards//std": 0.023046717047691345, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1744, "grad_norm": 0.6631488800048828, "kl": 0.04104706086218357, "learning_rate": 4.668229855270172e-06, "loss": 0.0041, "num_tokens": 5688536.0, "reward": 0.8538818359375, "reward_std": 0.013846810907125473, "rewards//mean": 0.8538818359375, "rewards//std": 0.02200612984597683, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1746, "grad_norm": 0.5976673364639282, "kl": 0.04125231131911278, "learning_rate": 4.667439578065745e-06, "loss": 0.0041, "num_tokens": 5695080.0, "reward": 0.83306884765625, "reward_std": 0.01117327157407999, "rewards//mean": 0.83306884765625, "rewards//std": 0.02727050706744194, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1748, "grad_norm": 0.6268535256385803, "kl": 0.037555265706032515, "learning_rate": 4.666648427816914e-06, "loss": 0.0038, "num_tokens": 5701576.0, "reward": 0.85052490234375, "reward_std": 0.00950030330568552, "rewards//mean": 0.85052490234375, "rewards//std": 0.018535403534770012, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.175, "grad_norm": 0.677286684513092, "kl": 0.03843085980042815, "learning_rate": 4.665856404842356e-06, "loss": 0.0038, "num_tokens": 5708056.0, "reward": 0.86199951171875, "reward_std": 0.017461739480495453, "rewards//mean": 0.86199951171875, "rewards//std": 0.03396883234381676, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1752, "grad_norm": 0.5880584120750427, "kl": 0.03914790088310838, "learning_rate": 4.665063509461098e-06, "loss": 0.0039, "num_tokens": 5714608.0, "reward": 0.84222412109375, "reward_std": 0.009593464434146881, "rewards//mean": 0.84222412109375, "rewards//std": 0.024935755878686905, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1754, "grad_norm": 0.6908532977104187, "kl": 0.04660891415551305, "learning_rate": 4.664269741992516e-06, "loss": 0.0047, "num_tokens": 5721080.0, "reward": 0.8328857421875, "reward_std": 0.01860264502465725, "rewards//mean": 0.8328857421875, "rewards//std": 0.030949924141168594, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1756, "grad_norm": 0.656211256980896, "kl": 0.04456622153520584, "learning_rate": 4.663475102756341e-06, "loss": 0.0045, "num_tokens": 5727464.0, "reward": 0.86834716796875, "reward_std": 0.011676906608045101, "rewards//mean": 0.86834716796875, "rewards//std": 0.024166887626051903, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1758, "grad_norm": 0.6266794800758362, "kl": 0.043249581241980195, "learning_rate": 4.662679592072653e-06, "loss": 0.0043, "num_tokens": 5733976.0, "reward": 0.8218994140625, "reward_std": 0.009951181709766388, "rewards//mean": 0.8218994140625, "rewards//std": 0.017493095248937607, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.176, "grad_norm": 0.6799640655517578, "kl": 0.03653007443062961, "learning_rate": 4.661883210261884e-06, "loss": 0.0037, "num_tokens": 5740520.0, "reward": 0.8427734375, "reward_std": 0.011458927765488625, "rewards//mean": 0.8427734375, "rewards//std": 0.02008756436407566, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1762, "grad_norm": 0.6607202887535095, "kl": 0.047988517209887505, "learning_rate": 4.661085957644817e-06, "loss": 0.0048, "num_tokens": 5747056.0, "reward": 0.84716796875, "reward_std": 0.017563801258802414, "rewards//mean": 0.84716796875, "rewards//std": 0.03399338945746422, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1764, "grad_norm": 0.761650562286377, "kl": 0.044383881613612175, "learning_rate": 4.660287834542585e-06, "loss": 0.0044, "num_tokens": 5753488.0, "reward": 0.838623046875, "reward_std": 0.011798453517258167, "rewards//mean": 0.838623046875, "rewards//std": 0.021719828248023987, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1766, "grad_norm": 0.6888118982315063, "kl": 0.04098816681653261, "learning_rate": 4.659488841276671e-06, "loss": 0.0041, "num_tokens": 5760008.0, "reward": 0.8052978515625, "reward_std": 0.00954475812613964, "rewards//mean": 0.8052978515625, "rewards//std": 0.02581954002380371, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1768, "grad_norm": 0.7232738137245178, "kl": 0.03795227757655084, "learning_rate": 4.65868897816891e-06, "loss": 0.0038, "num_tokens": 5766448.0, "reward": 0.82147216796875, "reward_std": 0.00959029421210289, "rewards//mean": 0.82147216796875, "rewards//std": 0.021806685253977776, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.177, "grad_norm": 0.6652290225028992, "kl": 0.042733343318104744, "learning_rate": 4.6578882455414865e-06, "loss": 0.0043, "num_tokens": 5773016.0, "reward": 0.80224609375, "reward_std": 0.011170993559062481, "rewards//mean": 0.80224609375, "rewards//std": 0.01766917295753956, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1772, "grad_norm": 0.6574732661247253, "kl": 0.0360676443669945, "learning_rate": 4.657086643716937e-06, "loss": 0.0036, "num_tokens": 5779536.0, "reward": 0.82135009765625, "reward_std": 0.011647794395685196, "rewards//mean": 0.82135009765625, "rewards//std": 0.027937347069382668, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1774, "grad_norm": 0.6396461725234985, "kl": 0.04124319599941373, "learning_rate": 4.656284173018144e-06, "loss": 0.0041, "num_tokens": 5785952.0, "reward": 0.86334228515625, "reward_std": 0.012334790080785751, "rewards//mean": 0.86334228515625, "rewards//std": 0.025793951004743576, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1776, "grad_norm": 0.6651362180709839, "kl": 0.041016052244231105, "learning_rate": 4.655480833768344e-06, "loss": 0.0041, "num_tokens": 5792528.0, "reward": 0.86566162109375, "reward_std": 0.010340459644794464, "rewards//mean": 0.86566162109375, "rewards//std": 0.03301836550235748, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1778, "grad_norm": 0.7577189803123474, "kl": 0.04309235932305455, "learning_rate": 4.654676626291123e-06, "loss": 0.0043, "num_tokens": 5799048.0, "reward": 0.85028076171875, "reward_std": 0.015380732715129852, "rewards//mean": 0.85028076171875, "rewards//std": 0.022989420220255852, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.178, "grad_norm": 0.5894119143486023, "kl": 0.04171917075291276, "learning_rate": 4.653871550910414e-06, "loss": 0.0042, "num_tokens": 5805568.0, "reward": 0.84588623046875, "reward_std": 0.012175725772976875, "rewards//mean": 0.84588623046875, "rewards//std": 0.027795566245913506, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1782, "grad_norm": 0.646531343460083, "kl": 0.043966994155198336, "learning_rate": 4.653065607950502e-06, "loss": 0.0044, "num_tokens": 5812064.0, "reward": 0.8343505859375, "reward_std": 0.012879867106676102, "rewards//mean": 0.8343505859375, "rewards//std": 0.022774165496230125, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1784, "grad_norm": 0.6579074859619141, "kl": 0.04005965869873762, "learning_rate": 4.65225879773602e-06, "loss": 0.004, "num_tokens": 5818600.0, "reward": 0.85028076171875, "reward_std": 0.01558544673025608, "rewards//mean": 0.85028076171875, "rewards//std": 0.030758274719119072, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1786, "grad_norm": 0.77456134557724, "kl": 0.03999159322120249, "learning_rate": 4.651451120591952e-06, "loss": 0.004, "num_tokens": 5825048.0, "reward": 0.83135986328125, "reward_std": 0.010922490619122982, "rewards//mean": 0.83135986328125, "rewards//std": 0.01836886815726757, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1788, "grad_norm": 0.6243206858634949, "kl": 0.037127970019355416, "learning_rate": 4.650642576843631e-06, "loss": 0.0037, "num_tokens": 5831704.0, "reward": 0.86248779296875, "reward_std": 0.011517677456140518, "rewards//mean": 0.86248779296875, "rewards//std": 0.028773892670869827, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.179, "grad_norm": 0.6566209197044373, "kl": 0.042215952184051275, "learning_rate": 4.649833166816736e-06, "loss": 0.0042, "num_tokens": 5838304.0, "reward": 0.86236572265625, "reward_std": 0.011435369029641151, "rewards//mean": 0.86236572265625, "rewards//std": 0.023661118000745773, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1792, "grad_norm": 0.7270216941833496, "kl": 0.04611260769888759, "learning_rate": 4.649022890837298e-06, "loss": 0.0046, "num_tokens": 5844856.0, "reward": 0.84814453125, "reward_std": 0.012920394539833069, "rewards//mean": 0.84814453125, "rewards//std": 0.020297471433877945, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1794, "grad_norm": 0.6820149421691895, "kl": 0.03484724368900061, "learning_rate": 4.648211749231698e-06, "loss": 0.0035, "num_tokens": 5851392.0, "reward": 0.83905029296875, "reward_std": 0.012019114568829536, "rewards//mean": 0.83905029296875, "rewards//std": 0.02499941736459732, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1796, "grad_norm": 0.668484091758728, "kl": 0.04221570072695613, "learning_rate": 4.6473997423266615e-06, "loss": 0.0042, "num_tokens": 5857856.0, "reward": 0.86260986328125, "reward_std": 0.015107907354831696, "rewards//mean": 0.86260986328125, "rewards//std": 0.019370052963495255, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1798, "grad_norm": 0.6678747534751892, "kl": 0.04167328402400017, "learning_rate": 4.646586870449266e-06, "loss": 0.0042, "num_tokens": 5864392.0, "reward": 0.8489990234375, "reward_std": 0.010178307071328163, "rewards//mean": 0.8489990234375, "rewards//std": 0.03089313581585884, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.18, "grad_norm": 0.6929616332054138, "kl": 0.039298949064686894, "learning_rate": 4.645773133926936e-06, "loss": 0.0039, "num_tokens": 5870864.0, "reward": 0.803955078125, "reward_std": 0.011242110282182693, "rewards//mean": 0.803955078125, "rewards//std": 0.018918653950095177, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1802, "grad_norm": 0.6821906566619873, "kl": 0.03505802736617625, "learning_rate": 4.644958533087443e-06, "loss": 0.0035, "num_tokens": 5877504.0, "reward": 0.86138916015625, "reward_std": 0.011615011841058731, "rewards//mean": 0.86138916015625, "rewards//std": 0.023958658799529076, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1804, "grad_norm": 0.675546407699585, "kl": 0.04049504571594298, "learning_rate": 4.64414306825891e-06, "loss": 0.004, "num_tokens": 5884008.0, "reward": 0.86346435546875, "reward_std": 0.017759256064891815, "rewards//mean": 0.86346435546875, "rewards//std": 0.03580428287386894, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1806, "grad_norm": 0.6656494140625, "kl": 0.04367375490255654, "learning_rate": 4.643326739769805e-06, "loss": 0.0044, "num_tokens": 5890592.0, "reward": 0.853515625, "reward_std": 0.011980659328401089, "rewards//mean": 0.853515625, "rewards//std": 0.025094378739595413, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1808, "grad_norm": 0.6163875460624695, "kl": 0.04099143436178565, "learning_rate": 4.642509547948947e-06, "loss": 0.0041, "num_tokens": 5897120.0, "reward": 0.8583984375, "reward_std": 0.015584884211421013, "rewards//mean": 0.8583984375, "rewards//std": 0.031024422496557236, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.181, "grad_norm": 0.6468968391418457, "kl": 0.04323425958864391, "learning_rate": 4.6416914931254984e-06, "loss": 0.0043, "num_tokens": 5903480.0, "reward": 0.8427734375, "reward_std": 0.010204439982771873, "rewards//mean": 0.8427734375, "rewards//std": 0.022053200751543045, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1812, "grad_norm": 0.6285822987556458, "kl": 0.042249388294294477, "learning_rate": 4.640872575628973e-06, "loss": 0.0042, "num_tokens": 5909928.0, "reward": 0.8438720703125, "reward_std": 0.014041738584637642, "rewards//mean": 0.8438720703125, "rewards//std": 0.026087839156389236, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1814, "grad_norm": 0.6541216373443604, "kl": 0.03615096234716475, "learning_rate": 4.6400527957892295e-06, "loss": 0.0036, "num_tokens": 5916424.0, "reward": 0.83563232421875, "reward_std": 0.011880407109856606, "rewards//mean": 0.83563232421875, "rewards//std": 0.022275084629654884, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1816, "grad_norm": 0.6855373382568359, "kl": 0.041469790041446686, "learning_rate": 4.639232153936476e-06, "loss": 0.0041, "num_tokens": 5922968.0, "reward": 0.79296875, "reward_std": 0.008580282330513, "rewards//mean": 0.79296875, "rewards//std": 0.01980830356478691, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1818, "grad_norm": 0.6281968355178833, "kl": 0.039629788370803, "learning_rate": 4.638410650401267e-06, "loss": 0.004, "num_tokens": 5929408.0, "reward": 0.8785400390625, "reward_std": 0.01470201276242733, "rewards//mean": 0.8785400390625, "rewards//std": 0.031278807669878006, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.182, "grad_norm": 0.7264119386672974, "kl": 0.046727875247597694, "learning_rate": 4.637588285514504e-06, "loss": 0.0047, "num_tokens": 5935848.0, "reward": 0.81976318359375, "reward_std": 0.00966187845915556, "rewards//mean": 0.81976318359375, "rewards//std": 0.020947733893990517, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1822, "grad_norm": 0.6796708703041077, "kl": 0.04055432416498661, "learning_rate": 4.636765059607434e-06, "loss": 0.0041, "num_tokens": 5942368.0, "reward": 0.85064697265625, "reward_std": 0.014742225408554077, "rewards//mean": 0.85064697265625, "rewards//std": 0.0238548144698143, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1824, "grad_norm": 0.5940519571304321, "kl": 0.04338231682777405, "learning_rate": 4.6359409730116546e-06, "loss": 0.0043, "num_tokens": 5948928.0, "reward": 0.83978271484375, "reward_std": 0.010560210794210434, "rewards//mean": 0.83978271484375, "rewards//std": 0.01866886019706726, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1826, "grad_norm": 0.715964674949646, "kl": 0.053954784758388996, "learning_rate": 4.635116026059107e-06, "loss": 0.0054, "num_tokens": 5955528.0, "reward": 0.82318115234375, "reward_std": 0.014161437749862671, "rewards//mean": 0.82318115234375, "rewards//std": 0.022154470905661583, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1828, "grad_norm": 0.8227600455284119, "kl": 0.04878250975161791, "learning_rate": 4.634290219082078e-06, "loss": 0.0049, "num_tokens": 5962024.0, "reward": 0.84210205078125, "reward_std": 0.011272008530795574, "rewards//mean": 0.84210205078125, "rewards//std": 0.02768971212208271, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.183, "grad_norm": 0.6412074565887451, "kl": 0.036306142108514905, "learning_rate": 4.633463552413205e-06, "loss": 0.0036, "num_tokens": 5968568.0, "reward": 0.8790283203125, "reward_std": 0.008153083734214306, "rewards//mean": 0.8790283203125, "rewards//std": 0.024890564382076263, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1832, "grad_norm": 0.6921270489692688, "kl": 0.04746776120737195, "learning_rate": 4.632636026385468e-06, "loss": 0.0047, "num_tokens": 5975152.0, "reward": 0.86175537109375, "reward_std": 0.013499973341822624, "rewards//mean": 0.86175537109375, "rewards//std": 0.02922024019062519, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1834, "grad_norm": 0.6777154207229614, "kl": 0.043070978252217174, "learning_rate": 4.631807641332195e-06, "loss": 0.0043, "num_tokens": 5981696.0, "reward": 0.85870361328125, "reward_std": 0.01207971666008234, "rewards//mean": 0.85870361328125, "rewards//std": 0.030101627111434937, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1836, "grad_norm": 0.7225809097290039, "kl": 0.03947909642010927, "learning_rate": 4.630978397587058e-06, "loss": 0.0039, "num_tokens": 5988504.0, "reward": 0.85260009765625, "reward_std": 0.015898864716291428, "rewards//mean": 0.85260009765625, "rewards//std": 0.039611972868442535, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1838, "grad_norm": 0.6816273331642151, "kl": 0.043278159108012915, "learning_rate": 4.630148295484078e-06, "loss": 0.0043, "num_tokens": 5995032.0, "reward": 0.842041015625, "reward_std": 0.010010555386543274, "rewards//mean": 0.842041015625, "rewards//std": 0.023937247693538666, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.184, "grad_norm": 0.7362410426139832, "kl": 0.046471588080748916, "learning_rate": 4.62931733535762e-06, "loss": 0.0046, "num_tokens": 6001544.0, "reward": 0.87420654296875, "reward_std": 0.014177069067955017, "rewards//mean": 0.87420654296875, "rewards//std": 0.028630439192056656, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1842, "grad_norm": 0.6675663590431213, "kl": 0.03981461701914668, "learning_rate": 4.628485517542393e-06, "loss": 0.004, "num_tokens": 6008048.0, "reward": 0.8458251953125, "reward_std": 0.00875449925661087, "rewards//mean": 0.8458251953125, "rewards//std": 0.021909615024924278, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1844, "grad_norm": 0.781726062297821, "kl": 0.04557593772187829, "learning_rate": 4.627652842373454e-06, "loss": 0.0046, "num_tokens": 6014448.0, "reward": 0.8353271484375, "reward_std": 0.015135039575397968, "rewards//mean": 0.8353271484375, "rewards//std": 0.030355332419276237, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1846, "grad_norm": 1.0834941864013672, "kl": 0.04436720535159111, "learning_rate": 4.626819310186204e-06, "loss": 0.0044, "num_tokens": 6020960.0, "reward": 0.89306640625, "reward_std": 0.011011643335223198, "rewards//mean": 0.89306640625, "rewards//std": 0.01766917295753956, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1848, "grad_norm": 0.738615870475769, "kl": 0.04177575511857867, "learning_rate": 4.625984921316392e-06, "loss": 0.0042, "num_tokens": 6027448.0, "reward": 0.87786865234375, "reward_std": 0.012972611002624035, "rewards//mean": 0.87786865234375, "rewards//std": 0.017829405143857002, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.185, "grad_norm": 0.6873884201049805, "kl": 0.04864580603316426, "learning_rate": 4.625149676100107e-06, "loss": 0.0049, "num_tokens": 6034000.0, "reward": 0.847900390625, "reward_std": 0.012141989544034004, "rewards//mean": 0.847900390625, "rewards//std": 0.023866314440965652, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1852, "grad_norm": 0.7753527760505676, "kl": 0.042073047487065196, "learning_rate": 4.624313574873787e-06, "loss": 0.0042, "num_tokens": 6040528.0, "reward": 0.85040283203125, "reward_std": 0.013579750433564186, "rewards//mean": 0.85040283203125, "rewards//std": 0.03724491596221924, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1854, "grad_norm": 0.6963757276535034, "kl": 0.05082692578434944, "learning_rate": 4.623476617974212e-06, "loss": 0.0051, "num_tokens": 6047024.0, "reward": 0.7879638671875, "reward_std": 0.01067714486271143, "rewards//mean": 0.7879638671875, "rewards//std": 0.02702271193265915, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1856, "grad_norm": 0.6761671900749207, "kl": 0.0422394210472703, "learning_rate": 4.62263880573851e-06, "loss": 0.0042, "num_tokens": 6053544.0, "reward": 0.82110595703125, "reward_std": 0.01217557117342949, "rewards//mean": 0.82110595703125, "rewards//std": 0.016807610169053078, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1858, "grad_norm": 0.7448234558105469, "kl": 0.0404583215713501, "learning_rate": 4.6218001385041504e-06, "loss": 0.004, "num_tokens": 6060168.0, "reward": 0.88616943359375, "reward_std": 0.008901288732886314, "rewards//mean": 0.88616943359375, "rewards//std": 0.014124834910035133, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.186, "grad_norm": 0.6166906356811523, "kl": 0.04047831706702709, "learning_rate": 4.6209606166089495e-06, "loss": 0.004, "num_tokens": 6066720.0, "reward": 0.82843017578125, "reward_std": 0.012854673899710178, "rewards//mean": 0.82843017578125, "rewards//std": 0.026895461603999138, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1862, "grad_norm": 0.8489100933074951, "kl": 0.0454192163888365, "learning_rate": 4.620120240391065e-06, "loss": 0.0045, "num_tokens": 6073216.0, "reward": 0.79522705078125, "reward_std": 0.008993346244096756, "rewards//mean": 0.79522705078125, "rewards//std": 0.01763218827545643, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1864, "grad_norm": 0.65506911277771, "kl": 0.039396191481500864, "learning_rate": 4.619279010189002e-06, "loss": 0.0039, "num_tokens": 6079744.0, "reward": 0.82257080078125, "reward_std": 0.008998468518257141, "rewards//mean": 0.82257080078125, "rewards//std": 0.018332570791244507, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1866, "grad_norm": 0.6887573599815369, "kl": 0.04090560902841389, "learning_rate": 4.618436926341607e-06, "loss": 0.0041, "num_tokens": 6086280.0, "reward": 0.871337890625, "reward_std": 0.014091781340539455, "rewards//mean": 0.871337890625, "rewards//std": 0.019610146060585976, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1868, "grad_norm": 0.6609535813331604, "kl": 0.03739321604371071, "learning_rate": 4.617593989188071e-06, "loss": 0.0037, "num_tokens": 6092832.0, "reward": 0.8531494140625, "reward_std": 0.013474280014634132, "rewards//mean": 0.8531494140625, "rewards//std": 0.030062692239880562, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.187, "grad_norm": 0.7688198089599609, "kl": 0.0410981688182801, "learning_rate": 4.616750199067929e-06, "loss": 0.0041, "num_tokens": 6099320.0, "reward": 0.85919189453125, "reward_std": 0.013847172260284424, "rewards//mean": 0.85919189453125, "rewards//std": 0.025952504947781563, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1872, "grad_norm": 0.643115758895874, "kl": 0.04064711183309555, "learning_rate": 4.615905556321061e-06, "loss": 0.0041, "num_tokens": 6105904.0, "reward": 0.8336181640625, "reward_std": 0.012946332804858685, "rewards//mean": 0.8336181640625, "rewards//std": 0.01982930861413479, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1874, "grad_norm": 0.7157496809959412, "kl": 0.041027721017599106, "learning_rate": 4.615060061287688e-06, "loss": 0.0041, "num_tokens": 6112448.0, "reward": 0.8719482421875, "reward_std": 0.017627859488129616, "rewards//mean": 0.8719482421875, "rewards//std": 0.030068732798099518, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1876, "grad_norm": 0.7336728572845459, "kl": 0.04614663729444146, "learning_rate": 4.614213714308374e-06, "loss": 0.0046, "num_tokens": 6118968.0, "reward": 0.82769775390625, "reward_std": 0.00945833045989275, "rewards//mean": 0.82769775390625, "rewards//std": 0.01365844439715147, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1878, "grad_norm": 0.6875671148300171, "kl": 0.06045741937123239, "learning_rate": 4.6133665157240306e-06, "loss": 0.006, "num_tokens": 6125440.0, "reward": 0.875244140625, "reward_std": 0.01146995835006237, "rewards//mean": 0.875244140625, "rewards//std": 0.03147299960255623, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.188, "grad_norm": 0.6396803855895996, "kl": 0.03798764920793474, "learning_rate": 4.612518465875906e-06, "loss": 0.0038, "num_tokens": 6131888.0, "reward": 0.84844970703125, "reward_std": 0.013945532031357288, "rewards//mean": 0.84844970703125, "rewards//std": 0.03200189769268036, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1882, "grad_norm": 0.683394193649292, "kl": 0.04144331160932779, "learning_rate": 4.611669565105597e-06, "loss": 0.0041, "num_tokens": 6138344.0, "reward": 0.85504150390625, "reward_std": 0.011722896248102188, "rewards//mean": 0.85504150390625, "rewards//std": 0.025172600522637367, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1884, "grad_norm": 0.6219683885574341, "kl": 0.042291250778362155, "learning_rate": 4.610819813755038e-06, "loss": 0.0042, "num_tokens": 6144864.0, "reward": 0.865966796875, "reward_std": 0.016607439145445824, "rewards//mean": 0.865966796875, "rewards//std": 0.031426794826984406, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1886, "grad_norm": 0.6586542725563049, "kl": 0.03660945198498666, "learning_rate": 4.609969212166512e-06, "loss": 0.0037, "num_tokens": 6151488.0, "reward": 0.8427734375, "reward_std": 0.010870680212974548, "rewards//mean": 0.8427734375, "rewards//std": 0.01233302429318428, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1888, "grad_norm": 0.6816508769989014, "kl": 0.03906205715611577, "learning_rate": 4.609117760682639e-06, "loss": 0.0039, "num_tokens": 6158024.0, "reward": 0.82928466796875, "reward_std": 0.010280165821313858, "rewards//mean": 0.82928466796875, "rewards//std": 0.02844797447323799, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.189, "grad_norm": 0.6851620674133301, "kl": 0.04188694804906845, "learning_rate": 4.608265459646384e-06, "loss": 0.0042, "num_tokens": 6164552.0, "reward": 0.8612060546875, "reward_std": 0.012371834367513657, "rewards//mean": 0.8612060546875, "rewards//std": 0.02456735260784626, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1892, "grad_norm": 0.7054226994514465, "kl": 0.03868069825693965, "learning_rate": 4.607412309401054e-06, "loss": 0.0039, "num_tokens": 6171072.0, "reward": 0.843505859375, "reward_std": 0.013521338813006878, "rewards//mean": 0.843505859375, "rewards//std": 0.029321666806936264, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1894, "grad_norm": 0.6950118541717529, "kl": 0.0383919773157686, "learning_rate": 4.606558310290298e-06, "loss": 0.0038, "num_tokens": 6177488.0, "reward": 0.79071044921875, "reward_std": 0.00887228175997734, "rewards//mean": 0.79071044921875, "rewards//std": 0.017130540683865547, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1896, "grad_norm": 0.712725818157196, "kl": 0.05234731128439307, "learning_rate": 4.605703462658107e-06, "loss": 0.0052, "num_tokens": 6183992.0, "reward": 0.85357666015625, "reward_std": 0.0174628384411335, "rewards//mean": 0.85357666015625, "rewards//std": 0.03561139106750488, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1898, "grad_norm": 0.7520847916603088, "kl": 0.04160321340896189, "learning_rate": 4.604847766848812e-06, "loss": 0.0042, "num_tokens": 6190416.0, "reward": 0.87713623046875, "reward_std": 0.015189571306109428, "rewards//mean": 0.87713623046875, "rewards//std": 0.02517079748213291, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.19, "grad_norm": 0.6863551735877991, "kl": 0.04947540210559964, "learning_rate": 4.60399122320709e-06, "loss": 0.0049, "num_tokens": 6196872.0, "reward": 0.8363037109375, "reward_std": 0.015436182729899883, "rewards//mean": 0.8363037109375, "rewards//std": 0.020560890436172485, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1902, "grad_norm": 0.6834877729415894, "kl": 0.045546500477939844, "learning_rate": 4.603133832077953e-06, "loss": 0.0046, "num_tokens": 6203424.0, "reward": 0.86474609375, "reward_std": 0.012943100184202194, "rewards//mean": 0.86474609375, "rewards//std": 0.04526362195611, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1904, "grad_norm": 0.7253804802894592, "kl": 0.04165040701627731, "learning_rate": 4.602275593806761e-06, "loss": 0.0042, "num_tokens": 6209920.0, "reward": 0.85626220703125, "reward_std": 0.015964912250638008, "rewards//mean": 0.85626220703125, "rewards//std": 0.02952994965016842, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1906, "grad_norm": 0.6976743936538696, "kl": 0.04115398577414453, "learning_rate": 4.601416508739211e-06, "loss": 0.0041, "num_tokens": 6216496.0, "reward": 0.82269287109375, "reward_std": 0.017609048634767532, "rewards//mean": 0.82269287109375, "rewards//std": 0.026324884966015816, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1908, "grad_norm": 0.6280128359794617, "kl": 0.04923348594456911, "learning_rate": 4.600556577221342e-06, "loss": 0.0049, "num_tokens": 6223056.0, "reward": 0.83355712890625, "reward_std": 0.012344447895884514, "rewards//mean": 0.83355712890625, "rewards//std": 0.02510998211801052, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.191, "grad_norm": 0.7094703316688538, "kl": 0.04144132649526, "learning_rate": 4.599695799599537e-06, "loss": 0.0041, "num_tokens": 6229536.0, "reward": 0.86810302734375, "reward_std": 0.01247455459088087, "rewards//mean": 0.86810302734375, "rewards//std": 0.022549975663423538, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1912, "grad_norm": 0.7454099655151367, "kl": 0.04297591699287295, "learning_rate": 4.5988341762205125e-06, "loss": 0.0043, "num_tokens": 6236112.0, "reward": 0.8541259765625, "reward_std": 0.01617106795310974, "rewards//mean": 0.8541259765625, "rewards//std": 0.029840312898159027, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1914, "grad_norm": 0.7461182475090027, "kl": 0.04712600726634264, "learning_rate": 4.5979717074313336e-06, "loss": 0.0047, "num_tokens": 6242616.0, "reward": 0.8643798828125, "reward_std": 0.009678803384304047, "rewards//mean": 0.8643798828125, "rewards//std": 0.0312691293656826, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1916, "grad_norm": 0.6835668683052063, "kl": 0.04492692369967699, "learning_rate": 4.5971083935794026e-06, "loss": 0.0045, "num_tokens": 6249200.0, "reward": 0.84637451171875, "reward_std": 0.011677243746817112, "rewards//mean": 0.84637451171875, "rewards//std": 0.03571368753910065, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1918, "grad_norm": 0.7025179862976074, "kl": 0.038464770652353764, "learning_rate": 4.5962442350124605e-06, "loss": 0.0038, "num_tokens": 6255744.0, "reward": 0.818115234375, "reward_std": 0.00972269382327795, "rewards//mean": 0.818115234375, "rewards//std": 0.029014429077506065, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.192, "grad_norm": 0.7778719663619995, "kl": 0.04615939827635884, "learning_rate": 4.595379232078592e-06, "loss": 0.0046, "num_tokens": 6262224.0, "reward": 0.84735107421875, "reward_std": 0.017559725791215897, "rewards//mean": 0.84735107421875, "rewards//std": 0.021091047674417496, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1922, "grad_norm": 0.6818391680717468, "kl": 0.05492235207930207, "learning_rate": 4.5945133851262185e-06, "loss": 0.0055, "num_tokens": 6268784.0, "reward": 0.83404541015625, "reward_std": 0.011023140512406826, "rewards//mean": 0.83404541015625, "rewards//std": 0.021553227677941322, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1924, "grad_norm": 0.7694291472434998, "kl": 0.05033859750255942, "learning_rate": 4.593646694504105e-06, "loss": 0.005, "num_tokens": 6275304.0, "reward": 0.8218994140625, "reward_std": 0.011773250997066498, "rewards//mean": 0.8218994140625, "rewards//std": 0.020430902019143105, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1926, "grad_norm": 0.692175567150116, "kl": 0.048755659721791744, "learning_rate": 4.5927791605613525e-06, "loss": 0.0049, "num_tokens": 6281896.0, "reward": 0.83563232421875, "reward_std": 0.013875285163521767, "rewards//mean": 0.83563232421875, "rewards//std": 0.02050882764160633, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1928, "grad_norm": 0.7087896466255188, "kl": 0.04856082936748862, "learning_rate": 4.591910783647405e-06, "loss": 0.0049, "num_tokens": 6288408.0, "reward": 0.85595703125, "reward_std": 0.012770500034093857, "rewards//mean": 0.85595703125, "rewards//std": 0.021378425881266594, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.193, "grad_norm": 0.6834204196929932, "kl": 0.05175302270799875, "learning_rate": 4.591041564112043e-06, "loss": 0.0052, "num_tokens": 6294968.0, "reward": 0.85260009765625, "reward_std": 0.017285553738474846, "rewards//mean": 0.85260009765625, "rewards//std": 0.0240419153124094, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1932, "grad_norm": 0.69572913646698, "kl": 0.044029935728758574, "learning_rate": 4.59017150230539e-06, "loss": 0.0044, "num_tokens": 6301496.0, "reward": 0.8465576171875, "reward_std": 0.013601857237517834, "rewards//mean": 0.8465576171875, "rewards//std": 0.03215508535504341, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1934, "grad_norm": 0.8224660754203796, "kl": 0.04022330488078296, "learning_rate": 4.589300598577906e-06, "loss": 0.004, "num_tokens": 6308064.0, "reward": 0.836181640625, "reward_std": 0.008823839016258717, "rewards//mean": 0.836181640625, "rewards//std": 0.02338447794318199, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1936, "grad_norm": 0.7732920050621033, "kl": 0.04558022366836667, "learning_rate": 4.58842885328039e-06, "loss": 0.0046, "num_tokens": 6314568.0, "reward": 0.83984375, "reward_std": 0.013668116182088852, "rewards//mean": 0.83984375, "rewards//std": 0.03754492849111557, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1938, "grad_norm": 0.6807088851928711, "kl": 0.05070904782041907, "learning_rate": 4.587556266763982e-06, "loss": 0.0051, "num_tokens": 6321104.0, "reward": 0.77545166015625, "reward_std": 0.009375620633363724, "rewards//mean": 0.77545166015625, "rewards//std": 0.01899680867791176, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.194, "grad_norm": 0.6597617268562317, "kl": 0.04255011701025069, "learning_rate": 4.586682839380159e-06, "loss": 0.0043, "num_tokens": 6327720.0, "reward": 0.8553466796875, "reward_std": 0.014259079471230507, "rewards//mean": 0.8553466796875, "rewards//std": 0.0322866328060627, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1942, "grad_norm": 0.6452547907829285, "kl": 0.04155507404357195, "learning_rate": 4.585808571480739e-06, "loss": 0.0042, "num_tokens": 6334232.0, "reward": 0.83111572265625, "reward_std": 0.009173991158604622, "rewards//mean": 0.83111572265625, "rewards//std": 0.018265563994646072, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1944, "grad_norm": 0.6843393445014954, "kl": 0.04133532801643014, "learning_rate": 4.584933463417874e-06, "loss": 0.0041, "num_tokens": 6340720.0, "reward": 0.83685302734375, "reward_std": 0.012771625071763992, "rewards//mean": 0.83685302734375, "rewards//std": 0.020036159083247185, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1946, "grad_norm": 0.6603941917419434, "kl": 0.052238917676731944, "learning_rate": 4.584057515544061e-06, "loss": 0.0052, "num_tokens": 6347392.0, "reward": 0.8248291015625, "reward_std": 0.012165896594524384, "rewards//mean": 0.8248291015625, "rewards//std": 0.030538296326994896, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1948, "grad_norm": 0.6929309368133545, "kl": 0.04384617321193218, "learning_rate": 4.583180728212128e-06, "loss": 0.0044, "num_tokens": 6353848.0, "reward": 0.88787841796875, "reward_std": 0.0120087293908, "rewards//mean": 0.88787841796875, "rewards//std": 0.01843632012605667, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.195, "grad_norm": 0.6984637379646301, "kl": 0.03931608865968883, "learning_rate": 4.582303101775249e-06, "loss": 0.0039, "num_tokens": 6360504.0, "reward": 0.82904052734375, "reward_std": 0.011513952165842056, "rewards//mean": 0.82904052734375, "rewards//std": 0.033347733318805695, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1952, "grad_norm": 0.6983664631843567, "kl": 0.047918176744133234, "learning_rate": 4.5814246365869285e-06, "loss": 0.0048, "num_tokens": 6367160.0, "reward": 0.85308837890625, "reward_std": 0.012053456157445908, "rewards//mean": 0.85308837890625, "rewards//std": 0.030966732650995255, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1954, "grad_norm": 0.6621443033218384, "kl": 0.045792046934366226, "learning_rate": 4.580545333001014e-06, "loss": 0.0046, "num_tokens": 6373640.0, "reward": 0.84912109375, "reward_std": 0.012664642184972763, "rewards//mean": 0.84912109375, "rewards//std": 0.025794023647904396, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1956, "grad_norm": 0.7002736926078796, "kl": 0.040672547183930874, "learning_rate": 4.579665191371687e-06, "loss": 0.0041, "num_tokens": 6380208.0, "reward": 0.860107421875, "reward_std": 0.017162654548883438, "rewards//mean": 0.860107421875, "rewards//std": 0.03360193222761154, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1958, "grad_norm": 0.6573413610458374, "kl": 0.043526260647922754, "learning_rate": 4.578784212053471e-06, "loss": 0.0044, "num_tokens": 6386752.0, "reward": 0.85009765625, "reward_std": 0.013537352904677391, "rewards//mean": 0.85009765625, "rewards//std": 0.015786921605467796, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.196, "grad_norm": 0.6943175792694092, "kl": 0.05249407817609608, "learning_rate": 4.577902395401222e-06, "loss": 0.0052, "num_tokens": 6393320.0, "reward": 0.83551025390625, "reward_std": 0.011513197794556618, "rewards//mean": 0.83551025390625, "rewards//std": 0.032235193997621536, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1962, "grad_norm": 0.6779718399047852, "kl": 0.04732549702748656, "learning_rate": 4.577019741770137e-06, "loss": 0.0047, "num_tokens": 6399824.0, "reward": 0.83935546875, "reward_std": 0.012707693502306938, "rewards//mean": 0.83935546875, "rewards//std": 0.026397312059998512, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1964, "grad_norm": 0.7734324336051941, "kl": 0.05470086168497801, "learning_rate": 4.576136251515748e-06, "loss": 0.0055, "num_tokens": 6406368.0, "reward": 0.7996826171875, "reward_std": 0.010812653228640556, "rewards//mean": 0.7996826171875, "rewards//std": 0.022728921845555305, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1966, "grad_norm": 0.7584691643714905, "kl": 0.049642365891486406, "learning_rate": 4.575251924993926e-06, "loss": 0.005, "num_tokens": 6412840.0, "reward": 0.87274169921875, "reward_std": 0.016802702099084854, "rewards//mean": 0.87274169921875, "rewards//std": 0.020981669425964355, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1968, "grad_norm": 0.6663657426834106, "kl": 0.05738833174109459, "learning_rate": 4.574366762560876e-06, "loss": 0.0057, "num_tokens": 6419368.0, "reward": 0.8297119140625, "reward_std": 0.014144438318908215, "rewards//mean": 0.8297119140625, "rewards//std": 0.02907150238752365, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.197, "grad_norm": 0.6651089191436768, "kl": 0.0494233095087111, "learning_rate": 4.573480764573143e-06, "loss": 0.0049, "num_tokens": 6425872.0, "reward": 0.85003662109375, "reward_std": 0.011118832975625992, "rewards//mean": 0.85003662109375, "rewards//std": 0.022995345294475555, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1972, "grad_norm": 0.6475024223327637, "kl": 0.04345037881284952, "learning_rate": 4.572593931387604e-06, "loss": 0.0043, "num_tokens": 6432344.0, "reward": 0.819091796875, "reward_std": 0.013030166737735271, "rewards//mean": 0.819091796875, "rewards//std": 0.03384611755609512, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1974, "grad_norm": 0.6717162728309631, "kl": 0.04292596550658345, "learning_rate": 4.571706263361479e-06, "loss": 0.0043, "num_tokens": 6438768.0, "reward": 0.8155517578125, "reward_std": 0.012478616088628769, "rewards//mean": 0.8155517578125, "rewards//std": 0.017322655767202377, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1976, "grad_norm": 0.7700872421264648, "kl": 0.050918920896947384, "learning_rate": 4.570817760852319e-06, "loss": 0.0051, "num_tokens": 6445376.0, "reward": 0.80120849609375, "reward_std": 0.01503811962902546, "rewards//mean": 0.80120849609375, "rewards//std": 0.025027859956026077, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1978, "grad_norm": 0.695588231086731, "kl": 0.04219944030046463, "learning_rate": 4.569928424218012e-06, "loss": 0.0042, "num_tokens": 6451944.0, "reward": 0.8466796875, "reward_std": 0.01621868647634983, "rewards//mean": 0.8466796875, "rewards//std": 0.03559103608131409, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.198, "grad_norm": 0.7971217036247253, "kl": 0.04319071024656296, "learning_rate": 4.569038253816783e-06, "loss": 0.0043, "num_tokens": 6458592.0, "reward": 0.85443115234375, "reward_std": 0.016578156501054764, "rewards//mean": 0.85443115234375, "rewards//std": 0.03290539234876633, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1982, "grad_norm": 0.7145017981529236, "kl": 0.047632794827222824, "learning_rate": 4.5681472500071935e-06, "loss": 0.0048, "num_tokens": 6465136.0, "reward": 0.7896728515625, "reward_std": 0.012131381779909134, "rewards//mean": 0.7896728515625, "rewards//std": 0.021147403866052628, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1984, "grad_norm": 0.6252520084381104, "kl": 0.04791990341618657, "learning_rate": 4.567255413148139e-06, "loss": 0.0048, "num_tokens": 6471728.0, "reward": 0.8143310546875, "reward_std": 0.01138047780841589, "rewards//mean": 0.8143310546875, "rewards//std": 0.018662068992853165, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1986, "grad_norm": 0.6275078058242798, "kl": 0.047070488799363375, "learning_rate": 4.566362743598851e-06, "loss": 0.0047, "num_tokens": 6478208.0, "reward": 0.8868408203125, "reward_std": 0.013486402109265327, "rewards//mean": 0.8868408203125, "rewards//std": 0.023591680452227592, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1988, "grad_norm": 0.7225343585014343, "kl": 0.05764315905980766, "learning_rate": 4.565469241718896e-06, "loss": 0.0058, "num_tokens": 6484760.0, "reward": 0.810546875, "reward_std": 0.01867908425629139, "rewards//mean": 0.810546875, "rewards//std": 0.03193993121385574, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.199, "grad_norm": 0.6515203714370728, "kl": 0.042084392393007874, "learning_rate": 4.564574907868179e-06, "loss": 0.0042, "num_tokens": 6491320.0, "reward": 0.79107666015625, "reward_std": 0.00986007135361433, "rewards//mean": 0.79107666015625, "rewards//std": 0.020763371139764786, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1992, "grad_norm": 0.6625439524650574, "kl": 0.044497475028038025, "learning_rate": 4.563679742406935e-06, "loss": 0.0044, "num_tokens": 6497864.0, "reward": 0.84674072265625, "reward_std": 0.016481919214129448, "rewards//mean": 0.84674072265625, "rewards//std": 0.03052658401429653, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1994, "grad_norm": 0.7319129109382629, "kl": 0.04618475493043661, "learning_rate": 4.562783745695738e-06, "loss": 0.0046, "num_tokens": 6504464.0, "reward": 0.81353759765625, "reward_std": 0.015955062583088875, "rewards//mean": 0.81353759765625, "rewards//std": 0.028766000643372536, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1996, "grad_norm": 0.6688764095306396, "kl": 0.040976963471621275, "learning_rate": 4.561886918095495e-06, "loss": 0.0041, "num_tokens": 6511160.0, "reward": 0.852294921875, "reward_std": 0.013901523314416409, "rewards//mean": 0.852294921875, "rewards//std": 0.026882583275437355, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1998, "grad_norm": 0.791424572467804, "kl": 0.07057768478989601, "learning_rate": 4.560989259967447e-06, "loss": 0.0071, "num_tokens": 6517640.0, "reward": 0.85760498046875, "reward_std": 0.017160706222057343, "rewards//mean": 0.85760498046875, "rewards//std": 0.02633695863187313, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2, "grad_norm": 0.6725102066993713, "kl": 0.04978520795702934, "learning_rate": 4.560090771673174e-06, "loss": 0.005, "num_tokens": 6524192.0, "reward": 0.83251953125, "reward_std": 0.009535292163491249, "rewards//mean": 0.83251953125, "rewards//std": 0.033707186579704285, "step": 1000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }